From 4146bcbcc64a5a04462a0dc64d998a476f5fc9e3 Mon Sep 17 00:00:00 2001 From: Ray Andrew Date: Mon, 18 May 2026 00:50:54 -0500 Subject: [PATCH] feat(perf): performance improvements for parallel reading, indexing, and aggregation Indexer - Streaming parse-and-emit worker pipeline with bounded memory usage - Concurrent SST artifact ingestion with staging support - Gzip member slicing for parallel indexing - Lazy decoding for compressed value counts - Bypass DOM wrapper for indexer hot path (simdjson on_demand) - Decoupled write workers from parse workers - --rebuild-summaries flag and optimized root summary rebuild Aggregator / MPI - Task-based DAG execution for aggregator pipeline - Shared staging for multi-node artifact relocation - Per-node thread scaling to avoid oversubscription - Unified distributed aggregation tracking, removed manifest consolidation - Deterministic aggregation and intra-file parallelism Trace reader / query - Compiled predicate evaluation for AND-of-EQ queries - Uniform-match shortcut for AND-of-EQ queries - Line-range support for work items and checkpoint processing - Optimized chunk pruning and checkpoint handling Replay - Pipelined replay with coroutines and channels - JsonParser-based trace processing - Optimized string handling and i/o buffering Organize / writer / dft - Parallel slice creation and merging in organize visitor - Inline indexer in organize - Gzip member tracking in writer - Coroutine-based event dispatcher with extracted parse logic - Batch flushing in organize visitor Arrow / call_tree - Optimized arrow conversion - Arrow IPC support and improved save/load in call_tree Build / infrastructure - zlib-ng option, system simdjson fallback - cgroup v1/v2 memory limit detection - Auto-computed per-file memory estimates and batch sizes - CI: perf branch trigger, formatting Docs - Rewritten indexer and trace reader API references --- .envrc | 1 - .github/workflows/ci.yml | 173 +- .gitignore | 1 + CMakeLists.txt | 15 + 
CMakePresets.json | 3 +- .../dftracer_utils_config.dbg.h.in | 7 + .../dftracer_utils_config.h.in | 7 + cmake/modules/Dependencies.cmake | 464 ++- cmake/modules/InstallHelpers.cmake | 243 +- cmake/modules/PrecompiledHeader.cmake | 15 +- docs/scripts/generate_api_index.py | 56 +- docs/source/api/indexer.rst | 72 +- docs/source/api/reader.rst | 26 +- docs/source/api/runtime.rst | 8 +- docs/source/api/trace_reader.rst | 90 +- docs/source/api/utilities.rst | 23 +- docs/source/call-tree.rst | 98 +- docs/source/cli.rst | 162 + docs/source/conf.py | 1255 ++++-- docs/source/cpp_api/arrow.rst | 45 +- docs/source/cpp_api/coro.rst | 10 + docs/source/cpp_api/dft_aggregators.rst | 32 +- docs/source/cpp_api/indexer.rst | 62 +- docs/source/cpp_api/io.rst | 33 + docs/source/cpp_api/pipeline/executors.rst | 3 - docs/source/cpp_api/reader.rst | 29 +- docs/source/cpp_api/rocksdb.rst | 7 +- docs/source/developers.rst | 32 +- docs/source/installation.rst | 61 +- docs/source/pipeline.rst | 85 + docs/source/quickstart.rst | 4 + docs/source/server.rst | 11 + docs/source/utilities/common.rst | 112 +- docs/source/utilities/composites.rst | 45 +- docs/source/utilities/compression.rst | 27 + docs/source/utilities/fileio.rst | 62 + docs/source/utilities/indexer.rst | 85 +- docs/source/utilities/reader.rst | 34 +- docs/source/utilities/replay.rst | 12 + examples/call_tree_example1.cpp | 55 +- examples/call_tree_example2.cpp | 52 +- examples/call_tree_example3.cpp | 34 +- flake.lock | 27 - flake.nix | 54 - include/dftracer/utils/call_tree/call_tree.h | 65 +- .../dftracer/utils/call_tree/call_tree_mpi.h | 37 +- .../utils/call_tree/internal/call_tree.h | 6 + .../utils/call_tree/internal/factory.h | 13 +- .../dftracer/utils/call_tree/internal/node.h | 72 +- .../utils/call_tree/internal/trace_reader.h | 74 +- .../utils/call_tree/json_serializer.h | 5 +- .../dftracer/utils/call_tree/mpi/build_task.h | 33 - .../dftracer/utils/call_tree/mpi/builder.h | 187 +- .../utils/call_tree/mpi/file_header.h | 43 - 
.../utils/call_tree/mpi/filtered_reader.h | 64 - .../utils/call_tree/mpi/pid_index_info.h | 37 - .../utils/call_tree/mpi/serializable.h | 73 +- .../utils/call_tree/mpi/serialization.h | 31 - .../dftracer/utils/core/common/buffer_pool.h | 30 +- .../dftracer/utils/core/common/constants.h | 3 - .../utils/core/common/memory_budget.h | 32 + .../dftracer/utils/core/common/object_pool.h | 92 +- .../utils/core/common/string_intern.h | 265 +- .../core/common/transparent_string_hash.h | 33 +- include/dftracer/utils/core/coro/channel.h | 111 + .../dftracer/utils/core/pipeline/executor.h | 33 +- .../utils/core/pipeline/pipeline_config.h | 11 +- include/dftracer/utils/core/rocksdb/async.h | 130 - .../utils/core/rocksdb/column_families.h | 65 + .../dftracer/utils/core/rocksdb/database.h | 37 +- .../dftracer/utils/core/rocksdb/db_manager.h | 3 +- include/dftracer/utils/core/runtime.h | 66 + .../dftracer/utils/core/tasks/coro_scope.h | 44 +- .../utils/core/utilities/streaming_utility.h | 21 + .../dftracer/utils/core/utilities/utility.h | 30 + include/dftracer/utils/server/trace_index.h | 9 - .../utils/utilities/common/arrow/arrow.h | 4 + .../utilities/common/arrow/arrow_export.h | 1 + .../utilities/common/arrow/column_builder.h | 74 +- .../utils/utilities/common/arrow/ipc_reader.h | 99 + .../utils/utilities/common/arrow/ipc_writer.h | 103 +- .../utilities/common/arrow/parallel_reader.h | 95 + .../utilities/common/arrow/partition_router.h | 94 + .../utilities/common/arrow/partition_writer.h | 76 + .../utils/utilities/common/json/json.h | 10 +- .../utilities/common/json/json_doc_guard.h | 65 +- .../utils/utilities/common/json/json_value.h | 182 +- .../utils/utilities/common/json/parser.h | 241 ++ .../utils/utilities/common/query/ast.h | 5 + .../utils/utilities/common/query/evaluator.h | 6 +- .../utils/utilities/common/query/query.h | 11 +- .../common/serialization/binary_codec.h | 210 + .../common/statistics/log2_histogram.h | 7 - .../common/statistics/timestamp_histogram.h | 59 + 
.../aggregators/aggregation_augmentation.h | 33 + .../dft/aggregators/aggregation_config.h | 46 + .../dft/aggregators/aggregation_logic.h | 28 + .../aggregators/aggregation_merge_operator.h | 26 + .../dft/aggregators/aggregation_metrics.h | 23 +- .../dft/aggregators/aggregation_output.h | 11 +- .../aggregators/aggregation_serialization.h | 356 ++ .../dft/aggregators/aggregation_visitor.h | 130 + .../aggregators/aggregator_summary_utility.h | 2 +- .../dft/aggregators/aggregator_types.h | 164 + .../dft/aggregators/aggregator_utility.h | 34 +- .../composites/dft/aggregators/aggregators.h | 2 +- .../association_resolver_utility.h | 11 +- .../dft/aggregators/association_tracker.h | 13 +- .../aggregators/chunk_aggregator_utility.h | 23 +- .../dft/aggregators/event_aggregator.h | 132 + .../aggregators/event_aggregator_utility.h | 36 - .../perfetto_trace_writer_utility.h | 54 +- .../dft/aggregators/system_metrics.h | 206 + .../system_metrics_merge_operator.h | 26 + .../system_metrics_serialization.h | 37 + .../utils/utilities/composites/dft/args_map.h | 216 ++ .../dft/comparator/comparison_config.h | 10 +- .../dft/comparator/comparison_result.h | 2 +- .../dft/comparator/comparison_utility.h | 6 +- .../composites/dft/dft_event_dispatcher.h | 326 ++ .../composites/dft/dft_event_visitor.h | 70 + .../utils/utilities/composites/dft/event.h | 207 +- .../composites/dft/indexing/bloom_filter.h | 28 +- .../dft/indexing/chunk_dimension_stats.h | 84 +- .../dft/indexing/chunk_indexer_utility.h | 7 +- .../dft/indexing/chunk_pruner_utility.h | 32 + .../dft/indexing/chunk_statistics.h | 32 +- .../dft/indexing/index_resolver_utility.h | 86 + .../dft/indexing/resolve_and_build.h | 43 + .../utilities/composites/dft/internal/utils.h | 10 +- .../utilities/composites/dft/parse_inflated.h | 108 + .../dft/reorganize/group_writer_task.h | 69 + .../dft/reorganize/manifest_extractor.h | 38 + .../dft/reorganize/organize_visitor.h | 106 + .../dft/reorganize/reconstructor_utility.h | 54 + 
.../dft/reorganize/reorganization_planner.h | 4 +- .../dft/statistics/detailed_statistics.h | 7 +- .../shared_index_statistics_reader.h | 156 + .../statistics_aggregator_utility.h | 9 + .../dft/views/view_reader_utility.h | 5 +- .../composites/dft/visitors/bloom_visitor.h | 148 + .../dft/visitors/hash_table_visitor.h | 57 + .../dft/visitors/manifest_visitor.h | 59 + .../utils/utilities/fileio/chunk_writer.h | 10 + .../utils/utilities/fileio/parallel/layout.h | 59 + .../utils/utilities/fileio/parallel/merge.h | 20 + .../fileio/parallel/parallel_writer.h | 92 + .../filesystem/directory_scanner_utility.h | 48 +- .../pattern_directory_scanner_utility.h | 32 +- .../utils/utilities/filesystem/types.h | 17 +- .../utilities/hash/fnv1a_hasher_utility.h | 54 +- .../utils/utilities/indexer/file_partition.h | 53 + .../utilities/indexer/index_batch_sink.h | 155 + .../utilities/indexer/index_builder_utility.h | 134 +- .../utils/utilities/indexer/index_database.h | 325 +- .../index_database_sst_writer_context.h | 303 ++ .../indexer/index_database_writer_context.h | 184 + .../indexer/index_file_entry_capability.h | 39 + .../utils/utilities/indexer/index_types.h | 81 + .../utils/utilities/indexer/index_visitor.h | 47 +- .../indexer/internal/index_encoding.h | 133 + .../indexer/internal/payload_codec.h | 140 + .../indexer/internal/statistics_codec.h | 26 + .../utilities/indexer/provenance_database.h | 4 +- .../indexer/visitors/bloom_visitor.h | 66 - .../indexer/visitors/manifest_visitor.h | 37 - .../utilities/reader/internal/stream_config.h | 8 + .../utils/utilities/reader/trace_reader.h | 53 +- .../dftracer/utils/utilities/replay/replay.h | 95 +- .../dftracer/utils/utilities/replay/trace.h | 19 +- pyproject.toml | 14 + python/dftracer/utils/__init__.py | 18 +- python/dftracer/utils/arrow.py | 319 +- python/dftracer/utils/dask.py | 1143 +++++- python/dftracer/utils/dftracer_utils_ext.pyi | 779 +++- python/dftracer/utils/indexer.py | 371 ++ python/dftracer/utils/runtime.py | 12 +- 
python/dftracer/utils/trace_reader.py | 413 ++ src/CMakeLists.txt | 151 +- src/dftracer/utils/binaries/common_cli.h | 329 ++ .../utils/binaries/dftracer_aggregator.cpp | 959 ++--- .../binaries/dftracer_aggregator_mpi.cpp | 1199 ++++++ .../utils/binaries/dftracer_call_tree.cpp | 829 ++-- .../utils/binaries/dftracer_call_tree_mpi.cpp | 208 + .../utils/binaries/dftracer_comparator.cpp | 886 +++-- .../utils/binaries/dftracer_event_count.cpp | 381 +- .../binaries/dftracer_gen_fake_trace.cpp | 465 ++- .../utils/binaries/dftracer_index.cpp | 385 +- src/dftracer/utils/binaries/dftracer_info.cpp | 1032 +++-- .../utils/binaries/dftracer_merge.cpp | 235 +- .../utils/binaries/dftracer_organize.cpp | 1281 +++++-- .../utils/binaries/dftracer_pgzip.cpp | 209 +- .../utils/binaries/dftracer_reconstruct.cpp | 415 +- .../utils/binaries/dftracer_replay.cpp | 1014 ++--- .../utils/binaries/dftracer_server.cpp | 133 +- .../utils/binaries/dftracer_split.cpp | 320 +- .../utils/binaries/dftracer_stats.cpp | 1821 +++++---- src/dftracer/utils/binaries/dftracer_view.cpp | 307 +- src/dftracer/utils/core/common/inflater.h | 31 +- .../utils/core/common/memory_budget.cpp | 206 + .../utils/core/io/io_backend_factory.cpp | 35 +- src/dftracer/utils/core/pipeline/executor.cpp | 72 +- src/dftracer/utils/core/pipeline/pipeline.cpp | 2 - src/dftracer/utils/core/rocksdb/async.cpp | 32 - src/dftracer/utils/core/rocksdb/database.cpp | 122 +- .../utils/core/rocksdb/db_manager.cpp | 13 +- .../utils/core/rocksdb/filesystem.cpp | 6 +- src/dftracer/utils/core/runtime.cpp | 12 + src/dftracer/utils/core/utils/timer.cpp | 39 + src/dftracer/utils/core/utils/timer.h | 35 +- src/dftracer/utils/python/arrow_helpers.cpp | 26 + src/dftracer/utils/python/arrow_helpers.h | 6 + .../utils/python/arrow_parallel_reader.cpp | 212 + .../utils/python/arrow_parallel_reader.h | 16 + .../utils/python/arrow_stream_capsule.cpp | 323 ++ .../utils/python/arrow_stream_capsule.h | 25 + src/dftracer/utils/python/batch_byte_size.h | 55 + 
src/dftracer/utils/python/batch_indexer.cpp | 2554 +++++++++++++ src/dftracer/utils/python/batch_indexer.h | 38 + .../utils/python/dftracer_utils_ext.cpp | 28 +- src/dftracer/utils/python/index_database.cpp | 363 ++ src/dftracer/utils/python/index_database.h | 23 + src/dftracer/utils/python/indexer.cpp | 337 +- src/dftracer/utils/python/indexer.h | 7 +- src/dftracer/utils/python/json.cpp | 947 ++--- src/dftracer/utils/python/json.h | 30 +- .../utils/python/memoryview_batch.cpp | 114 + src/dftracer/utils/python/memoryview_batch.h | 54 + src/dftracer/utils/python/runtime.cpp | 32 +- .../utils/python/schema_reconcile.cpp | 351 ++ src/dftracer/utils/python/schema_reconcile.h | 49 + .../utils/python/sst_distribution.cpp | 1182 ++++++ src/dftracer/utils/python/sst_distribution.h | 18 + .../utils/python/streaming_iterator.cpp | 168 + .../utils/python/streaming_iterator.h | 166 + src/dftracer/utils/python/trace_reader.cpp | 3405 ++++++++++++++--- src/dftracer/utils/python/trace_reader.h | 1 - .../utils/python/trace_reader_iterator.cpp | 261 +- .../utils/python/trace_reader_iterator.h | 80 +- .../utils/python/utilities/aggregator.cpp | 676 +++- .../utils/python/utilities/comparator.cpp | 212 +- .../utilities/reorganization_planner.cpp | 23 +- src/dftracer/utils/server/cursor.cpp | 10 +- src/dftracer/utils/server/trace_api.cpp | 247 +- src/dftracer/utils/server/trace_index.cpp | 150 +- src/dftracer/utils/server/viz_api.cpp | 495 +-- .../utils/utilities/call_tree/call_tree.cpp | 422 +- .../call_tree/call_tree_internal.cpp | 570 +-- .../utilities/call_tree/call_tree_mpi.cpp | 1525 ++------ .../call_tree/call_tree_save_arrow.cpp | 391 ++ .../call_tree/call_tree_save_binary.cpp | 429 +++ .../utilities/call_tree/json_serializer.cpp | 193 +- .../utilities/common/arrow/column_builder.cpp | 382 +- .../utilities/common/arrow/ipc_reader.cpp | 355 ++ .../utilities/common/arrow/ipc_writer.cpp | 712 +++- .../common/arrow/parallel_reader.cpp | 111 + .../common/arrow/partition_router.cpp | 
623 +++ .../common/arrow/partition_writer.cpp | 207 + .../utilities/common/json/json_value.cpp | 38 +- .../utils/utilities/common/json/parser.cpp | 73 + .../utils/utilities/common/query/ast.cpp | 30 + .../utils/utilities/common/query/query.cpp | 4 +- .../common/statistics/log2_histogram.cpp | 69 +- .../common/statistics/timestamp_histogram.cpp | 173 + .../aggregators/aggregation_augmentation.cpp | 281 ++ .../dft/aggregators/aggregation_logic.cpp | 212 + .../aggregation_merge_operator.cpp | 54 + .../dft/aggregators/aggregation_metrics.cpp | 172 +- .../aggregators/aggregation_serialization.cpp | 453 +++ .../dft/aggregators/aggregation_visitor.cpp | 461 +++ .../aggregator_summary_utility.cpp | 12 +- .../dft/aggregators/aggregator_utility.cpp | 696 +++- .../association_resolver_utility.cpp | 2 +- .../dft/aggregators/association_tracker.cpp | 125 +- .../aggregators/chunk_aggregator_utility.cpp | 241 +- .../dft/aggregators/event_aggregator.cpp | 468 +++ .../aggregators/event_aggregator_utility.cpp | 56 - .../perfetto_trace_writer_utility.cpp | 971 +++-- .../system_metrics_merge_operator.cpp | 54 + .../system_metrics_serialization.cpp | 126 + .../dft/comparator/comparison_config.cpp | 186 +- .../dft/comparator/comparison_result.cpp | 12 +- .../dft/comparator/tree_table_formatter.cpp | 209 +- .../dft/event_collector_utility.cpp | 34 +- .../dft/event_id_extractor_utility.cpp | 39 +- .../composites/dft/indexing/bloom_filter.cpp | 110 +- .../dft/indexing/chunk_dimension_stats.cpp | 86 +- .../dft/indexing/chunk_indexer_utility.cpp | 350 +- .../dft/indexing/chunk_pruner_utility.cpp | 299 +- .../dft/indexing/chunk_statistics.cpp | 241 +- .../dft/indexing/index_resolver_utility.cpp | 324 ++ .../dft/indexing/resolve_and_build.cpp | 214 ++ .../composites/dft/internal/utils.cpp | 13 +- .../dft/reorganize/group_writer_task.cpp | 852 +++++ .../dft/reorganize/manifest_extractor.cpp | 176 + .../dft/reorganize/organize_visitor.cpp | 145 + .../dft/reorganize/provenance_tracker.cpp | 74 +- 
.../dft/reorganize/reconstructor_utility.cpp | 410 ++ .../dft/reorganize/reorganization_planner.cpp | 175 +- .../chunk_detail_scanner_utility.cpp | 71 +- .../dft/statistics/detailed_statistics.cpp | 122 +- .../shared_index_statistics_reader.cpp | 5 + .../statistics_aggregator_utility.cpp | 184 +- .../statistics/statistics_query_utility.cpp | 69 +- .../dft/statistics/trace_statistics.cpp | 96 +- .../composites/dft/views/view_definition.cpp | 109 +- .../dft/views/view_reader_utility.cpp | 92 +- .../composites/dft/visitors/bloom_visitor.cpp | 652 ++++ .../dft/visitors/hash_table_visitor.cpp | 96 + .../dft/visitors/manifest_visitor.cpp | 128 + .../streaming_file_merger_utility.cpp | 13 +- .../utils/utilities/fileio/chunk_writer.cpp | 64 +- .../utilities/fileio/parallel/layout.cpp | 148 + .../utils/utilities/fileio/parallel/merge.cpp | 83 + .../fileio/parallel/padded_striped_writer.cpp | 328 ++ .../fileio/parallel/sharded_writer.cpp | 135 + .../fileio/parallel/striped_writer.cpp | 147 + .../indexer/index_builder_utility.cpp | 789 +++- .../utilities/indexer/index_database.cpp | 2340 ++++++----- .../index_database_sst_writer_context.cpp | 399 ++ .../indexer/index_database_writer_context.cpp | 1279 +++++++ .../indexer/internal/common/gzip_inflater.h | 61 +- .../internal/common/gzip_member_scanner.h | 107 + .../indexer/internal/gzip/gzip_indexer.cpp | 553 ++- .../indexer/internal/gzip/gzip_indexer.h | 44 + .../utilities/indexer/internal/helpers.cpp | 1 - .../indexer/internal/index_batch_writer.h | 120 + .../indexer/internal/index_encoding.cpp | 309 ++ .../indexer/internal/tar/tar_indexer.cpp | 101 +- .../indexer/internal/transaction_scope.h | 39 +- .../utilities/indexer/provenance_database.cpp | 111 +- .../indexer/visitors/bloom_visitor.cpp | 240 -- .../indexer/visitors/manifest_visitor.cpp | 73 - .../utilities/reader/internal/gzip_reader.cpp | 9 + .../utilities/reader/internal/inflater.h | 2 +- .../internal/streams/gzip_line_byte_stream.h | 43 +- 
.../reader/internal/streams/line_stream.h | 254 +- .../utils/utilities/reader/trace_reader.cpp | 1472 ++++++- .../utils/utilities/replay/replay.cpp | 612 ++- tests/CMakeLists.txt | 39 +- .../binaries/test_dftracer_aggregator_mpi.cpp | 391 ++ tests/binaries/test_dftracer_call_tree.cpp | 124 + .../binaries/test_dftracer_call_tree_mpi.cpp | 281 ++ tests/binaries/test_dftracer_comparator.cpp | 130 +- .../binaries/test_dftracer_gen_fake_trace.cpp | 4 +- tests/binaries/test_dftracer_organize.cpp | 7 +- tests/binaries/test_dftracer_server.cpp | 2 +- tests/pipeline/test_coro_scope.cpp | 12 +- tests/python/common.py | 106 +- tests/python/test_aggregator.py | 295 +- tests/python/test_dask.py | 88 +- tests/python/test_distributed_manifest.py | 204 + tests/python/test_indexer.py | 826 ++-- tests/python/test_reorganization_planner.py | 21 +- tests/python/test_statistics_aggregator.py | 36 +- tests/python/test_statistics_query.py | 39 +- tests/python/test_trace_reader.py | 101 +- tests/python/test_trace_reader_arrow.py | 262 +- tests/python/test_trace_reader_directory.py | 296 ++ tests/python/test_trace_reader_write_arrow.py | 490 +++ tests/replay/test_replay_fidelity.cpp | 271 ++ tests/utilities/CMakeLists.txt | 44 + .../call_tree/test_call_tree_internal.cpp | 171 +- .../arrow/test_arrow_column_builder.cpp | 1 + .../common/arrow/test_arrow_ipc_reader.cpp | 528 +++ .../common/arrow/test_arrow_ipc_writer.cpp | 320 +- .../utilities/common/query/test_evaluator.cpp | 23 +- tests/utilities/common/query/test_query.cpp | 77 +- .../statistics/test_timestamp_histogram.cpp | 240 ++ .../test_aggregation_augmentation.cpp | 128 + .../aggregators/test_aggregation_metrics.cpp | 119 +- .../test_aggregation_serialization.cpp | 205 + .../aggregators/test_aggregator_utility.cpp | 96 +- .../test_event_aggregator_utility.cpp | 6 +- .../dft/aggregators/test_system_metrics.cpp | 309 ++ .../test_system_metrics_merge_operator.cpp | 183 + .../dft/comparator/test_comparison_result.cpp | 18 +- 
.../dft/indexing/test_bloom_query.cpp | 51 +- .../dft/indexing/test_chunk_pruner.cpp | 38 +- .../indexing/test_manifest_index_builder.cpp | 8 +- .../dft/indexing/test_manifest_queries.cpp | 83 +- .../test_reconstruct_integration.cpp | 37 +- .../test_reconstruction_planner.cpp | 24 +- .../test_reorganization_planner.cpp | 41 +- .../test_reorganize_integration.cpp | 37 +- .../statistics/test_detailed_statistics.cpp | 90 +- .../statistics/test_statistics_aggregator.cpp | 82 +- .../dft/statistics/test_statistics_query.cpp | 18 +- .../dft/statistics/test_trace_statistics.cpp | 57 +- .../composites/dft/test_index_builder.cpp | 11 +- .../dft/views/test_view_builder.cpp | 74 +- .../utilities/composites/test_file_merger.cpp | 12 +- .../fileio/parallel/test_layout_sizing.cpp | 93 + .../parallel/test_padded_striped_writer.cpp | 219 ++ .../fileio/parallel/test_sharded_writer.cpp | 109 + .../fileio/parallel/test_striped_writer.cpp | 109 + .../utilities/indexer/test_index_builder.cpp | 148 +- .../utilities/indexer/test_index_database.cpp | 281 +- .../indexer/test_provenance_database.cpp | 41 +- .../indexer/test_rocksdb_storage.cpp | 40 +- .../indexer/test_sst_ingest_spike.cpp | 469 +++ tests/utilities/reader/test_trace_reader.cpp | 279 +- 402 files changed, 58918 insertions(+), 17518 deletions(-) delete mode 100644 .envrc delete mode 100644 flake.lock delete mode 100644 flake.nix delete mode 100644 include/dftracer/utils/call_tree/mpi/build_task.h delete mode 100644 include/dftracer/utils/call_tree/mpi/file_header.h delete mode 100644 include/dftracer/utils/call_tree/mpi/filtered_reader.h delete mode 100644 include/dftracer/utils/call_tree/mpi/pid_index_info.h delete mode 100644 include/dftracer/utils/call_tree/mpi/serialization.h create mode 100644 include/dftracer/utils/core/common/memory_budget.h delete mode 100644 include/dftracer/utils/core/rocksdb/async.h create mode 100644 include/dftracer/utils/core/rocksdb/column_families.h create mode 100644 
include/dftracer/utils/utilities/common/arrow/ipc_reader.h create mode 100644 include/dftracer/utils/utilities/common/arrow/parallel_reader.h create mode 100644 include/dftracer/utils/utilities/common/arrow/partition_router.h create mode 100644 include/dftracer/utils/utilities/common/arrow/partition_writer.h create mode 100644 include/dftracer/utils/utilities/common/json/parser.h create mode 100644 include/dftracer/utils/utilities/common/serialization/binary_codec.h create mode 100644 include/dftracer/utils/utilities/common/statistics/timestamp_histogram.h create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.h create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.h create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.h create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_types.h create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h delete mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.h create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics.h create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.h create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.h create mode 100644 include/dftracer/utils/utilities/composites/dft/args_map.h create mode 100644 include/dftracer/utils/utilities/composites/dft/dft_event_dispatcher.h create mode 100644 include/dftracer/utils/utilities/composites/dft/dft_event_visitor.h create mode 100644 
include/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h create mode 100644 include/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.h create mode 100644 include/dftracer/utils/utilities/composites/dft/parse_inflated.h create mode 100644 include/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.h create mode 100644 include/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.h create mode 100644 include/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.h create mode 100644 include/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.h create mode 100644 include/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.h create mode 100644 include/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h create mode 100644 include/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.h create mode 100644 include/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.h create mode 100644 include/dftracer/utils/utilities/fileio/parallel/layout.h create mode 100644 include/dftracer/utils/utilities/fileio/parallel/merge.h create mode 100644 include/dftracer/utils/utilities/fileio/parallel/parallel_writer.h create mode 100644 include/dftracer/utils/utilities/indexer/file_partition.h create mode 100644 include/dftracer/utils/utilities/indexer/index_batch_sink.h create mode 100644 include/dftracer/utils/utilities/indexer/index_database_sst_writer_context.h create mode 100644 include/dftracer/utils/utilities/indexer/index_database_writer_context.h create mode 100644 include/dftracer/utils/utilities/indexer/index_file_entry_capability.h create mode 100644 include/dftracer/utils/utilities/indexer/index_types.h create mode 100644 include/dftracer/utils/utilities/indexer/internal/index_encoding.h create mode 100644 include/dftracer/utils/utilities/indexer/internal/payload_codec.h create mode 100644 
include/dftracer/utils/utilities/indexer/internal/statistics_codec.h delete mode 100644 include/dftracer/utils/utilities/indexer/visitors/bloom_visitor.h delete mode 100644 include/dftracer/utils/utilities/indexer/visitors/manifest_visitor.h create mode 100644 python/dftracer/utils/indexer.py create mode 100644 python/dftracer/utils/trace_reader.py create mode 100644 src/dftracer/utils/binaries/common_cli.h create mode 100644 src/dftracer/utils/binaries/dftracer_aggregator_mpi.cpp create mode 100644 src/dftracer/utils/binaries/dftracer_call_tree_mpi.cpp create mode 100644 src/dftracer/utils/core/common/memory_budget.cpp delete mode 100644 src/dftracer/utils/core/rocksdb/async.cpp create mode 100644 src/dftracer/utils/python/arrow_parallel_reader.cpp create mode 100644 src/dftracer/utils/python/arrow_parallel_reader.h create mode 100644 src/dftracer/utils/python/arrow_stream_capsule.cpp create mode 100644 src/dftracer/utils/python/arrow_stream_capsule.h create mode 100644 src/dftracer/utils/python/batch_byte_size.h create mode 100644 src/dftracer/utils/python/batch_indexer.cpp create mode 100644 src/dftracer/utils/python/batch_indexer.h create mode 100644 src/dftracer/utils/python/index_database.cpp create mode 100644 src/dftracer/utils/python/index_database.h create mode 100644 src/dftracer/utils/python/memoryview_batch.cpp create mode 100644 src/dftracer/utils/python/memoryview_batch.h create mode 100644 src/dftracer/utils/python/schema_reconcile.cpp create mode 100644 src/dftracer/utils/python/schema_reconcile.h create mode 100644 src/dftracer/utils/python/sst_distribution.cpp create mode 100644 src/dftracer/utils/python/sst_distribution.h create mode 100644 src/dftracer/utils/python/streaming_iterator.cpp create mode 100644 src/dftracer/utils/python/streaming_iterator.h create mode 100644 src/dftracer/utils/utilities/call_tree/call_tree_save_arrow.cpp create mode 100644 src/dftracer/utils/utilities/call_tree/call_tree_save_binary.cpp create mode 100644 
src/dftracer/utils/utilities/common/arrow/ipc_reader.cpp create mode 100644 src/dftracer/utils/utilities/common/arrow/parallel_reader.cpp create mode 100644 src/dftracer/utils/utilities/common/arrow/partition_router.cpp create mode 100644 src/dftracer/utils/utilities/common/arrow/partition_writer.cpp create mode 100644 src/dftracer/utils/utilities/common/json/parser.cpp create mode 100644 src/dftracer/utils/utilities/common/statistics/timestamp_histogram.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.cpp create mode 100644 
src/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.cpp create mode 100644 src/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.cpp create mode 100644 src/dftracer/utils/utilities/fileio/parallel/layout.cpp create mode 100644 src/dftracer/utils/utilities/fileio/parallel/merge.cpp create mode 100644 src/dftracer/utils/utilities/fileio/parallel/padded_striped_writer.cpp create mode 100644 src/dftracer/utils/utilities/fileio/parallel/sharded_writer.cpp create mode 100644 src/dftracer/utils/utilities/fileio/parallel/striped_writer.cpp create mode 100644 src/dftracer/utils/utilities/indexer/index_database_sst_writer_context.cpp create mode 100644 src/dftracer/utils/utilities/indexer/index_database_writer_context.cpp create mode 100644 src/dftracer/utils/utilities/indexer/internal/common/gzip_member_scanner.h create mode 100644 src/dftracer/utils/utilities/indexer/internal/index_batch_writer.h create mode 100644 src/dftracer/utils/utilities/indexer/internal/index_encoding.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/visitors/bloom_visitor.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/visitors/manifest_visitor.cpp create mode 100644 tests/binaries/test_dftracer_aggregator_mpi.cpp create mode 100644 tests/binaries/test_dftracer_call_tree.cpp create mode 100644 tests/binaries/test_dftracer_call_tree_mpi.cpp create mode 100644 tests/python/test_distributed_manifest.py create mode 100644 tests/python/test_trace_reader_directory.py create mode 100644 tests/python/test_trace_reader_write_arrow.py create mode 100644 tests/replay/test_replay_fidelity.cpp create mode 100644 tests/utilities/common/arrow/test_arrow_ipc_reader.cpp create mode 100644 tests/utilities/common/statistics/test_timestamp_histogram.cpp create mode 
100644 tests/utilities/composites/dft/aggregators/test_aggregation_augmentation.cpp create mode 100644 tests/utilities/composites/dft/aggregators/test_aggregation_serialization.cpp create mode 100644 tests/utilities/composites/dft/aggregators/test_system_metrics.cpp create mode 100644 tests/utilities/composites/dft/aggregators/test_system_metrics_merge_operator.cpp create mode 100644 tests/utilities/fileio/parallel/test_layout_sizing.cpp create mode 100644 tests/utilities/fileio/parallel/test_padded_striped_writer.cpp create mode 100644 tests/utilities/fileio/parallel/test_sharded_writer.cpp create mode 100644 tests/utilities/fileio/parallel/test_striped_writer.cpp create mode 100644 tests/utilities/indexer/test_sst_ingest_spike.cpp diff --git a/.envrc b/.envrc deleted file mode 100644 index 3550a30f..00000000 --- a/.envrc +++ /dev/null @@ -1 +0,0 @@ -use flake diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4ef4b04e..00ed292a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,10 @@ name: CI on: push: - branches: [ main, develop, initialize, 'feat/**', 'fix/**', 'chore/**' ] + branches: + [main, develop, initialize, "feat/**", "fix/**", "chore/**", "perf/**"] pull_request: - branches: [ main, develop ] + branches: [main, develop] workflow_dispatch: jobs: @@ -13,22 +14,22 @@ jobs: outputs: code: ${{ steps.filter.outputs.code }} steps: - - uses: actions/checkout@v6 - - uses: dorny/paths-filter@v3.0.2 - id: filter - with: - filters: | - code: - - 'include/**' - - 'src/**' - - 'tests/**' - - 'python/**' - - 'cmake/**' - - 'CMakeLists.txt' - - 'CMakePresets.json' - - 'pyproject.toml' - - 'Makefile' - - '.github/workflows/ci.yml' + - uses: actions/checkout@v6 + - uses: dorny/paths-filter@v3.0.2 + id: filter + with: + filters: | + code: + - 'include/**' + - 'src/**' + - 'tests/**' + - 'python/**' + - 'cmake/**' + - 'CMakeLists.txt' + - 'CMakePresets.json' + - 'pyproject.toml' + - 'Makefile' + - '.github/workflows/ci.yml' 
test: needs: changes @@ -42,82 +43,82 @@ jobs: matrix: os: [ubuntu-22.04, ubuntu-24.04, ubuntu-latest, macos-latest] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] - + steps: - - uses: actions/checkout@v6 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v6.1.0 - with: - python-version: ${{ matrix.python-version }} + - uses: actions/checkout@v6 - - name: Cache ccache - uses: actions/cache@v5 - with: - path: ~/.ccache - key: ccache-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('CMakeLists.txt', 'pyproject.toml', '.github/workflows/ci.yml') }} - restore-keys: | - ccache-${{ runner.os }}-${{ matrix.python-version }}- - ccache-${{ runner.os }}- - - - name: Install dependencies (Ubuntu) - if: runner.os == 'Linux' - run: | - sudo apt-get update - sudo apt-get install -y build-essential cmake ccache lcov zlib1g-dev libsqlite3-dev pkg-config ninja-build - - - name: Install dependencies (macOS) - if: runner.os == 'macOS' - run: | - brew update - for f in cmake ccache lcov zlib sqlite pkg-config ninja; do - if brew list --versions "$f" >/dev/null; then - echo "$f already installed" - else - brew install "$f" - fi - done + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6.1.0 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache ccache + uses: actions/cache@v5 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('CMakeLists.txt', 'pyproject.toml', '.github/workflows/ci.yml') }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.python-version }}- + ccache-${{ runner.os }}- + + - name: Install dependencies (Ubuntu) + if: runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install -y build-essential cmake ccache lcov zlib1g-dev libsqlite3-dev pkg-config ninja-build - - name: Run coverage - if: (matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12' - run: | - make 
coverage + - name: Install dependencies (macOS) + if: runner.os == 'macOS' + run: | + brew update + for f in cmake ccache lcov zlib sqlite pkg-config ninja; do + if brew list --versions "$f" >/dev/null; then + echo "$f already installed" + else + brew install "$f" + fi + done - - name: Run test (Unix) - if: "!((matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12')" - run: | - make test - - - name: Run Python tests (with venv) - if: "!((matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12')" - run: | - if [ "${{ runner.os }}" = "Linux" ] && [ "${{ matrix.python-version }}" = "3.12" ]; then - make test-py RUN_TY=1 - else - make test-py - fi + - name: Run coverage + if: (matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12' + run: | + make coverage + + - name: Run test (Unix) + if: "!((matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12')" + run: | + make test + + - name: Run Python tests (with venv) + if: "!((matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12')" + run: | + if [ "${{ runner.os }}" = "Linux" ] && [ "${{ matrix.python-version }}" = "3.12" ]; then + make test-py RUN_TY=1 + else + make test-py + fi - - name: Upload coverage reports to Coveralls - if: (matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12' - uses: coverallsapp/github-action@v2.3.6 - continue-on-error: true - with: - file: coverage/coverage_filtered.info - format: lcov - flag-name: ${{ matrix.os }} - parallel: true - env: - COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }} + - name: Upload coverage reports to Coveralls + if: (matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12' + uses: coverallsapp/github-action@v2.3.6 + continue-on-error: true + with: + file: coverage/coverage_filtered.info + 
format: lcov + flag-name: ${{ matrix.os }} + parallel: true + env: + COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }} coverage-finish: needs: test if: always() runs-on: ubuntu-latest steps: - - name: Coveralls finished - uses: coverallsapp/github-action@v2.3.6 - continue-on-error: true - with: - parallel-finished: true - env: - COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }} + - name: Coveralls finished + uses: coverallsapp/github-action@v2.3.6 + continue-on-error: true + with: + parallel-finished: true + env: + COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }} diff --git a/.gitignore b/.gitignore index 3cb352f7..e4ec0d8a 100644 --- a/.gitignore +++ b/.gitignore @@ -94,3 +94,4 @@ PLANS.md docs/plans profiling-results*/ +dfanalyzer/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 238c0f41..9985f689 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,8 +65,11 @@ option(DFTRACER_UTILS_ENABLE_ASAN "Enable AddressSanitizer" OFF) option(DFTRACER_UTILS_ENABLE_UBSAN "Enable UndefinedBehaviorSanitizer" OFF) option(DFTRACER_UTILS_ENABLE_TSAN "Enable ThreadSanitizer" OFF) option(DFTRACER_UTILS_ENABLE_MPI "Enable MPI support for call tree" OFF) +option(DFTRACER_USE_ZLIB_NG "Use zlib-ng (compat) instead of madler/zlib; falls back to madler on failure" ON) option(DFTRACER_UTILS_ENABLE_ARROW "Enable Arrow C Data Interface via nanoarrow" ON) option(DFTRACER_UTILS_ENABLE_ARROW_IPC "Enable Arrow IPC file read/write via nanoarrow" ON) +option(DFTRACER_UTILS_ENABLE_ZSTD "Enable ZSTD compression for RocksDB" ON) +option(DFTRACER_UTILS_ENABLE_LZ4 "Enable LZ4 compression for RocksDB" OFF) if(DFTRACER_UTILS_TESTS) message(STATUS "Building tests") @@ -136,6 +139,18 @@ else() message(STATUS "kqueue support: disabled (sys/event.h not found)") endif() +# lustreapi: optional for stripe_count / stripe_size queries. When absent, the +# parallel writer treats Lustre like an opaque POSIX filesystem. 
+check_include_file("lustre/lustreapi.h" HAVE_LUSTRE_LUSTREAPI_H) +find_library(LUSTREAPI_LIBRARY NAMES lustreapi) +if(HAVE_LUSTRE_LUSTREAPI_H AND LUSTREAPI_LIBRARY) + set(DFTRACER_UTILS_HAVE_LUSTREAPI ON) + message(STATUS "lustreapi support: enabled (${LUSTREAPI_LIBRARY})") +else() + set(DFTRACER_UTILS_HAVE_LUSTREAPI OFF) + message(STATUS "lustreapi support: disabled (header or library not found)") +endif() + # Set C++ standard set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) diff --git a/CMakePresets.json b/CMakePresets.json index dfb7deab..e3c0079a 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -77,7 +77,8 @@ "inherits": "dev", "cacheVariables": { "DFTRACER_UTILS_TESTS": "ON", - "DFTRACER_UTILS_DEBUG": "ON" + "DFTRACER_UTILS_DEBUG": "ON", + "DFTRACER_UTILS_BUILD_PYTHON": "ON" } }, { diff --git a/cmake/configure_files/dftracer_utils_config.dbg.h.in b/cmake/configure_files/dftracer_utils_config.dbg.h.in index 114d35cb..bd8a58f9 100644 --- a/cmake/configure_files/dftracer_utils_config.dbg.h.in +++ b/cmake/configure_files/dftracer_utils_config.dbg.h.in @@ -31,6 +31,13 @@ /* Macro flags */ #cmakedefine DFTRACER_UTILS_HAS_STD_FILESYSTEM 1 +/* Feature flags */ +#cmakedefine DFTRACER_UTILS_ENABLE_ARROW 1 +#cmakedefine DFTRACER_UTILS_ENABLE_ARROW_IPC 1 +#cmakedefine DFTRACER_UTILS_ENABLE_LZ4 1 +#cmakedefine DFTRACER_UTILS_ENABLE_ZSTD 1 +#cmakedefine DFTRACER_UTILS_HAVE_LUSTREAPI 1 + #define DFTRACER_UTILS_LOGGER_CPP_LOGGER 1 #define DFTRACER_UTILS_LOGGER_LEVEL_TRACE 1 #define DFTRACER_UTILS_LOGGER_LEVEL_DEBUG 1 diff --git a/cmake/configure_files/dftracer_utils_config.h.in b/cmake/configure_files/dftracer_utils_config.h.in index 7c4482cc..486f59e7 100644 --- a/cmake/configure_files/dftracer_utils_config.h.in +++ b/cmake/configure_files/dftracer_utils_config.h.in @@ -31,6 +31,13 @@ /* Macro flags */ #cmakedefine DFTRACER_UTILS_HAS_STD_FILESYSTEM 1 +/* Feature flags */ +#cmakedefine DFTRACER_UTILS_ENABLE_ARROW 1 +#cmakedefine 
DFTRACER_UTILS_ENABLE_ARROW_IPC 1 +#cmakedefine DFTRACER_UTILS_ENABLE_LZ4 1 +#cmakedefine DFTRACER_UTILS_ENABLE_ZSTD 1 +#cmakedefine DFTRACER_UTILS_HAVE_LUSTREAPI 1 + #define DFTRACER_UTILS_LOGGER_CPP_LOGGER 1 #define DFTRACER_UTILS_LOGGER_LEVEL_TRACE 0 #define DFTRACER_UTILS_LOGGER_LEVEL_DEBUG 0 diff --git a/cmake/modules/Dependencies.cmake b/cmake/modules/Dependencies.cmake index 2c5ccbfc..454b2ae4 100644 --- a/cmake/modules/Dependencies.cmake +++ b/cmake/modules/Dependencies.cmake @@ -273,6 +273,47 @@ function(need_nonstd_span) endif() endfunction() +function(need_unordered_dense) + if(NOT unordered_dense_ADDED) + cpmaddpackage( + NAME + unordered_dense + GITHUB_REPOSITORY + martinus/unordered_dense + VERSION + 4.4.0 + OPTIONS + "UNORDERED_DENSE_INSTALL ON" + FORCE + YES) + endif() +endfunction() + +function(link_unordered_dense TARGET_NAME) + if(NOT TARGET_NAME) + message(FATAL_ERROR "link_unordered_dense: TARGET_NAME is required") + endif() + + if(NOT TARGET ${TARGET_NAME}) + message( + FATAL_ERROR + "link_unordered_dense: Target '${TARGET_NAME}' does not exist") + endif() + + if(NOT TARGET unordered_dense::unordered_dense) + message( + FATAL_ERROR + "link_unordered_dense: ankerl::unordered_dense not found! Call need_unordered_dense() first." 
+ ) + endif() + + get_target_property(UD_INC unordered_dense::unordered_dense + INTERFACE_INCLUDE_DIRECTORIES) + target_include_directories(${TARGET_NAME} PUBLIC + "$" + "$") +endfunction() + function(need_tl_expected) # tl::expected is only needed when C++23 std::expected is unavailable if(CMAKE_CXX_STANDARD GREATER_EQUAL 23) @@ -353,80 +394,94 @@ function(link_tl_expected TARGET_NAME) endfunction() # ============================================================================== -# JSON and Serialization Dependencies +# simdjson - SIMD-accelerated JSON parser (On-Demand API for zero-copy) # ============================================================================== -function(need_yyjson) - if(NOT yyjson_ADDED) +function(need_simdjson) + if(NOT simdjson_ADDED) cpmaddpackage( NAME - yyjson + simdjson GITHUB_REPOSITORY - ibireme/yyjson + simdjson/simdjson VERSION - 0.12.0 + 4.6.1 GIT_TAG - 0.12.0 - FORCE - YES + v4.6.1 DOWNLOAD_ONLY YES) endif() - set(YYJSON_SOVERSION 0) - set(YYJSON_TARGETS) + if(simdjson_ADDED AND NOT TARGET simdjson) + message(STATUS "Building simdjson library (v4.6.1)") - if(DFTRACER_UTILS_BUILD_STATIC) - add_library(yyjson_static STATIC ${yyjson_SOURCE_DIR}/src/yyjson.h - ${yyjson_SOURCE_DIR}/src/yyjson.c) - target_include_directories( - yyjson_static PUBLIC $) - set_target_properties( - yyjson_static - PROPERTIES VERSION ${PROJECT_VERSION} - SOVERSION ${YYJSON_SOVERSION} - ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) - add_library(yyjson::yyjson_static ALIAS yyjson_static) - list(APPEND YYJSON_TARGETS yyjson_static) - message(STATUS "Added yyjson static library") - endif() + # simdjson is a single-header + single-source library + set(SIMDJSON_SOURCES + ${simdjson_SOURCE_DIR}/singleheader/simdjson.h + ${simdjson_SOURCE_DIR}/singleheader/simdjson.cpp) - if(DFTRACER_UTILS_BUILD_SHARED) - add_library(yyjson_shared SHARED ${yyjson_SOURCE_DIR}/src/yyjson.h - ${yyjson_SOURCE_DIR}/src/yyjson.c) - target_include_directories( - yyjson_shared 
PUBLIC $) - set_target_properties( - yyjson_shared - PROPERTIES VERSION ${PROJECT_VERSION} - SOVERSION ${YYJSON_SOVERSION} - OUTPUT_NAME yyjson - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib - ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) - add_library(yyjson::yyjson ALIAS yyjson_shared) - list(APPEND YYJSON_TARGETS yyjson_shared) - message(STATUS "Added yyjson shared library") - elseif(DFTRACER_UTILS_BUILD_STATIC) - # If only static is built, make it the default alias - add_library(yyjson::yyjson ALIAS yyjson_static) - endif() + set(SIMDJSON_TARGETS) - install(FILES ${yyjson_SOURCE_DIR}/src/yyjson.h - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) - if(YYJSON_TARGETS) - install( - TARGETS ${YYJSON_TARGETS} - EXPORT yyjsonTargets - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + if(DFTRACER_UTILS_BUILD_STATIC) + add_library(simdjson_static STATIC ${SIMDJSON_SOURCES}) + target_include_directories( + simdjson_static SYSTEM PUBLIC + $ + $) + target_compile_features(simdjson_static PUBLIC cxx_std_17) + # Suppress warnings from simdjson (third-party code) + target_compile_options(simdjson_static PRIVATE -w) + set_target_properties( + simdjson_static + PROPERTIES + OUTPUT_NAME simdjson + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + POSITION_INDEPENDENT_CODE ON) + add_library(simdjson::simdjson_static ALIAS simdjson_static) + list(APPEND SIMDJSON_TARGETS simdjson_static) + message(STATUS "Added simdjson static library") + endif() - # Install the export set so other projects can find yyjson - install( - EXPORT yyjsonTargets - FILE yyjsonTargets.cmake - NAMESPACE yyjson:: - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/yyjson) + if(DFTRACER_UTILS_BUILD_SHARED) + add_library(simdjson_shared SHARED ${SIMDJSON_SOURCES}) + target_include_directories( + simdjson_shared SYSTEM PUBLIC + $ + $) + target_compile_features(simdjson_shared PUBLIC cxx_std_17) + # Suppress warnings from 
simdjson (third-party code) + target_compile_options(simdjson_shared PRIVATE -w) + set_target_properties( + simdjson_shared + PROPERTIES + OUTPUT_NAME simdjson + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) + add_library(simdjson::simdjson ALIAS simdjson_shared) + list(APPEND SIMDJSON_TARGETS simdjson_shared) + message(STATUS "Added simdjson shared library") + elseif(DFTRACER_UTILS_BUILD_STATIC) + add_library(simdjson::simdjson ALIAS simdjson_static) + endif() + + # Install header + install(FILES ${simdjson_SOURCE_DIR}/singleheader/simdjson.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + + if(SIMDJSON_TARGETS) + install( + TARGETS ${SIMDJSON_TARGETS} + EXPORT simdjsonTargets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + + install( + EXPORT simdjsonTargets + FILE simdjsonTargets.cmake + NAMESPACE simdjson:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/simdjson) + endif() endif() endfunction() @@ -497,13 +552,13 @@ function(need_rocksdb) "ROCKSDB_BUILD_SHARED ${DFTRACER_UTILS_BUILD_SHARED}" "WITH_TESTS OFF" "WITH_TOOLS OFF" - "WITH_CORE_TOOLS OFF" + "WITH_CORE_TOOLS ON" "WITH_BENCHMARK_TOOLS OFF" "WITH_GFLAGS OFF" "WITH_SNAPPY OFF" - "WITH_LZ4 ON" + "WITH_LZ4 ${DFTRACER_UTILS_ENABLE_LZ4}" "WITH_ZLIB ON" - "WITH_ZSTD OFF" + "WITH_ZSTD ${DFTRACER_UTILS_ENABLE_ZSTD}" "WITH_BZ2 OFF" "USE_RTTI ON" "FAIL_ON_WARNINGS OFF" @@ -587,6 +642,23 @@ function(need_rocksdb) "${CMAKE_INSTALL_RPATH}" PARENT_SCOPE) + # Stage rocksdb's ldb (and sst_dump) into bin/ and reuse the standard + # $ORIGIN/../lib rpath helper so they find librocksdb.so without + # LD_LIBRARY_PATH. Install alongside our own binaries and ship a + # venv wrapper when building a Python wheel. 
+ foreach(tool ldb sst_dump) + if(TARGET ${tool}) + set_target_properties( + ${tool} PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "${CMAKE_BINARY_DIR}/bin") + target_add_rpath(${tool}) + install(TARGETS ${tool} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + if(SKBUILD) + create_python_wrapper(${tool}) + endif() + endif() + endforeach() + set(RocksDB_FOUND TRUE PARENT_SCOPE) @@ -840,7 +912,130 @@ function(need_lz4) endif() endfunction() +function(_try_zlib_ng OUT_VAR) + set(${OUT_VAR} + FALSE + PARENT_SCOPE) + + cpmaddpackage( + NAME + zlib-ng + GITHUB_REPOSITORY + zlib-ng/zlib-ng + VERSION + 2.3.3 + GIT_TAG + 2.3.3 + OPTIONS + "ZLIB_COMPAT ON" + "ZLIB_ENABLE_TESTS OFF" + "ZLIBNG_ENABLE_TESTS OFF" + "WITH_GTEST OFF" + "WITH_OPTIM ON" + "WITH_NEW_STRATEGIES ON" + "WITH_NATIVE_INSTRUCTIONS OFF" + "INSTALL_UTILS OFF" + "SKIP_INSTALL_ALL ON") + + if(NOT zlib-ng_ADDED) + message(WARNING "zlib-ng CPM add failed; will fall back to madler/zlib") + return() + endif() + + # zlib-ng compat mode: real targets are `zlib-ng` (shared) and + # `zlib-ng-static` (static); `zlib`/`zlibstatic` are ALIAS-only and cannot + # have properties or further aliases set on them. 
+ set(ZLIB_NG_TARGETS) + if(DFTRACER_UTILS_BUILD_SHARED AND TARGET zlib-ng) + get_target_property(_zng_type zlib-ng TYPE) + if(_zng_type STREQUAL "SHARED_LIBRARY") + set_target_properties( + zlib-ng PROPERTIES OUTPUT_NAME dftracer_zlib LIBRARY_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}/lib) + target_include_directories( + zlib-ng PUBLIC $) + add_library(dftracer_zlib_shared ALIAS zlib-ng) + add_library(dftracer::zlib ALIAS zlib-ng) + list(APPEND ZLIB_NG_TARGETS zlib-ng) + message(STATUS "Using zlib-ng (compat, shared) as dftracer_zlib") + endif() + endif() + + if(DFTRACER_UTILS_BUILD_STATIC AND TARGET zlib-ng-static) + set_target_properties( + zlib-ng-static PROPERTIES OUTPUT_NAME dftracer_zlib + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) + target_include_directories( + zlib-ng-static PUBLIC $) + add_library(dftracer_zlib_static ALIAS zlib-ng-static) + add_library(dftracer::zlibstatic ALIAS zlib-ng-static) + if(NOT TARGET dftracer::zlib) + add_library(dftracer::zlib ALIAS zlib-ng-static) + endif() + list(APPEND ZLIB_NG_TARGETS zlib-ng-static) + message(STATUS "Using zlib-ng (compat, static) as dftracer_zlib") + endif() + + if(NOT ZLIB_NG_TARGETS) + message(WARNING "zlib-ng targets not found after CPM add; falling back") + return() + endif() + + install( + TARGETS ${ZLIB_NG_TARGETS} + EXPORT ZlibTargets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + install( + EXPORT ZlibTargets + FILE ZlibTargets.cmake + NAMESPACE dftracer:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/zlib) + + # Compat headers: zlib-ng generates zlib.h/zconf.h in its binary dir when + # ZLIB_COMPAT=ON. Fall back to source dir if generated copy is absent. 
+ foreach(hdr zlib.h zconf.h) + if(EXISTS "${zlib-ng_BINARY_DIR}/${hdr}") + install(FILES "${zlib-ng_BINARY_DIR}/${hdr}" + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + elseif(EXISTS "${zlib-ng_SOURCE_DIR}/${hdr}") + install(FILES "${zlib-ng_SOURCE_DIR}/${hdr}" + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + endif() + endforeach() + + set(ZLIB_SOURCE_DIR + ${zlib-ng_SOURCE_DIR} + PARENT_SCOPE) + set(ZLIB_BINARY_DIR + ${zlib-ng_BINARY_DIR} + PARENT_SCOPE) + set(${OUT_VAR} + TRUE + PARENT_SCOPE) +endfunction() + function(need_zlib) + if(DFTRACER_USE_ZLIB_NG) + _try_zlib_ng(_ZLIB_NG_OK) + if(_ZLIB_NG_OK) + set(ZLIB_CPM + TRUE + PARENT_SCOPE) + set(ZLIB_SOURCE_DIR + ${ZLIB_SOURCE_DIR} + PARENT_SCOPE) + set(ZLIB_BINARY_DIR + ${ZLIB_BINARY_DIR} + PARENT_SCOPE) + set(ZLIB_FOUND + FALSE + PARENT_SCOPE) + return() + endif() + endif() + find_package(ZLIB 1.2 QUIET) if(ZLIB_FOUND) @@ -1132,50 +1327,119 @@ function(link_zlib TARGET_NAME LIBRARY_TYPE) endif() endfunction() +function(need_zstd) + find_package(zstd QUIET CONFIG) + if(NOT zstd_FOUND) + find_path(zstd_INCLUDE_DIRS NAMES zstd.h) + find_library(zstd_LIBRARIES NAMES zstd) + if(zstd_INCLUDE_DIRS AND zstd_LIBRARIES) + set(zstd_FOUND TRUE) + endif() + endif() + + if(zstd_FOUND) + message(STATUS "Found system zstd") + if(NOT TARGET zstd::libzstd_shared AND NOT TARGET zstd::libzstd_static) + if(DEFINED zstd_LIBRARIES) + add_library(zstd::libzstd_shared UNKNOWN IMPORTED) + set_target_properties( + zstd::libzstd_shared + PROPERTIES IMPORTED_LOCATION "${zstd_LIBRARIES}" + INTERFACE_INCLUDE_DIRECTORIES "${zstd_INCLUDE_DIRS}") + endif() + endif() + set(zstd_FOUND + TRUE + PARENT_SCOPE) + set(zstd_CPM + FALSE + PARENT_SCOPE) + else() + if(NOT zstd_ADDED) + cpmaddpackage( + NAME + zstd + GITHUB_REPOSITORY + facebook/zstd + VERSION + 1.5.7 + GIT_TAG + v1.5.7 + SOURCE_SUBDIR + build/cmake + OPTIONS + "ZSTD_BUILD_PROGRAMS OFF" + "ZSTD_BUILD_TESTS OFF" + "ZSTD_BUILD_SHARED ${DFTRACER_UTILS_BUILD_SHARED}" + "ZSTD_BUILD_STATIC ON") + endif() 
+ + if(zstd_ADDED) + message(STATUS "Built zstd with CPM") + set(zstd_FOUND + TRUE + PARENT_SCOPE) + set(zstd_CPM + TRUE + PARENT_SCOPE) + set(zstd_FOUND + TRUE + CACHE BOOL "zstd availability" FORCE) + endif() + endif() +endfunction() + # ============================================================================== # Hashing and Cryptography Dependencies # ============================================================================== -function(link_yyjson TARGET_NAME LIBRARY_TYPE) +function(link_simdjson TARGET_NAME LIBRARY_TYPE) # Validate parameters if(NOT TARGET_NAME) - message(FATAL_ERROR "link_yyjson: TARGET_NAME is required") + message(FATAL_ERROR "link_simdjson: TARGET_NAME is required") endif() if(NOT LIBRARY_TYPE MATCHES "^(STATIC|SHARED)$") message( - FATAL_ERROR "link_yyjson: LIBRARY_TYPE must be either STATIC or SHARED") + FATAL_ERROR "link_simdjson: LIBRARY_TYPE must be either STATIC or SHARED") endif() if(NOT TARGET ${TARGET_NAME}) - message(FATAL_ERROR "link_yyjson: Target '${TARGET_NAME}' does not exist") + message(FATAL_ERROR "link_simdjson: Target '${TARGET_NAME}' does not exist") endif() - # Link appropriate yyjson variant Use PUBLIC linkage since yyjson headers may - # be included in public headers + # Link appropriate simdjson variant if(LIBRARY_TYPE STREQUAL "STATIC") - # For static libraries, prefer static yyjson if available - if(TARGET yyjson_static) - target_link_libraries(${TARGET_NAME} PUBLIC yyjson::yyjson_static) - message(STATUS "Linked ${TARGET_NAME} to yyjson_static") - elseif(TARGET yyjson_shared) - target_link_libraries(${TARGET_NAME} PUBLIC yyjson::yyjson) - message(STATUS "Linked ${TARGET_NAME} to yyjson (shared)") + # For static libraries, prefer static simdjson if available + if(TARGET simdjson_static) + target_link_libraries(${TARGET_NAME} PUBLIC simdjson::simdjson_static) + message(STATUS "Linked ${TARGET_NAME} to simdjson_static") + elseif(TARGET simdjson_shared) + target_link_libraries(${TARGET_NAME} PUBLIC 
simdjson::simdjson) + message(STATUS "Linked ${TARGET_NAME} to simdjson (shared)") + elseif(TARGET simdjson::simdjson) + # System / find_package() simdjson (e.g. Homebrew on macOS). + target_link_libraries(${TARGET_NAME} PUBLIC simdjson::simdjson) + message(STATUS "Linked ${TARGET_NAME} to system simdjson::simdjson") else() message( - FATAL_ERROR "link_yyjson: No yyjson found! Call need_yyjson() first.") + FATAL_ERROR "link_simdjson: No simdjson found! Call need_simdjson() first.") endif() else() # SHARED - # For shared libraries, prefer shared yyjson if available - if(TARGET yyjson_shared) - target_link_libraries(${TARGET_NAME} PUBLIC yyjson::yyjson) - message(STATUS "Linked ${TARGET_NAME} to yyjson (shared)") - elseif(TARGET yyjson_static) - target_link_libraries(${TARGET_NAME} PUBLIC yyjson::yyjson_static) - message(STATUS "Linked ${TARGET_NAME} to yyjson_static") + # For shared libraries, prefer shared simdjson if available + if(TARGET simdjson_shared) + target_link_libraries(${TARGET_NAME} PUBLIC simdjson::simdjson) + message(STATUS "Linked ${TARGET_NAME} to simdjson (shared)") + elseif(TARGET simdjson_static) + target_link_libraries(${TARGET_NAME} PUBLIC simdjson::simdjson_static) + message(STATUS "Linked ${TARGET_NAME} to simdjson_static") + elseif(TARGET simdjson::simdjson) + # System / find_package() simdjson (e.g. Homebrew on macOS). + target_link_libraries(${TARGET_NAME} PUBLIC simdjson::simdjson) + message(STATUS "Linked ${TARGET_NAME} to system simdjson::simdjson") else() message( - FATAL_ERROR "link_yyjson: No yyjson found! Call need_yyjson() first.") + FATAL_ERROR "link_simdjson: No simdjson found! 
Call need_simdjson() first.") endif() endif() endfunction() @@ -1350,12 +1614,16 @@ function(need_nanoarrow) nanoarrow_static PUBLIC $ $) - endif() - target_compile_definitions(nanoarrow_static - PUBLIC DFTRACER_UTILS_ENABLE_ARROW) - if(DFTRACER_UTILS_ENABLE_ARROW_IPC) - target_compile_definitions(nanoarrow_static - PUBLIC DFTRACER_UTILS_ENABLE_ARROW_IPC) + # Enable zstd compression for Arrow IPC + if(DFTRACER_UTILS_ENABLE_ZSTD) + target_compile_definitions(nanoarrow_static + PRIVATE NANOARROW_IPC_WITH_ZSTD) + if(TARGET zstd::libzstd_static) + target_link_libraries(nanoarrow_static PRIVATE zstd::libzstd_static) + elseif(TARGET zstd::libzstd_shared) + target_link_libraries(nanoarrow_static PRIVATE zstd::libzstd_shared) + endif() + endif() endif() set_target_properties( nanoarrow_static @@ -1380,12 +1648,16 @@ function(need_nanoarrow) nanoarrow_shared PUBLIC $ $) - endif() - target_compile_definitions(nanoarrow_shared - PUBLIC DFTRACER_UTILS_ENABLE_ARROW) - if(DFTRACER_UTILS_ENABLE_ARROW_IPC) - target_compile_definitions(nanoarrow_shared - PUBLIC DFTRACER_UTILS_ENABLE_ARROW_IPC) + # Enable zstd compression for Arrow IPC + if(DFTRACER_UTILS_ENABLE_ZSTD) + target_compile_definitions(nanoarrow_shared + PRIVATE NANOARROW_IPC_WITH_ZSTD) + if(TARGET zstd::libzstd_shared) + target_link_libraries(nanoarrow_shared PRIVATE zstd::libzstd_shared) + elseif(TARGET zstd::libzstd_static) + target_link_libraries(nanoarrow_shared PRIVATE zstd::libzstd_static) + endif() + endif() endif() set_target_properties( nanoarrow_shared diff --git a/cmake/modules/InstallHelpers.cmake b/cmake/modules/InstallHelpers.cmake index 4c776c93..c7f0f442 100644 --- a/cmake/modules/InstallHelpers.cmake +++ b/cmake/modules/InstallHelpers.cmake @@ -196,71 +196,87 @@ else() endif() endif() -# YYJSON dependency -find_library(YYJSON_LIBRARY_BUNDLED - NAMES yyjson libyyjson +# GHC_FILESYSTEM dependency (header-only) +find_path(GHC_FILESYSTEM_INCLUDE_DIR_BUNDLED + NAMES ghc/filesystem.hpp + PATHS 
\${_IMPORT_PREFIX}/include + NO_DEFAULT_PATH +) + +if(GHC_FILESYSTEM_INCLUDE_DIR_BUNDLED AND NOT TARGET ghc_filesystem) + add_library(ghc_filesystem INTERFACE IMPORTED) + set_target_properties(ghc_filesystem PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES \"\${GHC_FILESYSTEM_INCLUDE_DIR_BUNDLED}\" + ) +else() + # Try to find system ghc_filesystem + find_dependency(ghc_filesystem QUIET) +endif() + +# UNORDERED_DENSE dependency (header-only) +find_path(UNORDERED_DENSE_INCLUDE_DIR_BUNDLED + NAMES ankerl/unordered_dense.h + PATHS \${_IMPORT_PREFIX}/include + NO_DEFAULT_PATH +) + +if(UNORDERED_DENSE_INCLUDE_DIR_BUNDLED AND NOT TARGET unordered_dense::unordered_dense) + add_library(unordered_dense::unordered_dense INTERFACE IMPORTED) + set_target_properties(unordered_dense::unordered_dense PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES \"\${UNORDERED_DENSE_INCLUDE_DIR_BUNDLED}\" + ) +else() + find_dependency(unordered_dense QUIET) +endif() + +# SIMDJSON dependency +find_library(SIMDJSON_LIBRARY_BUNDLED + NAMES simdjson libsimdjson PATHS \${_IMPORT_PREFIX}/lib NO_DEFAULT_PATH ) -if(YYJSON_LIBRARY_BUNDLED) - # Found yyjson that was built with this package - find_path(YYJSON_INCLUDE_DIR_BUNDLED - NAMES yyjson.h +if(SIMDJSON_LIBRARY_BUNDLED) + # Found simdjson that was built with this package + find_path(SIMDJSON_INCLUDE_DIR_BUNDLED + NAMES simdjson.h PATHS \${_IMPORT_PREFIX}/include NO_DEFAULT_PATH ) - if(YYJSON_INCLUDE_DIR_BUNDLED) + if(SIMDJSON_INCLUDE_DIR_BUNDLED) # Create shared target if not exists - if(NOT TARGET yyjson::yyjson) - add_library(yyjson::yyjson UNKNOWN IMPORTED) - set_target_properties(yyjson::yyjson PROPERTIES - IMPORTED_LOCATION \"\${YYJSON_LIBRARY_BUNDLED}\" - INTERFACE_INCLUDE_DIRECTORIES \"\${YYJSON_INCLUDE_DIR_BUNDLED}\" + if(NOT TARGET simdjson::simdjson) + add_library(simdjson::simdjson UNKNOWN IMPORTED) + set_target_properties(simdjson::simdjson PROPERTIES + IMPORTED_LOCATION \"\${SIMDJSON_LIBRARY_BUNDLED}\" + INTERFACE_INCLUDE_DIRECTORIES 
\"\${SIMDJSON_INCLUDE_DIR_BUNDLED}\" ) endif() # Also look for static version - find_library(YYJSON_STATIC_LIBRARY_BUNDLED - NAMES yyjson_static libyyjson_static + find_library(SIMDJSON_STATIC_LIBRARY_BUNDLED + NAMES simdjson_static libsimdjson_static PATHS \${_IMPORT_PREFIX}/lib NO_DEFAULT_PATH ) - if(YYJSON_STATIC_LIBRARY_BUNDLED AND NOT TARGET yyjson::yyjson_static) - add_library(yyjson::yyjson_static UNKNOWN IMPORTED) - set_target_properties(yyjson::yyjson_static PROPERTIES - IMPORTED_LOCATION \"\${YYJSON_STATIC_LIBRARY_BUNDLED}\" - INTERFACE_INCLUDE_DIRECTORIES \"\${YYJSON_INCLUDE_DIR_BUNDLED}\" + if(SIMDJSON_STATIC_LIBRARY_BUNDLED AND NOT TARGET simdjson::simdjson_static) + add_library(simdjson::simdjson_static UNKNOWN IMPORTED) + set_target_properties(simdjson::simdjson_static PROPERTIES + IMPORTED_LOCATION \"\${SIMDJSON_STATIC_LIBRARY_BUNDLED}\" + INTERFACE_INCLUDE_DIRECTORIES \"\${SIMDJSON_INCLUDE_DIR_BUNDLED}\" ) endif() endif() else() - # Try to find system yyjson (require minimum version 0.10.0) - find_dependency(yyjson 0.10.0 QUIET) - if(NOT yyjson_FOUND) - message(WARNING \"yyjson not found or version too old. Minimum version 0.10.0 is required.\") + # Try to find system simdjson (require minimum version 3.0.0) + find_dependency(simdjson 3.0.0 QUIET) + if(NOT simdjson_FOUND) + message(WARNING \"simdjson not found or version too old. 
Minimum version 3.0.0 is required.\") endif() endif() -# GHC_FILESYSTEM dependency (header-only) -find_path(GHC_FILESYSTEM_INCLUDE_DIR_BUNDLED - NAMES ghc/filesystem.hpp - PATHS \${_IMPORT_PREFIX}/include - NO_DEFAULT_PATH -) - -if(GHC_FILESYSTEM_INCLUDE_DIR_BUNDLED AND NOT TARGET ghc_filesystem) - add_library(ghc_filesystem INTERFACE IMPORTED) - set_target_properties(ghc_filesystem PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES \"\${GHC_FILESYSTEM_INCLUDE_DIR_BUNDLED}\" - ) -else() - # Try to find system ghc_filesystem - find_dependency(ghc_filesystem QUIET) -endif() - # CPP-LOGGER dependency find_library(CPP_LOGGER_LIBRARY_BUNDLED NAMES cpp-logger libcpp-logger @@ -288,6 +304,151 @@ else() find_dependency(cpp-logger QUIET) endif() +# LZ4 dependency (used by RocksDB) +find_library(LZ4_LIBRARY_BUNDLED + NAMES lz4 liblz4 + PATHS \${_IMPORT_PREFIX}/lib + NO_DEFAULT_PATH +) + +if(LZ4_LIBRARY_BUNDLED) + find_path(LZ4_INCLUDE_DIR_BUNDLED + NAMES lz4.h + PATHS \${_IMPORT_PREFIX}/include + NO_DEFAULT_PATH + ) + + if(LZ4_INCLUDE_DIR_BUNDLED AND NOT TARGET lz4::lz4) + add_library(lz4::lz4 UNKNOWN IMPORTED) + set_target_properties(lz4::lz4 PROPERTIES + IMPORTED_LOCATION \"\${LZ4_LIBRARY_BUNDLED}\" + INTERFACE_INCLUDE_DIRECTORIES \"\${LZ4_INCLUDE_DIR_BUNDLED}\" + ) + endif() +else() + find_dependency(lz4 QUIET) +endif() + +# ZSTD dependency (compression) +find_library(ZSTD_LIBRARY_BUNDLED + NAMES zstd libzstd + PATHS \${_IMPORT_PREFIX}/lib + NO_DEFAULT_PATH +) + +if(ZSTD_LIBRARY_BUNDLED) + find_path(ZSTD_INCLUDE_DIR_BUNDLED + NAMES zstd.h + PATHS \${_IMPORT_PREFIX}/include + NO_DEFAULT_PATH + ) + + if(ZSTD_INCLUDE_DIR_BUNDLED AND NOT TARGET zstd::libzstd_shared) + add_library(zstd::libzstd_shared UNKNOWN IMPORTED) + set_target_properties(zstd::libzstd_shared PROPERTIES + IMPORTED_LOCATION \"\${ZSTD_LIBRARY_BUNDLED}\" + INTERFACE_INCLUDE_DIRECTORIES \"\${ZSTD_INCLUDE_DIR_BUNDLED}\" + ) + endif() + + # Also look for static version + find_library(ZSTD_STATIC_LIBRARY_BUNDLED + NAMES 
zstd_static libzstd_static + PATHS \${_IMPORT_PREFIX}/lib + NO_DEFAULT_PATH + ) + + if(ZSTD_STATIC_LIBRARY_BUNDLED AND NOT TARGET zstd::libzstd_static) + add_library(zstd::libzstd_static UNKNOWN IMPORTED) + set_target_properties(zstd::libzstd_static PROPERTIES + IMPORTED_LOCATION \"\${ZSTD_STATIC_LIBRARY_BUNDLED}\" + INTERFACE_INCLUDE_DIRECTORIES \"\${ZSTD_INCLUDE_DIR_BUNDLED}\" + ) + endif() +else() + find_dependency(zstd QUIET) +endif() + +# ROCKSDB dependency (database for indexing) +find_library(ROCKSDB_LIBRARY_BUNDLED + NAMES rocksdb librocksdb + PATHS \${_IMPORT_PREFIX}/lib + NO_DEFAULT_PATH +) + +if(ROCKSDB_LIBRARY_BUNDLED) + find_path(ROCKSDB_INCLUDE_DIR_BUNDLED + NAMES rocksdb/db.h + PATHS \${_IMPORT_PREFIX}/include + NO_DEFAULT_PATH + ) + + if(ROCKSDB_INCLUDE_DIR_BUNDLED AND NOT TARGET RocksDB::rocksdb) + add_library(RocksDB::rocksdb UNKNOWN IMPORTED) + set_target_properties(RocksDB::rocksdb PROPERTIES + IMPORTED_LOCATION \"\${ROCKSDB_LIBRARY_BUNDLED}\" + INTERFACE_INCLUDE_DIRECTORIES \"\${ROCKSDB_INCLUDE_DIR_BUNDLED}\" + ) + endif() + + # Also look for static version + find_library(ROCKSDB_STATIC_LIBRARY_BUNDLED + NAMES rocksdb_static librocksdb_static rocksdb + PATHS \${_IMPORT_PREFIX}/lib + NO_DEFAULT_PATH + ) + + if(ROCKSDB_STATIC_LIBRARY_BUNDLED AND NOT TARGET RocksDB::rocksdb-shared) + add_library(RocksDB::rocksdb-shared UNKNOWN IMPORTED) + set_target_properties(RocksDB::rocksdb-shared PROPERTIES + IMPORTED_LOCATION \"\${ROCKSDB_STATIC_LIBRARY_BUNDLED}\" + INTERFACE_INCLUDE_DIRECTORIES \"\${ROCKSDB_INCLUDE_DIR_BUNDLED}\" + ) + endif() +else() + find_dependency(RocksDB QUIET) +endif() + +# NANOARROW dependency (Arrow support) +find_library(NANOARROW_LIBRARY_BUNDLED + NAMES nanoarrow libnanoarrow + PATHS \${_IMPORT_PREFIX}/lib + NO_DEFAULT_PATH +) + +if(NANOARROW_LIBRARY_BUNDLED) + find_path(NANOARROW_INCLUDE_DIR_BUNDLED + NAMES nanoarrow/nanoarrow.h + PATHS \${_IMPORT_PREFIX}/include + NO_DEFAULT_PATH + ) + + if(NANOARROW_INCLUDE_DIR_BUNDLED AND NOT 
TARGET nanoarrow::nanoarrow) + add_library(nanoarrow::nanoarrow UNKNOWN IMPORTED) + set_target_properties(nanoarrow::nanoarrow PROPERTIES + IMPORTED_LOCATION \"\${NANOARROW_LIBRARY_BUNDLED}\" + INTERFACE_INCLUDE_DIRECTORIES \"\${NANOARROW_INCLUDE_DIR_BUNDLED}\" + ) + endif() + + # Also look for static version + find_library(NANOARROW_STATIC_LIBRARY_BUNDLED + NAMES nanoarrow_static libnanoarrow_static + PATHS \${_IMPORT_PREFIX}/lib + NO_DEFAULT_PATH + ) + + if(NANOARROW_STATIC_LIBRARY_BUNDLED AND NOT TARGET nanoarrow::nanoarrow_static) + add_library(nanoarrow::nanoarrow_static UNKNOWN IMPORTED) + set_target_properties(nanoarrow::nanoarrow_static PROPERTIES + IMPORTED_LOCATION \"\${NANOARROW_STATIC_LIBRARY_BUNDLED}\" + INTERFACE_INCLUDE_DIRECTORIES \"\${NANOARROW_INCLUDE_DIR_BUNDLED}\" + ) + endif() +else() + find_dependency(nanoarrow QUIET) +endif() + # Include the targets file include(\"\${CMAKE_CURRENT_LIST_DIR}/${PKG_TARGET}Targets.cmake\") diff --git a/cmake/modules/PrecompiledHeader.cmake b/cmake/modules/PrecompiledHeader.cmake index 07e90f8e..b668db5b 100644 --- a/cmake/modules/PrecompiledHeader.cmake +++ b/cmake/modules/PrecompiledHeader.cmake @@ -81,9 +81,20 @@ function(detect_common_headers) set(FILTERED_SOURCES "") foreach(SOURCE_FILE ${ALL_SOURCES}) # Exclude Python binding files (only built when DFTRACER_UTILS_BUILD_PYTHON is ON) - if(NOT SOURCE_FILE MATCHES "/python/") - list(APPEND FILTERED_SOURCES "${SOURCE_FILE}") + if(SOURCE_FILE MATCHES "/python/") + continue() endif() + # Exclude MPI-guarded sources when MPI is off. They still live on + # disk and include <mpi.h>, which would otherwise land in the PCH + # (MIN_COUNT=2 is easy to hit) and break every non-MPI target + # because no MPI include path is attached. 
+ if(NOT DFTRACER_UTILS_ENABLE_MPI) + if(SOURCE_FILE MATCHES "/mpi/" + OR SOURCE_FILE MATCHES "_mpi\\.(cpp|cc|cxx|h|hpp)$") + continue() + endif() + endif() + list(APPEND FILTERED_SOURCES "${SOURCE_FILE}") endforeach() set(ALL_SOURCES "${FILTERED_SOURCES}") diff --git a/docs/scripts/generate_api_index.py b/docs/scripts/generate_api_index.py index 08404bf0..291e2cc9 100644 --- a/docs/scripts/generate_api_index.py +++ b/docs/scripts/generate_api_index.py @@ -551,12 +551,31 @@ def _generate_dir_index( rel = child[len(dir_path) :].lstrip("/") if dir_path else child entries.append(f"{rel}/index") - # Leaf modules in this directory + # Leaf modules in this directory; ones that collide with a subdir of the + # same name are emitted as "/_namespace" so the namespace page lives + # inside the subdir's toctree (see resolved_filename in generate()). + child_names = {c.rsplit("/", 1)[-1] for c in child_dirs} leaves = sorted(dir_leaves.get(dir_path, []), key=lambda m: m.filename) for mod in leaves: rel = mod.filename[len(dir_path) :].lstrip("/") if dir_path else mod.filename + if rel in child_names: + continue entries.append(rel) + # Also include the namespace overview page when this dir's name was a + # colliding leaf in the parent (file written as "/_namespace.rst"). + if dir_path: + leaf_name = dir_path.rsplit("/", 1)[-1] if "/" in dir_path else dir_path + parent_dir = dir_path.rsplit("/", 1)[0] if "/" in dir_path else "" + parent_leaves = dir_leaves.get(parent_dir, []) + for mod in parent_leaves: + parent_rel = ( + mod.filename[len(parent_dir) :].lstrip("/") if parent_dir else mod.filename + ) + if parent_rel == leaf_name: + entries.insert(0, "_namespace") + break + if entries: lines.append(".. 
toctree::") lines.append(" :maxdepth: 1") @@ -578,11 +597,25 @@ def _generate_dir_index( lines.append(" - Items") lines.append(" - Namespace") + collisions = { + m.filename + for m in all_modules + if any( + other.filename.startswith(m.filename + "/") + for other in all_modules + if other is not m + ) + } total = 0 for mod in all_modules: count = len(mod.items) total += count - lines.append(f" * - :doc:`{mod.filename}`") + doc_path = ( + f"{mod.filename}/_namespace" + if mod.filename in collisions + else mod.filename + ) + lines.append(f" * - :doc:`{doc_path}`") lines.append(f" - {count}") lines.append(f" - ``{mod.full_ns}``") @@ -610,12 +643,27 @@ def generate(xml_dir: Path, output_dir: Path) -> None: modules = discover_modules(items) + # Detect leaf modules whose filename collides with a sibling subdir: + # e.g. "utilities/composites.rst" + directory "utilities/composites/". + # Re-route those leaves into "/_namespace.rst" so the namespace + # page lives under the subdir's toctree and Sphinx does not orphan it. 
+ dir_paths = {mod.filename.rsplit("/", 1)[0] for mod in modules if "/" in mod.filename} + dir_paths |= { + "/".join(mod.filename.split("/")[: i + 1]) + for mod in modules + for i in range(len(mod.filename.split("/")) - 1) + } + collisions = {mod.filename for mod in modules if mod.filename in dir_paths} + + def resolved_filename(mod: "Module") -> str: + return f"{mod.filename}/_namespace" if mod.filename in collisions else mod.filename + # Generate per-module pages output_dir.mkdir(parents=True, exist_ok=True) - expected_paths = {output_dir / f"{mod.filename}.rst" for mod in modules} + expected_paths = {output_dir / f"{resolved_filename(mod)}.rst" for mod in modules} for mod in modules: rst = generate_module_rst(mod, repo_root, repo_url, source_ref) - out_path = output_dir / f"{mod.filename}.rst" + out_path = output_dir / f"{resolved_filename(mod)}.rst" out_path.parent.mkdir(parents=True, exist_ok=True) out_path.write_text(rst) diff --git a/docs/source/api/indexer.rst b/docs/source/api/indexer.rst index 6be94a3e..3222cf37 100644 --- a/docs/source/api/indexer.rst +++ b/docs/source/api/indexer.rst @@ -1,13 +1,26 @@ Indexer Module ============== -The indexer module provides functionality for indexing and searching gzip trace -files using a root-local ``.dftindex`` store. +The indexer module provides functionality for indexing DFTracer trace files +(``.pfw`` / ``.pfw.gz``) backed by a ``.dftindex`` RocksDB store. The +top-level :class:`~dftracer.utils.Indexer` follows a ``resolve`` / ``build`` +pattern over a directory or file list and exposes the higher index tiers +(checkpoints, bloom filters, manifests, aggregation). +:class:`~dftracer.utils.CheckpointIndexer` is the lower-level single-file +interface used for checkpoint-level operations. Indexer Class ------------- -.. 
autoclass:: dftracer.utils.Indexer(gz_path: str, index_path: str | None = None, checkpoint_size: int = 1048576, force_rebuild: bool = False, build_bloom: bool = False, build_manifest: bool = False, index_threshold: int = 8388608, runtime: Runtime | None = None) +.. autoclass:: dftracer.utils.Indexer(directory: str = '', files: list[str] | None = None, index_dir: str = '', require_checkpoint: bool = True, require_bloom: bool = True, require_manifest: bool = True, require_aggregation: bool = False, time_interval_ms: float = 5000.0, group_keys: list[str] | None = None, custom_metric_fields: list[str] | None = None, compute_percentiles: bool = False, checkpoint_size: int = 33554432, parallelism: int = 0, force_rebuild: bool = False, runtime: Runtime | None = None) + :members: resolve, build, ensure_indexed, get_checkpoint_indexer, get_hash_table, query_file_pids, query_all_file_pids, query_file_info, iter_aggregation, iter_arrow_dfanalyzer, iter_arrow_dfanalyzer_all + :undoc-members: + :show-inheritance: + +CheckpointIndexer Class +----------------------- + +.. autoclass:: dftracer.utils.CheckpointIndexer(gz_path: str, index_path: str | None = None, checkpoint_size: int = 1048576, force_rebuild: bool = False, build_bloom: bool = False, build_manifest: bool = False, runtime: Runtime | None = None) :members: :undoc-members: :show-inheritance: @@ -20,3 +33,56 @@ IndexerCheckpoint Class :members: :undoc-members: :show-inheritance: + +Distributed Index (SST-based) +----------------------------- + +The distributed-index path lets the coordinator pre-register files, hand out +``file_id`` ranges to workers, and bulk-ingest worker-produced SST artifacts +back into the unified ``.dftindex`` store. + +IndexDatabase +~~~~~~~~~~~~~ + +.. 
autoclass:: dftracer.utils.dftracer_utils_ext.IndexDatabase(index_path: str) + :members: init_schema, register_files, reserve_file_id_range, bulk_ingest, rebuild_root_summaries, write_agg_global_config, write_agg_file_markers, write_aggregation_tracker + :undoc-members: + +SstArtifactRegistry +~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: dftracer.utils.dftracer_utils_ext.SstArtifactRegistry + :members: append + :undoc-members: + +Module-level Functions +---------------------- + +.. autofunction:: dftracer.utils.dftracer_utils_ext.scan_files + +.. autofunction:: dftracer.utils.dftracer_utils_ext.scan_aggregation_manifest + +.. autofunction:: dftracer.utils.dftracer_utils_ext.build_sst_batch + +.. autofunction:: dftracer.utils.dftracer_utils_ext.plan_lpt_partition + +.. autofunction:: dftracer.utils.dftracer_utils_ext.enumerate_gzip_members + +.. autofunction:: dftracer.utils.dftracer_utils_ext.plan_work_units + +.. autofunction:: dftracer.utils.dftracer_utils_ext.move_artifacts + +.. autofunction:: dftracer.utils.dftracer_utils_ext.enable_aggregation_deterministic_ids + +Dask Helpers +------------ + +The ``dftracer.utils.dask`` module provides Dask-distributed drivers built on +the SST-based primitives above: + +.. autofunction:: dftracer.utils.dask.distributed_index + +.. autofunction:: dftracer.utils.dask.distributed_aggregate + +Dask is an optional dependency -- this module is only importable when +``dask.distributed`` is installed. diff --git a/docs/source/api/reader.rst b/docs/source/api/reader.rst index 743e0d2c..f3160a3c 100644 --- a/docs/source/api/reader.rst +++ b/docs/source/api/reader.rst @@ -1,13 +1,27 @@ JSON Module =========== -The ``JSON`` class provides lazy JSON parsing backed by yyjson. +The ``JsonDictValue`` class is a zero-copy wrapper over a parsed DFTracer JSON +event. It is the element type yielded by +:meth:`~dftracer.utils.TraceReader.iter_json` and +:meth:`~dftracer.utils.TraceReader.read_json`. 
The underlying bytes are owned +by the C++ reader buffer; call :meth:`JsonDictValue.to_dict` to materialize a +regular Python dict for storage beyond the iterator's lifetime. -JSON Class ----------- +JsonDictValue Class +------------------- -.. autoclass:: dftracer.utils.JSON(json_str: str) - :members: +.. autoclass:: dftracer.utils.JsonDictValue + :members: keys, values, items, get, to_dict :undoc-members: :show-inheritance: - :special-members: __getitem__, __contains__, __str__, __repr__ + :special-members: __getitem__, __contains__, __len__ + +.. code-block:: python + + reader = TraceReader("trace.pfw.gz") + for event in reader.iter_json(): + name = event["name"] # __getitem__ + if "args" in event: # __contains__ + ret = event["args"].get("ret") # nested dict access + owned = event.to_dict() # materialize to plain dict diff --git a/docs/source/api/runtime.rst b/docs/source/api/runtime.rst index 8d357dad..b6b9da88 100644 --- a/docs/source/api/runtime.rst +++ b/docs/source/api/runtime.rst @@ -8,12 +8,16 @@ Pipeline/DAG overhead. Runtime Class ------------- -.. autoclass:: dftracer.utils.Runtime +.. autoclass:: dftracer.utils.Runtime(threads: int = 0, io_threads: int = 0) :members: :undoc-members: :show-inheritance: :special-members: __enter__, __exit__ +The ``threads`` argument sizes the compute pool; ``io_threads`` sizes a +separate pool dedicated to blocking I/O tasks. Both default to ``0``, +which lets the runtime auto-size based on the host. + TaskHandle Class ---------------- @@ -40,7 +44,7 @@ without a return value. import dftracer.utils as dft - rt = dft.Runtime(threads=8, python_threads=4) + rt = dft.Runtime(threads=8, io_threads=4) # Submit a Python callable h = rt.submit(lambda x, y: x + y, 3, 4, name="add") diff --git a/docs/source/api/trace_reader.rst b/docs/source/api/trace_reader.rst index ee9fc936..f63d904c 100644 --- a/docs/source/api/trace_reader.rst +++ b/docs/source/api/trace_reader.rst @@ -8,39 +8,48 @@ RocksDB store exists. 
TraceReader Class ----------------- -.. autoclass:: dftracer.utils.TraceReader(file_path: str, index_dir: str = '', checkpoint_size: int = 33554432, auto_build_index: bool = False, index_threshold: int = 8388608, runtime: Runtime | None = None) +.. autoclass:: dftracer.utils.TraceReader(path: str, index_dir: str = '', checkpoint_size: int = 33554432, auto_build_index: bool = False, runtime: Runtime | None = None) :members: :undoc-members: :show-inheritance: :special-members: __enter__, __exit__ +The ``path`` argument may be either a single trace file (``.pfw`` / ``.pfw.gz``) +or a directory. When a directory is given, all ``iter_*`` / ``read_*`` methods +discover ``.pfw`` and ``.pfw.gz`` files recursively and process them in parallel +on the Runtime thread pool. + Streaming Iterators ------------------- -``iter_lines()``, ``iter_raw()``, and ``iter_lines_json()`` return Python +``iter_lines()``, ``iter_raw()``, and ``iter_json()`` return Python iterators backed by a bounded producer-consumer queue. The C++ coroutine runs on the Runtime's thread pool and pushes items; Python's ``__next__`` pops. +``iter_lines()``, ``iter_raw()``, and ``read_lines()`` / ``read_raw()`` yield +``memoryview`` objects (zero-copy views over the C++ buffer). Wrap with +``bytes(mv)`` if you need an owned copy. + .. 
code-block:: python reader = TraceReader("trace.pfw.gz") - # Stream decoded lines + # Stream decoded lines (memoryview) for line in reader.iter_lines(): - process(line) # str + process(bytes(line)) # Stream raw byte chunks (one line per chunk) for chunk in reader.iter_raw(multi_line=False): - process(chunk) # bytes + process(chunk) # memoryview - # Stream parsed JSON objects - for obj in reader.iter_lines_json(): - print(obj["name"], obj["dur"]) # lazy JSON access + # Stream parsed JSON events (zero-copy JsonDictValue wrappers) + for obj in reader.iter_json(): + print(obj["name"], obj["dur"]) # Materialize to list - lines = reader.read_lines() # list[str] - chunks = reader.read_raw() # list[bytes] - objects = reader.read_lines_json() # list[JSON] + lines = reader.read_lines() # list[memoryview] + chunks = reader.read_raw() # list[memoryview] + objects = reader.read_json() # list[JsonDictValue] Arrow Output ------------ @@ -48,7 +57,9 @@ Arrow Output ``iter_arrow()`` and ``read_arrow()`` parse JSON events into columnar Arrow record batches using dynamic schema discovery. Each JSON key becomes a column; types are inferred from values (int64, uint64, double, string, bool). Nested -objects/arrays are serialized as JSON strings. +objects/arrays are serialized as JSON strings by default; pass +``flatten_objects=True`` to expand ``args`` into top-level columns, or +``normalize=True`` to coerce mixed-type columns into a canonical form. The returned objects implement the Arrow PyCapsule protocol (``__arrow_c_array__``) for zero-copy interchange with pyarrow, polars, and @@ -63,13 +74,44 @@ DuckDB. pa_batch = pyarrow.record_batch(batch) df = pa_batch.to_pandas() + # Single C-side stream drain via Arrow C Data Interface + stream = reader.iter_arrow_stream(batch_size=10000) + rbr = pyarrow.RecordBatchReader.from_stream(stream) + for batch in rbr: + ... 
+ # Materialize all events as ArrowTable table = reader.read_arrow() df = table.to_pandas() # requires pyarrow df = table.to_polars() # requires polars - # With range parameters - table = reader.read_arrow(start_line=100, end_line=200) + # With range parameters and object flattening + table = reader.read_arrow(start_line=100, end_line=200, flatten_objects=True) + +Writing Arrow IPC Files +----------------------- + +``write_arrow()`` writes trace data to Arrow IPC files with optional +view-based partitioning. For finer control, ``get_view_chunks()`` returns +the candidate chunks after bloom-filter pruning, and ``write_view_chunk`` / +``write_view_chunks`` write individual or batched chunks (the batched variant +runs all chunks concurrently on the Runtime). + +.. code-block:: python + + reader = TraceReader("trace.pfw.gz") + + # Partition by predefined views + result = reader.write_arrow( + "out/", + views=["io", "compute"], + chunk_size_mb=32, + compression="zstd", + ) + + # Custom view + explicit chunk plan + info = reader.get_view_chunks({"name": "posix", "query": 'cat == "POSIX"'}) + reader.write_view_chunks(info["chunks"], "out/", view="io") File Metadata ------------- @@ -90,14 +132,14 @@ reading the full file (when a ``.dftindex`` RocksDB index store exists): for i in range(num_workers): start = i * chunk_size end = min((i + 1) * chunk_size, max_bytes) - process(reader.read_lines_json(start_byte=start, end_byte=end)) + process(reader.read_json(start_byte=start, end_byte=end)) Query Filtering --------------- All line-based reading methods (``read_lines``, ``iter_lines``, -``iter_lines_json``, ``read_lines_json``, ``iter_arrow``, ``read_arrow``) -accept an optional ``query`` parameter for event filtering: +``iter_json``, ``read_json``, ``iter_arrow``, ``iter_arrow_stream``, +``read_arrow``) accept an optional ``query`` parameter for event filtering: .. 
code-block:: python @@ -186,13 +228,25 @@ All reading methods accept these keyword arguments: - ``buffer_size`` -- internal buffer size in bytes (default 4 MB) - ``query`` -- query DSL string for event filtering (default None) +Streaming methods (``iter_lines``, ``iter_raw``, ``iter_json``, +``iter_arrow``, ``iter_arrow_stream``) additionally accept: + +- ``memory_budget`` -- soft cap on in-flight bytes queued from the + C++ producer (0 = default) + ``iter_raw`` and ``read_raw`` additionally accept: - ``line_aligned`` -- if True, chunks are aligned to line boundaries (default True) - ``multi_line`` -- if True, chunks may contain multiple lines (default True) -``iter_arrow`` additionally accepts: +``iter_json`` and ``read_json`` additionally accept: + +- ``batch_size`` -- events per parse batch (default 1024) + +``iter_arrow``, ``iter_arrow_stream``, and ``read_arrow`` additionally accept: - ``batch_size`` -- maximum rows per Arrow batch (default 10000) +- ``flatten_objects`` -- expand object fields into top-level columns (default False) +- ``normalize`` -- coerce mixed-type columns into a canonical form (default False) Out-of-range values are clamped to the actual file bounds (no errors thrown). diff --git a/docs/source/api/utilities.rst b/docs/source/api/utilities.rst index f2d2fbb6..ab6399b2 100644 --- a/docs/source/api/utilities.rst +++ b/docs/source/api/utilities.rst @@ -138,14 +138,17 @@ StatisticsQueryUtility ~~~~~~~~~~~~~~~~~~~~~~ Query pre-computed statistics from an indexed trace file. -When bloom/chunk statistics are not available (e.g. file was below -``index_threshold``), the utility falls back to streaming the file -sequentially and computing statistics on-the-fly. +When bloom/chunk statistics are not available, the utility falls back to +streaming the file sequentially and computing statistics on-the-fly. .. 
autoclass:: dftracer.utils.dftracer_utils_ext.StatisticsQueryUtility(runtime: Runtime | None = None) :members: process :undoc-members: +``process(file_path, query_type="summary", top_n=10, index_dir="")`` returns +a dict; ``query_type`` accepts ``"summary"``, ``"top_n_names"``, and other +pre-computed statistics views. + .. code-block:: python sq = StatisticsQueryUtility() @@ -160,10 +163,9 @@ StatisticsAggregatorUtility ~~~~~~~~~~~~~~~~~~~~~~~~~~~ Aggregate statistics from a trace file. Uses pre-computed chunk -statistics from the ``.idx`` sidecar when available. When chunk -statistics are absent (e.g. file was below ``index_threshold``), -falls back to streaming the ``.pfw.gz`` line-by-line and computing -statistics on-the-fly. +statistics from the ``.dftindex`` store when available. When chunk +statistics are absent, falls back to streaming the ``.pfw.gz`` +line-by-line and computing statistics on-the-fly. .. autoclass:: dftracer.utils.dftracer_utils_ext.StatisticsAggregatorUtility(runtime: Runtime | None = None) :members: process @@ -197,10 +199,9 @@ ReorganizationPlannerUtility ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Plan semantic reorganization of trace files. When manifest data is -available in the ``.idx`` sidecar, produces per-checkpoint extraction -tasks. When manifest tables are absent (e.g. file was below -``index_threshold``), falls back to streaming the file line-by-line -and emitting one whole-file extraction task per query group. +available in the ``.dftindex`` store, produces per-checkpoint extraction +tasks. When manifest tables are absent, falls back to streaming the file +line-by-line and emitting one whole-file extraction task per query group. .. 
autoclass:: dftracer.utils.dftracer_utils_ext.ReorganizationPlannerUtility(runtime: Runtime | None = None) :members: process diff --git a/docs/source/call-tree.rst b/docs/source/call-tree.rst index f27d649f..eeb311f7 100644 --- a/docs/source/call-tree.rst +++ b/docs/source/call-tree.rst @@ -136,57 +136,113 @@ CallTree Serialization ------------- +Serialization moved to coroutine-based ``save_binary`` / ``save_arrow`` +free functions in ``dftracer/utils/call_tree/mpi/serializable.h``. The +legacy ``CallTree::save_to_file`` / ``save_to_json`` / ``load_from_file`` +methods have been removed; the API now exposes +``CallTree::internal_tree()`` for direct access to the underlying +``internal::CallTree`` consumed by the save/load coroutines. + **Save to binary format:** +The custom binary format uses a shared string dictionary (name, category, +arg keys / string values share storage) and preserves typed args +(``int`` / ``uint`` / ``double`` / ``bool`` instead of flattening to +strings). + .. code-block:: cpp - // Save to default path (based on input directory) - tree.save_to_file(); + #include + #include + #include + #include - // Save to custom path - tree.save_to_file("output.calltree"); + using namespace dftracer::utils; + using namespace dftracer::utils::call_tree; -**Save to JSON (Chrome Tracing / Perfetto):** + CallTree tree; + tree.load_from_directory("/path/to/traces", "*.pfw.gz"); + tree.generate(); -.. 
code-block:: cpp + auto task = make_task( + [&tree](CoroScope& scope) -> coro::CoroTask { + co_await save_binary(&scope, tree.internal_tree(), + "output.calltree"); + co_return; + }, + "save_binary"); + + Pipeline pipeline(PipelineConfig().with_name("calltree-save")); + pipeline.set_source({task}); + pipeline.execute(); - // Compatible with chrome://tracing and Perfetto UI - tree.save_to_json("output.pfw"); +**Save to Arrow IPC (.arrow):** -**Save to text file:** +Columnar Arrow IPC with buffer-level zstd compression and +dictionary-encoded ``name`` / ``category`` columns. Readable directly by +``pyarrow``, ``polars``, ``nanoarrow``, and DuckDB. Requires the build to +be configured with ``DFTRACER_UTILS_ENABLE_ARROW_IPC=ON``. .. code-block:: cpp - tree.print_depth_first_to_file("output.txt", 5); // Max depth 5 + auto task = make_task( + [&tree](CoroScope& scope) -> coro::CoroTask { + co_await save_arrow(&scope, tree.internal_tree(), + "output.arrow"); + co_return; + }, + "save_arrow"); + +**Load a previously saved tree:** -**Load from previously saved file:** +Both loaders are coroutines that return a fresh ``internal::CallTree``: .. code-block:: cpp - CallTree loaded_tree; - loaded_tree.load_from_file("output.calltree"); + auto task = make_task([](CoroScope& scope) -> coro::CoroTask { + auto loaded = co_await load_binary(&scope, "output.calltree"); + // or: auto loaded = co_await load_arrow(&scope, "output.arrow"); + printf("Loaded tree: %zu nodes\n", loaded->num_nodes()); + co_return; + }, "load"); - auto stats = loaded_tree.get_statistics(); - printf("Loaded tree: %zu nodes, %zu levels\n", - stats.total_nodes, stats.num_levels); +**Save to text file (still available on the high-level API):** + +.. 
code-block:: cpp + + tree.print_depth_first_to_file("output.txt", 5); // Max depth 5 Output Formats -------------- -Binary Format -~~~~~~~~~~~~~ +Binary Format (``.calltree``) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Compact custom format with a global string dictionary and typed args; best +for round-tripping trees between dftracer-utils runs (for example, between +a coordinator and downstream MPI ranks). Backed by the +``CALLTREE_BINARY_VERSION = 2`` header. + +Arrow IPC (``.arrow``) +~~~~~~~~~~~~~~~~~~~~~~ -Efficient binary serialization preserving all call tree information including node hierarchy, timing, function names, categories, and arguments. +Columnar Arrow IPC file with zstd buffer compression and +dictionary-encoded ``name`` / ``category`` columns. Best for analysis +pipelines that already speak Arrow (pyarrow, polars, DuckDB, nanoarrow). JSON Format (Chrome Tracing) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Follows the Chrome Tracing format specification, viewable in ``chrome://tracing`` or `Perfetto UI `_. Shows timeline of function calls with nested relationships and duration. +The ``dftracer_call_tree`` CLI emits Chrome Tracing JSON (gzipped with +``--gzip``) suitable for ``chrome://tracing`` and +`Perfetto UI <https://ui.perfetto.dev>`_. Programmatic JSON export is no +longer exposed on the ``CallTree`` C++ API. Text Format ~~~~~~~~~~~ -Human-readable text with indentation showing hierarchical structure, function names, categories, and timing at each level. +Human-readable text with indentation showing hierarchical structure, +function names, categories, and timing at each level. Performance Considerations -------------------------- diff --git a/docs/source/cli.rst b/docs/source/cli.rst index 8dd090a5..e4b74464 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -3,6 +3,57 @@ Command-Line Tools DFTracer Utils provides several command-line utilities for working with DFTracer trace files and compressed archives. +.. 
_cli-shared-flags: + +Shared CLI Flags +---------------- + +Most tools wire in a common set of argument schemas defined in +``src/dftracer/utils/binaries/common_cli.h``. The flags below have identical +semantics across every binary that exposes the relevant schema and are not +repeated in each tool's section. + +**Pipeline** (``PipelineArgs``) + +- ``--executor-threads <N>`` - Number of worker threads for parallel + processing (default: number of CPU cores) +- ``--io-threads <N>`` - Number of I/O threads (default: number of CPU + cores) +- ``--time-profiling`` - Print stage timing breakdown to stderr + +**Indexing** (``IndexingArgs``) + +- ``--index-dir <dir>`` - Directory for ``.dftindex`` stores +- ``--checkpoint-size <bytes>`` - Checkpoint size for gzip indexing in bytes + (default: 33554432 B / 32 MB) +- ``-f, --force`` - Force index recreation + +**Query** (``QueryArgs``) + +- ``--query <expr>`` - Query DSL filter + (e.g., ``'cat == "POSIX" and dur > 1000'``) + +**Watchdog** (``WatchdogArgs``) + +- ``--disable-watchdog`` - Disable watchdog for hang detection +- ``--watchdog-global-timeout <sec>`` - Watchdog global timeout for pipeline + execution in seconds (0 = no timeout, default: 0) +- ``--watchdog-task-timeout <sec>`` - Watchdog default task timeout in seconds + (0 = no timeout, default: 0) +- ``--watchdog-interval <sec>`` - Watchdog check interval in seconds + (default: 1) +- ``--watchdog-warning-threshold <sec>`` - Watchdog long-running task warning + threshold in seconds (default: 300) +- ``--watchdog-idle-timeout <sec>`` - Watchdog idle timeout in seconds + (0 = use default, default: 300) +- ``--watchdog-deadlock-timeout <sec>`` - Watchdog deadlock timeout in seconds + (0 = use default, default: 600) + +**Inputs** (``DirectoryArgs`` / ``FilesArgs``) + +- ``-d, --directory <dir>`` - Directory containing trace files +- ``--files <paths>`` - Trace files (``.pfw``, ``.pfw.gz``) + dftracer_reader --------------- @@ -357,6 +408,12 @@ dftracer_index - ``--false-positive-rate `` - Bloom filter false positive rate (default: 
0.01) - ``--read-batch-size `` - Batch read size in MB for stream processing (default: 4) - ``--manifest`` - Also build manifest tables in .idx (per-checkpoint event line routing) +- ``--rebuild-summaries`` - Rebuild ``ROOT_*`` aggregated summaries after ingest. + Off by default; ``ROOT_*`` CFs are only consumed by summary tools such as + ``dftracer_info``. Bloom-filter chunk-skipping queries do not require them. + +This binary also accepts the shared :ref:`cli-shared-flags` (Pipeline, +Watchdog, Indexing). **Example:** @@ -771,3 +828,108 @@ dftracer_comparator } ] } + +dftracer_aggregator_mpi +----------------------- + +**Description:** MPI driver for the distributed-SST aggregator. Each rank +produces per-rank aggregation SSTs; rank 0 bulk-ingests and the ranks jointly +write the final gzip JSON output. Requires the build to be configured with +``DFTRACER_UTILS_ENABLE_MPI=ON``. + +The pipeline is structured as a five-task DAG executed inside the standard +``Pipeline`` runtime: + +``scan -> phase_a -> phase_b -> phase_c -> merge`` + +- **scan** - Cooperative gzip-member pre-scan, ``Allgatherv`` of the member + map, and deterministic Longest-Processing-Time (LPT) assignment of work + units to ranks. +- **phase_a** - Each rank runs the distributed-SST indexer + aggregation + visitor on its slice and writes SSTs (and ``tracker.bin``) to its rank + staging directory. SSTs are optionally moved to a shared-FS staging root + for the coordinator. +- **phase_b** - Rank 0 ``Gatherv`` of artifact lists and a single + ``IndexDatabase::bulk_ingest`` + tracker merge. +- **phase_c** - Each rank writes a shard-prefixed Perfetto gzip JSON slice + using ``PerfettoTraceWriterUtility``. +- **merge** - Parallel ``pwrite`` on Lustre-striped output or serial + concatenation otherwise. + +**Usage:** + +.. 
code-block:: bash + + mpirun -n <N> dftracer_aggregator_mpi [OPTIONS] + +**Options:** + +- ``-d, --directory <dir>`` - Input directory containing .pfw or .pfw.gz + files (default: ``.``) +- ``-o, --output <path>`` - Output gzip JSON path. ``.gz`` is appended if + missing (default: ``aggregated_output.json.gz``) +- ``-t, --time-interval <ms>`` - Time interval in milliseconds for bucketing + (default: 5000) +- ``--staging-dir <dir>`` - Per-rank SST staging root. Defaults to + ``/_staging``; each rank writes to ``<staging-dir>/rank_<rank>``. +- ``--shared-staging <dir>`` - Shared-FS staging root. When set and + different from ``--staging-dir``, each rank moves its SSTs and + ``tracker.bin`` from the (node-local) staging dir to + ``<shared-staging>/rank_<rank>`` before the coordinator ingest. Required for + multi-node runs where ``--staging-dir`` points at node-local NVMe. +- ``--keep-staging`` - Keep per-rank SST staging dirs after a successful + ingest + +This binary also accepts the shared :ref:`cli-shared-flags` (Pipeline and +Indexing schemas). Per-rank ``--executor-threads`` / ``--io-threads`` are +automatically scaled down by the detected processes-per-node count so +co-located ranks do not oversubscribe cores. + +**Example:** + +.. code-block:: bash + + # 16 ranks on one node, node-local staging + mpirun -n 16 dftracer_aggregator_mpi -d ./traces -o agg.json.gz + + # Multi-node run with shared staging on Lustre + mpirun -n 64 dftracer_aggregator_mpi -d /lustre/traces \ + --staging-dir /local/nvme/_staging \ + --shared-staging /lustre/scratch/_staging \ + -o /lustre/out/agg.json.gz + +dftracer_call_tree_mpi +---------------------- + +**Description:** MPI driver for parallel call-tree construction. Each rank +owns a slice of PIDs, emits a Chrome Tracing JSON shard, and rank 0 merges +the shards. Wraps the ``MPICallTreeBuilder`` engine +(``discover_pids -> build -> hierarchy -> write -> merge`` coro phases). +Requires ``DFTRACER_UTILS_ENABLE_MPI=ON``. + +**Usage:** + +.. 
code-block:: bash + + mpirun -n <N> dftracer_call_tree_mpi [OPTIONS] + +**Options:** + +- ``input`` - Input directory containing trace files [required] +- ``-o, --output <path>`` - Output JSON path (default: ``call_tree.pfw``) +- ``--staging-dir <dir>`` - Shared-FS staging root for per-rank shards + (default: ``.shards/``) +- ``--gzip`` - gzip the merged output (``.gz`` appended if needed) +- ``-v, --verbose`` - Verbose progress logging +- ``--keep-staging`` - Keep per-rank shard files after merge + +This binary also accepts the shared :ref:`cli-shared-flags` (Pipeline); +per-rank thread counts are scaled down by the detected processes-per-node +count. + +**Example:** + +.. code-block:: bash + + # 32 ranks across nodes; gzip merged output + mpirun -n 32 dftracer_call_tree_mpi ./traces -o call_tree.pfw --gzip diff --git a/docs/source/conf.py b/docs/source/conf.py index e46e120a..687f4718 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -62,77 +62,116 @@ def _install_rtd_extension_stub() -> None: return ext = types.ModuleType(ext_name) - JSONPrimitive = str | int | float | bool | None class _BaseNative: """RTD stub for native extension classes.""" pass - class TaskHandle(_BaseNative): - """Handle to a submitted task. + class _ArrowBatchCapsule(_BaseNative): + """Internal Arrow batch wrapper implementing __arrow_c_array__ protocol.""" - Returned by asynchronous utility calls and by - :class:`dftracer.utils.Runtime`. The handle can be waited on, - queried for completion, or used to fetch the task result. 
- """ + @property + def num_rows(self) -> int: + return 0 - name = "" - task_id = 0 + @property + def num_columns(self) -> int: + return 0 - def get(self) -> object | None: - """Block until the task completes and return its result.""" - return None + def __arrow_c_array__(self, requested_schema: object = None) -> tuple[object, object]: + return (None, None) - def wait(self) -> None: - """Block until the task completes.""" + class _ArrowBatchStream(_BaseNative): + """Zero-iteration Arrow stream backed by the C++ coroutine channel. + + Implements the Arrow C Data Interface stream protocol. Pass directly + to ``pyarrow.RecordBatchReader.from_stream()`` or ``pyarrow.table()``. + Single-use: consuming ``__arrow_c_stream__`` once exhausts the object. + """ + + def __arrow_c_stream__(self, requested_schema: object = None) -> object: return None - def done(self) -> bool: - """Return ``True`` when the task has completed.""" - return True + class JsonDictValue(_BaseNative): + """Zero-copy wrapper over a parsed DFTracer JSON event. + + Supports dict-like access: ``event['name']``, ``event['args']['ret']``. + Call ``.to_dict()`` to materialize a regular Python dict. + """ + + def __getitem__(self, key: str) -> object: + raise KeyError(key) + + def __len__(self) -> int: + return 0 + + def __contains__(self, key: str) -> bool: + return False + + def keys(self) -> list[str]: + return [] + + def values(self) -> list[object]: + return [] + + def items(self) -> list[tuple[str, object]]: + return [] + + def get(self, key: str, default: object = None) -> object: + return default + + def to_dict(self) -> dict[str, object]: + return {} + + class IndexerCheckpoint(_BaseNative): + """Information about a checkpoint in the index.""" + + checkpoint_idx = 0 + uc_offset = 0 + uc_size = 0 + c_offset = 0 + c_size = 0 + bits = 0 + num_lines = 0 class Runtime(_BaseNative): - """Lightweight task runtime for native and Python work. + """Lightweight coroutine runtime wrapping Executor + Watchdog. 
- The runtime owns the executor threads used by coroutine-backed - readers, indexers, and utilities. The higher-level Python - wrapper in :mod:`dftracer.utils.runtime` builds on this native - object to support Python callables and richer task tracking. + Note: For user-facing API, use dftracer.utils.Runtime (Python wrapper) + which adds submit(), Python callable support, and error handling. """ - threads = 0 - - def __init__(self, threads: int = 0) -> None: - """Create a runtime with an optional worker-thread count.""" - super().__init__(threads) - self.threads = threads + def __init__(self, threads: int = 0, io_threads: int = 0) -> None: + self._threads = threads + self._io_threads = io_threads def shutdown(self) -> None: - """Stop the runtime and release worker resources.""" return None def wait_all(self) -> None: - """Block until all submitted native work completes.""" return None def get_progress(self) -> dict[str, object]: - """Return runtime progress metadata.""" return {} def is_responsive(self) -> bool: - """Return whether the watchdog still considers the runtime healthy.""" return True def set_timeout(self, global_ms: int = 0) -> None: - """Set a global watchdog timeout in milliseconds.""" return None def set_default_task_timeout(self, ms: int = 0) -> None: - """Set the default per-task timeout in milliseconds.""" return None + @property + def threads(self) -> int: + return self._threads + + @property + def io_threads(self) -> int: + return self._io_threads + def __enter__(self) -> "Runtime": - """Enter the runtime context manager.""" return self def __exit__( @@ -141,31 +180,192 @@ def __exit__( exc_val: BaseException | None, exc_tb: TracebackType | None, ) -> None: - """Exit the runtime context manager.""" return None - class IndexerCheckpoint(_BaseNative): - """Information about a single checkpoint in a ``.dftindex`` store. 
+ class Indexer(_BaseNative): + """Indexer with resolve/build pattern for tiered indexing.""" - Checkpoints map compressed and uncompressed offsets and carry - per-chunk metadata used for seeking and chunk-level pruning. - """ + def __init__( + self, + directory: str = "", + files: list[str] | None = None, + index_dir: str = "", + require_checkpoint: bool = True, + require_bloom: bool = True, + require_manifest: bool = True, + require_aggregation: bool = False, + time_interval_ms: float = 5000.0, + group_keys: list[str] | None = None, + custom_metric_fields: list[str] | None = None, + compute_percentiles: bool = False, + checkpoint_size: int = 32 * 1024 * 1024, + parallelism: int = 0, + force_rebuild: bool = False, + runtime: Runtime | None = None, + ) -> None: + """Create an indexer for trace files. + + At least one of 'directory' or 'files' must be provided. + + Args: + directory: Path to the directory containing trace files. + files: List of specific file paths to index. + index_dir: Directory for `.dftindex` stores. If empty, uses + directory-local paths. + require_checkpoint: If True, build checkpoint index (tier 1). + require_bloom: If True, build bloom filter data (tier 2). + require_manifest: If True, build manifest data (tier 2). + require_aggregation: If True, build aggregation data (tier 3). + time_interval_ms: Time interval for aggregation in milliseconds. + group_keys: Keys to group by for aggregation. + custom_metric_fields: Custom metric fields for aggregation. + compute_percentiles: If True, compute percentiles during aggregation. + parallelism: Number of parallel indexers. 0 = auto. + force_rebuild: If True, rebuild indices even if they exist. + runtime: Runtime instance for thread pool control. + """ + return None - checkpoint_idx = 0 - uc_offset = 0 - uc_size = 0 - c_offset = 0 - c_size = 0 - bits = 0 - num_lines = 0 + def resolve(self) -> dict[str, object]: + """Resolve which files need indexing. 
- class Indexer(_BaseNative): - """Build and query a root-local ``.dftindex`` RocksDB store. + Returns: + Dictionary with 'ready' and 'needs_work' file lists. + """ + return {} - The indexer extracts checkpoints and optional bloom/manifest - data for a compressed DFTracer trace. Readers and higher-level - utilities use this store for chunk pruning and random access. - """ + def build(self) -> dict[str, object]: + """Build indices for files that need work. + + Returns: + Dictionary with build status and statistics. + """ + return {} + + def ensure_indexed(self) -> dict[str, object]: + """Ensure all files are indexed by calling resolve then build if needed. + + Returns: + Dictionary with 'ready' and 'needs_work' file lists after indexing. + """ + return {} + + def get_checkpoint_indexer(self, file_path: str) -> "CheckpointIndexer": + """Get a checkpoint indexer for a specific file. + + Args: + file_path: Path to the trace file (.pfw/.pfw.gz). + + Returns: + CheckpointIndexer instance for checkpoint-level operations. + """ + return CheckpointIndexer(file_path) + + def get_hash_table(self, hash_type: str) -> dict[str, str]: + """Get hash table mapping hash values to original strings. + + Args: + hash_type: Type of hash table ('file', 'host', or 'string'). + + Returns: + Dict mapping hash strings to original values. + + Raises: + ValueError: If hash_type is not valid. + """ + return {} + + def query_file_pids(self, file_id: int) -> set: + """Query PIDs observed in a specific file. + + Args: + file_id: File identifier (0-based index). + + Returns: + Set of PIDs (int) observed in the file. + """ + return set() + + def query_all_file_pids(self) -> dict[int, set]: + """Query all file-to-PIDs mappings. + + Returns: + Dict mapping file_id to set of PIDs observed in that file. + """ + return {} + + def query_file_info(self) -> tuple[dict[int, str], dict[int, set]]: + """Query file ID to path mapping and per-file PIDs in one call. 
+ + Returns: + Tuple of (file_id_to_path, file_pids). + """ + return ({}, {}) + + def iter_aggregation( + self, + type: str = "events", + batch_size: int = 10000, + ) -> Iterator[object]: + """Iterate over aggregation data as Arrow batches. + + Args: + type: 'events', 'profiles', or 'system' + batch_size: Number of entries per batch (default 10000) + + Returns: + Iterator over Arrow batch capsules. + """ + return iter(()) + + def iter_arrow_dfanalyzer( + self, + type: str = "events", + batch_size: int = 10000, + time_granularity: float = 1.0, + time_resolution: float = 1e6, + query: str | None = None, + ) -> Iterator[object]: + """Iterate over aggregation data as dfanalyzer-compatible Arrow batches. + + Args: + type: 'events', 'profiles', or 'system' + batch_size: Number of entries per batch (default 10000) + time_granularity: Bucket width in seconds (default 1.0) + time_resolution: Microseconds per output time unit (default 1e6) + query: Optional query filter (e.g., "pid == 1234 or pid == 5678") + + Returns: + Iterator over Arrow batch capsules with dfanalyzer schema. + """ + return iter(()) + + def iter_arrow_dfanalyzer_all( + self, + batch_size: int = 10000, + time_granularity: float = 1.0, + time_resolution: float = 1e6, + query: str | None = None, + group_by: list[str] | None = None, + ) -> dict[str, list[object]]: + """Iterate over all aggregation types in a single scan. + + Args: + batch_size: Number of entries per batch (default 10000) + time_granularity: Bucket width in seconds (default 1.0) + time_resolution: Microseconds per output time unit (default 1e6) + query: Optional query filter (e.g., "pid == 1234 or pid == 5678") + group_by: Optional list of columns to group by for coarse in-scan + aggregation. When provided, output schema is reduced to the + requested group columns plus aggregated metrics. + + Returns: + Dict with 'events', 'profiles', 'system' keys containing Arrow batches. 
+ """ + return {"events": [], "profiles": [], "system": []} + + class CheckpointIndexer(_BaseNative): + """Checkpoint indexer for single-file checkpoint-level operations.""" def __init__( self, @@ -175,50 +375,90 @@ def __init__( force_rebuild: bool = False, build_bloom: bool = False, build_manifest: bool = False, - index_threshold: int = 8388608, runtime: Runtime | None = None, ) -> None: - """Create an indexer for a compressed DFTracer trace.""" - self.gz_path = gz_path - self.index_path = index_path or "" - self.checkpoint_size = checkpoint_size - self.has_bloom = build_bloom - self.has_manifest = build_manifest + """Create a checkpoint indexer for a gzip file. + + Args: + gz_path: Path to the gzip trace file. + index_path: Path to the `.dftindex` store. If None, uses the + root-local `.dftindex` next to ``gz_path``. + checkpoint_size: Checkpoint size in bytes for index building. + force_rebuild: If True, rebuild the index even if it exists. + build_bloom: If True, build bloom filter data in the index. + build_manifest: If True, build manifest data in the index. + runtime: Runtime instance for thread pool control. + If None, uses the default global Runtime. 
+ """ + self._gz_path = gz_path + self._index_path = index_path or "" + self._checkpoint_size = checkpoint_size + self._has_bloom = build_bloom + self._has_manifest = build_manifest def build(self) -> None: - """Build the index store for the configured trace file.""" + """Build the index.""" return None def need_rebuild(self) -> bool: - """Return whether the index is missing or stale.""" + """Check if index needs rebuilding.""" return False def exists(self) -> bool: - """Return whether the index store already exists.""" + """Check if the `.dftindex` store exists.""" return False def get_max_bytes(self) -> int: - """Return the maximum decompressed byte position in the trace.""" + """Get maximum byte position.""" return 0 def get_num_lines(self) -> int: - """Return the number of lines recorded in the index.""" + """Get number of lines.""" return 0 - def get_checkpoints(self) -> list["IndexerCheckpoint"]: - """Return all checkpoints stored for the trace.""" + def get_checkpoints(self) -> list[IndexerCheckpoint]: + """Get all checkpoints.""" return [] - def find_checkpoint(self, target_offset: int) -> "IndexerCheckpoint | None": - """Return the checkpoint closest to a decompressed offset.""" + def find_checkpoint(self, target_offset: int) -> IndexerCheckpoint | None: + """Find checkpoint for target offset.""" return None def close(self) -> None: - """Release this Python wrapper's native indexer handle.""" + """Release this Python wrapper's native indexer handle. + + This does not force-close the shared RocksDB instance for the same + ``.dftindex`` path. 
+ """ return None - def __enter__(self) -> "Indexer": - """Enter the indexer context manager.""" + @property + def gz_path(self) -> str: + """Get gzip path.""" + return self._gz_path + + @property + def index_path(self) -> str: + """Get the `.dftindex` path.""" + return self._index_path + + @property + def checkpoint_size(self) -> int: + """Get checkpoint size.""" + return self._checkpoint_size + + @property + def has_bloom(self) -> bool: + """Whether bloom filter data exists in the `.dftindex` store.""" + return self._has_bloom + + @property + def has_manifest(self) -> bool: + """Whether manifest data exists in the `.dftindex` store.""" + return self._has_manifest + + def __enter__(self) -> "CheckpointIndexer": + """Enter the runtime context for the with statement.""" return self def __exit__( @@ -227,97 +467,70 @@ def __exit__( exc_val: BaseException | None, exc_tb: TracebackType | None, ) -> None: - """Exit the indexer context manager.""" - return None - - class JSON(_BaseNative): - """Lazy JSON wrapper backed by yyjson. + """Release this Python wrapper on context exit. - Nested objects are exposed as additional :class:`JSON` wrappers - so callers can inspect large trace records without eagerly - converting the entire payload to Python dictionaries. - """ - - def __init__(self, json_str: str) -> None: - """Create a lazy JSON wrapper from a JSON string.""" - self._json_str = json_str - - def get( - self, - key: str, - default: "JSON | JSONPrimitive" = None, - ) -> "JSON | JSONPrimitive": - """Look up a key and return ``default`` when it is absent.""" - return default - - def keys(self) -> list[str]: - """Return the keys in the current JSON object.""" - return [] - - def values(self) -> list["JSON | JSONPrimitive"]: - """Return the values in the current JSON object.""" - return [] + This does not force-close the shared RocksDB instance for the same + ``.dftindex`` path. 
+ """ + return None - def items(self) -> list[tuple[str, "JSON | JSONPrimitive"]]: - """Return key-value pairs in the current JSON object.""" - return [] + class TaskHandle(_BaseNative): + """Handle to a submitted C++ coroutine task.""" - def unwrap(self) -> dict[str, object] | list[object] | JSONPrimitive: - """Convert the lazy wrapper into native Python data.""" - return {} + def get(self) -> object: + """Block until task completes and return result. Raises on error.""" + return None - def copy(self) -> "JSON": - """Return a shallow copy of the current lazy JSON wrapper.""" - return self + def wait(self) -> None: + """Block until task completes. Raises on error.""" + return None - def __contains__(self, key: str) -> bool: - """Return ``True`` when a key exists in the object.""" - return False + def done(self) -> bool: + """Return True if task has completed.""" + return True - def __getitem__(self, key: str) -> "JSON | JSONPrimitive": - """Return a field value or nested :class:`JSON` wrapper.""" - raise KeyError(key) + @property + def name(self) -> str: + """Task name.""" + return "" - def __len__(self) -> int: - """Return the number of items in the current JSON object.""" + @property + def task_id(self) -> int: + """Task identifier.""" return 0 - def __bool__(self) -> bool: - """Return whether the current JSON value is non-empty.""" - return False - - def __str__(self) -> str: - """Return a JSON-like string representation.""" - return "{}" - - def __repr__(self) -> str: - """Return a developer-facing representation.""" - return "JSON('{}')" - class TraceReader(_BaseNative): - """Read DFTracer traces with optional index-assisted pruning. - - ``TraceReader`` chooses between sequential and indexed access - based on the file format and the presence of a shared - root-local ``.dftindex`` store. It exposes line, raw-byte, - JSON, and Arrow-based views over the same trace data. 
- """ + """Smart trace file reader that auto-selects sequential vs indexed reading.""" def __init__( self, - file_path: str, + path: str, index_dir: str = "", checkpoint_size: int = 33554432, auto_build_index: bool = False, - index_threshold: int = 8388608, - runtime: Runtime | None = None, + runtime: Runtime | object | None = None, ) -> None: - """Create a trace reader for plain or compressed DFTracer files.""" - self.file_path = file_path - self.index_dir = index_dir - self.checkpoint_size = checkpoint_size - self.auto_build_index = auto_build_index - self.index_threshold = index_threshold + """Create a TraceReader. + + Args: + path: Path to a trace file (.pfw/.pfw.gz) or a directory. + When a directory is given, all iter/read methods discover + .pfw and .pfw.gz files recursively and process them in + parallel on the Runtime thread pool. + index_dir: Directory to search for ``.dftindex`` stores. + Empty string (default) searches next to the trace file. + checkpoint_size: Checkpoint interval in bytes for index + building (default 32 MB). + auto_build_index: If True, automatically build an index + when none exists. + runtime: Runtime instance for thread pool control. + If None, uses the default global Runtime. + + Raises: + RuntimeError: If *path* does not exist or cannot be opened. + """ + self._path = path + self._index_dir = index_dir def read_lines( self, @@ -327,10 +540,12 @@ def read_lines( end_byte: int = 0, buffer_size: int = 4194304, query: str | None = None, - ) -> list[str]: - """Materialize decoded lines into a Python list. + ) -> list[memoryview]: + """Read lines from the trace file and return as a list. - Supports optional line/byte ranges and query-based filtering. + Lines are 1-indexed. Pass ``start_line=0, end_line=0`` (the + defaults) to read all lines. Out-of-range values are clamped + to the actual file bounds. 
""" return [] @@ -342,67 +557,84 @@ def iter_lines( end_byte: int = 0, buffer_size: int = 4194304, query: str | None = None, - ) -> Iterator[str]: - """Stream decoded lines from the trace. + memory_budget: int = 0, + ) -> Iterator[memoryview]: + """Return a streaming iterator over decoded lines. - The returned iterator yields one UTF-8 decoded line at a time. + The C++ coroutine runs on the Runtime thread pool and pushes + lines into a bounded queue; Python ``__next__`` pops from it. """ return iter(()) - def iter_raw( + def iter_json( self, start_line: int = 0, end_line: int = 0, start_byte: int = 0, end_byte: int = 0, - line_aligned: bool = True, - multi_line: bool = True, buffer_size: int = 4194304, query: str | None = None, - ) -> Iterator[bytes]: - """Stream raw byte chunks from the trace. + batch_size: int = 1024, + memory_budget: int = 0, + ) -> Iterator["JsonDictValue"]: + """Return a streaming iterator over parsed JSON events. - Byte-range reads can be aligned to line boundaries and can - optionally return multi-line chunks. + Each event is parsed once in C++ and yielded as a zero-copy + :class:`JsonDictValue` wrapper. No double-parsing overhead. """ return iter(()) - def read_raw( + def read_json( self, start_line: int = 0, end_line: int = 0, start_byte: int = 0, end_byte: int = 0, - line_aligned: bool = True, - multi_line: bool = True, buffer_size: int = 4194304, query: str | None = None, - ) -> list[bytes]: - """Materialize raw byte chunks into a Python list.""" + batch_size: int = 1024, + ) -> list["JsonDictValue"]: + """Read all events as parsed :class:`JsonDictValue` wrappers (list). + + Equivalent to ``list(iter_json(...))``. 
+ """ return [] - def iter_lines_json( + def iter_raw( self, start_line: int = 0, end_line: int = 0, start_byte: int = 0, end_byte: int = 0, + line_aligned: bool = True, + multi_line: bool = True, buffer_size: int = 4194304, query: str | None = None, - ) -> Iterator["JSON"]: - """Stream lazy :class:`JSON` objects for trace events.""" + memory_budget: int = 0, + ) -> Iterator[memoryview]: + """Return a streaming iterator over raw byte chunks. + + When ``query`` is set and an index exists, chunk-level pruning + skips non-matching chunks. No per-event filtering is applied. + """ return iter(()) - def read_lines_json( + def read_raw( self, start_line: int = 0, end_line: int = 0, start_byte: int = 0, end_byte: int = 0, + line_aligned: bool = True, + multi_line: bool = True, buffer_size: int = 4194304, query: str | None = None, - ) -> list["JSON"]: - """Materialize trace events as lazy :class:`JSON` objects.""" + ) -> list[memoryview]: + """Read raw byte chunks and return as a list. + + When ``query`` is set and an index exists, chunk-level pruning + skips non-matching chunks. No per-event filtering is applied. + """ return [] def iter_arrow( @@ -414,10 +646,45 @@ def iter_arrow( end_byte: int = 0, buffer_size: int = 4194304, query: str | None = None, - ) -> Iterator["ArrowBatch"]: - """Stream Arrow batches parsed from trace events.""" + flatten_objects: bool = False, + normalize: bool = False, + memory_budget: int = 0, + ) -> Iterator["_ArrowBatchCapsule"]: + """Return iterator over Arrow record batches. + + Each batch is an ``_ArrowBatchCapsule`` implementing the Arrow + PyCapsule protocol (``__arrow_c_array__``). Wrap with + :class:`~dftracer.utils.arrow.ArrowBatch` for convenience + methods, or pass directly to ``pyarrow.record_batch()``. 
+ """ return iter(()) + def iter_arrow_stream( + self, + batch_size: int = 10000, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: str | None = None, + flatten_objects: bool = False, + normalize: bool = False, + memory_budget: int = 0, + ) -> "_ArrowBatchStream": + """Return an Arrow C Data Interface stream over record batches. + + PyArrow can drain the producer channel in a single C-side call: + + rbr = pa.RecordBatchReader.from_stream(reader.iter_arrow_stream()) + for batch in rbr: + ... + + Equivalent data to :meth:`iter_arrow`, but without per-batch + Python <-> C transitions. + """ + return _ArrowBatchStream() + def read_arrow( self, batch_size: int = 10000, @@ -427,20 +694,144 @@ def read_arrow( end_byte: int = 0, buffer_size: int = 4194304, query: str | None = None, - ) -> "ArrowTable | None": - """Materialize Arrow batches as a single table-like result.""" + flatten_objects: bool = False, + normalize: bool = False, + ) -> object: + """Read all events as an ArrowTable. + + Equivalent to collecting all batches from :meth:`iter_arrow` + into an :class:`~dftracer.utils.arrow.ArrowTable`. + """ return None def get_max_bytes(self) -> int: - """Return indexed decompressed size when available.""" + """Get the maximum byte position in the decompressed trace. + + Returns the decompressed size for indexed files, file size for + plain text files, or 0 for compressed files without an index. + """ return 0 def get_num_lines(self) -> int: - """Return indexed line count when available.""" + """Get the total number of lines in the trace. + + Returns the line count for indexed files, or 0 for files + without an index (use :attr:`num_lines` property for fallback + counting). 
+ """ return 0 + @property + def path(self) -> str: + """Path to the trace file or directory.""" + return self._path + + @property + def index_dir(self) -> str: + """Directory searched for `.dftindex` stores.""" + return self._index_dir + + @property + def has_index(self) -> bool: + """True if a checkpoint index was found at construction time.""" + return False + + @property + def num_lines(self) -> int: + """Total line count (reads all lines to compute if needed).""" + return 0 + + def write_arrow( + self, + path: str, + views: list[str | dict[str, object]] | None = None, + chunk_size_mb: int = 32, + compression: str = "zstd", + batch_size: int = 10000, + ) -> dict[str, object]: + """Write trace data to Arrow IPC files with optional view-based partitioning. + + Args: + path: Output directory for Arrow IPC files. + views: List of view definitions. Each can be: + - A string: predefined view name ('io', 'compute', 'dlio') + - A dict with 'name' and optional 'query', 'include_metadata' + If None, writes all events to 'all' partition. + chunk_size_mb: Maximum uncompressed size per file in MB. + compression: 'zstd' or 'none'. + batch_size: Events per Arrow batch. + + Returns: + Dict with partitions, total_rows, total_bytes, chunks_scanned, chunks_skipped. + """ + return {} + + def get_view_chunks( + self, + view: str | dict[str, object] | None = None, + ) -> dict[str, object]: + """Get candidate chunks for a view after bloom filter pruning. + + Args: + view: View definition (string or dict with 'name' and optional 'query'). + + Returns: + Dict with chunks list, total_checkpoints, skipped_checkpoints, file_may_match. + """ + return {} + + def write_view_chunk( + self, + output_file: str, + checkpoint_idx: int, + start_byte: int, + end_byte: int, + view: str | dict[str, object] | None = None, + compression: str = "zstd", + batch_size: int = 10000, + ) -> dict[str, object]: + """Write a single chunk to an Arrow IPC file. 
+ + Args: + output_file: Path to output Arrow IPC file. + checkpoint_idx: Checkpoint index. + start_byte: Start byte offset. + end_byte: End byte offset. + view: View definition. + compression: 'zstd' or 'none'. + batch_size: Events per batch. + + Returns: + Dict with output_file, events_matched, rows_written, bytes_written. + """ + return {} + + def write_view_chunks( + self, + chunks: list[dict[str, object]], + output_dir: str, + view: str | dict[str, object] | None = None, + compression: str = "zstd", + batch_size: int = 10000, + ) -> dict[str, object]: + """Write multiple chunks to Arrow IPC files in parallel. + + All chunks are processed concurrently on the Runtime thread pool. + + Args: + chunks: List of dicts with checkpoint_idx, start_byte, end_byte. + output_dir: Directory for output Arrow IPC files. + view: View definition. + compression: 'zstd' or 'none'. + batch_size: Events per batch. + + Returns: + Dict with results list, total_rows, total_events_matched. + """ + return {} + def __enter__(self) -> "TraceReader": - """Enter the trace-reader context manager.""" + """Enter the runtime context for the with statement.""" return self def __exit__( @@ -449,49 +840,165 @@ def __exit__( exc_val: BaseException | None, exc_tb: TracebackType | None, ) -> None: - """Exit the trace-reader context manager.""" + """Exit the runtime context for the with statement.""" return None - class AggregatorUtility(_BaseNative): - """Aggregate trace events into Arrow-ready time buckets.""" + class StatisticsQueryUtility(_BaseNative): + def __init__(self, runtime: Runtime | None = None) -> None: + self.runtime = runtime + + def process( + self, + file_path: str, + query_type: str = "summary", + top_n: int = 10, + index_dir: str = "", + ) -> dict[str, object]: + return {} + + def __call__( + self, + file_path: str, + query_type: str = "summary", + top_n: int = 10, + index_dir: str = "", + ) -> dict[str, object]: + return {} + + class StatisticsAggregatorUtility(_BaseNative): + def 
__init__(self, runtime: Runtime | None = None) -> None: + self.runtime = runtime + + def process( + self, + file_path: str, + index_dir: str = "", + ) -> dict[str, object]: + return {} + + def __call__( + self, + file_path: str, + index_dir: str = "", + ) -> dict[str, object]: + return {} + + class MetadataCollectorUtility(_BaseNative): + def __init__(self, runtime: Runtime | None = None) -> None: + self.runtime = runtime + + def process( + self, + file_path: str, + index_dir: str = "", + ) -> dict[str, object]: + return {} + + def __call__( + self, + file_path: str, + index_dir: str = "", + ) -> dict[str, object]: + return {} + + class ReorganizationPlannerUtility(_BaseNative): + def __init__(self, runtime: Runtime | None = None) -> None: + self.runtime = runtime + + def process( + self, + source_files: list[str], + groups: list[dict[str, str]] | None = None, + index_dir: str = "", + ) -> dict[str, object]: + return {} + + def __call__( + self, + source_files: list[str], + groups: list[dict[str, str]] | None = None, + index_dir: str = "", + ) -> dict[str, object]: + return {} + class ReconstructionPlannerUtility(_BaseNative): def __init__(self, runtime: Runtime | None = None) -> None: - """Create an aggregation utility bound to an optional runtime.""" self.runtime = runtime def process( self, - source_dir: str, - output_path: str = "", + reorganized_files: list[str], + index_dir: str = "", + ) -> dict[str, object]: + return {} + + def __call__( + self, + reorganized_files: list[str], + index_dir: str = "", + ) -> dict[str, object]: + return {} + + class AggregatorUtility(_BaseNative): + def __init__(self, runtime: Runtime | None = None) -> None: + self.runtime = runtime + + def process( + self, + directory: str, time_interval_ms: float = 5000.0, - query: str = "", + group_keys: list[str] | None = None, + categories: list[str] | None = None, + names: list[str] | None = None, + index_dir: str = "", + checkpoint_size: int = 33554432, + force_rebuild: bool = False, + 
chunk_size_mb: int = 64, + batch_size_mb: int = 4, + event_batch_size: int = 10000, + custom_metric_fields: list[str] | None = None, + compute_percentiles: bool = False, + ) -> object: + return None + + def __call__( + self, + directory: str, + time_interval_ms: float = 5000.0, + group_keys: list[str] | None = None, + categories: list[str] | None = None, + names: list[str] | None = None, index_dir: str = "", + checkpoint_size: int = 33554432, force_rebuild: bool = False, + chunk_size_mb: int = 64, + batch_size_mb: int = 4, + event_batch_size: int = 10000, custom_metric_fields: list[str] | None = None, compute_percentiles: bool = False, - ) -> "ArrowTable | None": - """Aggregate trace events into a materialized Arrow-style result.""" + ) -> object: return None def iter_arrow( self, - source_dir: str, - output_path: str = "", + directory: str, time_interval_ms: float = 5000.0, - query: str = "", + group_keys: list[str] | None = None, + categories: list[str] | None = None, + names: list[str] | None = None, index_dir: str = "", + checkpoint_size: int = 33554432, force_rebuild: bool = False, + chunk_size_mb: int = 64, + batch_size_mb: int = 4, + event_batch_size: int = 10000, custom_metric_fields: list[str] | None = None, compute_percentiles: bool = False, - ) -> Iterator["ArrowBatch"]: - """Stream Arrow batches for aggregated trace metrics.""" + ) -> Iterator[object]: return iter(()) class ComparatorUtility(_BaseNative): - """Compare baseline and variant traces.""" - def __init__(self, runtime: Runtime | None = None) -> None: - """Create a comparator utility bound to an optional runtime.""" self.runtime = runtime def compare( @@ -499,12 +1006,31 @@ def compare( baseline: str, variant: str, query: str = "", + group_by: str = "", + format: str = "table", time_interval_ms: float = 5000.0, threshold: float = 0.0, + executor_threads: int = 0, index_dir: str = "", force_rebuild: bool = False, - ) -> "ArrowTable | None": - """Return comparison results as Arrow-compatible 
output.""" + config: str = "", + ) -> object: + return None + + def __call__( + self, + baseline: str, + variant: str, + query: str = "", + group_by: str = "", + format: str = "table", + time_interval_ms: float = 5000.0, + threshold: float = 0.0, + executor_threads: int = 0, + index_dir: str = "", + force_rebuild: bool = False, + config: str = "", + ) -> object: return None def compare_json( @@ -512,12 +1038,15 @@ def compare_json( baseline: str, variant: str, query: str = "", + group_by: str = "", + format: str = "table", time_interval_ms: float = 5000.0, threshold: float = 0.0, + executor_threads: int = 0, index_dir: str = "", force_rebuild: bool = False, + config: str = "", ) -> str: - """Return comparison results as JSON.""" return "{}" def compare_table( @@ -525,115 +1054,96 @@ def compare_table( baseline: str, variant: str, query: str = "", + group_by: str = "", + format: str = "table", time_interval_ms: float = 5000.0, threshold: float = 0.0, + executor_threads: int = 0, index_dir: str = "", force_rebuild: bool = False, + config: str = "", ) -> str: - """Return comparison results as a formatted text table.""" return "" - class StatisticsQueryUtility(_BaseNative): - """Query summary or top-N statistics from a trace.""" + class IndexDatabase(_BaseNative): + """Handle to a .dftindex RocksDB store. - def __init__(self, runtime: Runtime | None = None) -> None: - """Create a statistics-query utility bound to an optional runtime.""" - self.runtime = runtime + Used by the distributed indexer coordinator to pre-register files, + reserve file_id ranges, bulk-ingest worker-produced SSTs, and rebuild + root summaries. 
+ """ - def process( - self, - file_path: str, - query_type: str = "summary", - top_n: int = 10, - index_dir: str = "", - auto_build_index: bool = False, - index_threshold: int = 8388608, - ) -> dict[str, object]: - """Return scalar statistics derived from the trace.""" - return {} + def __init__(self, index_path: str) -> None: + self._index_path = index_path - class StatisticsAggregatorUtility(_BaseNative): - """Aggregate core statistics from a trace into a Python dictionary.""" + def init_schema(self) -> None: + return None - def __init__(self, runtime: Runtime | None = None) -> None: - """Create a statistics-aggregator utility bound to an optional runtime.""" - self.runtime = runtime + def register_files(self, paths: list[str], build_manifest: bool = False) -> list[int]: + """Register each path in the DEFAULT-CF file registry and return + the assigned file_ids (parallel to `paths`). Idempotent for files + with matching hash.""" + return [] - def process( - self, - file_path: str, - index_dir: str = "", - auto_build_index: bool = False, - index_threshold: int = 8388608, - ) -> dict[str, object]: - """Return aggregate trace statistics.""" - return {} + def reserve_file_id_range(self, count: int) -> int: + """Atomically reserve `count` contiguous file_ids; return first.""" + return 0 - class MetadataCollectorUtility(_BaseNative): - """Collect file metadata and index-aware trace metadata.""" + def bulk_ingest( + self, + registry: "SstArtifactRegistry", + skip_cfs: object = None, + ) -> None: + """Ingest all SSTs collected in the registry. + + skip_cfs is an optional iterable of CF names whose SSTs are left + outside the unified DB. Distributed builds pass + {"aggregation", "system_metrics"} to keep per-worker AGG/SYS SSTs + addressable via `agg_manifest.json` for parallel reads at analyze + time. See `dftracer.utils.dask.consolidate_index` to fold them + back into the unified DB later. 
+ """ + return None - def __init__(self, runtime: Runtime | None = None) -> None: - """Create a metadata collector bound to an optional runtime.""" - self.runtime = runtime + def rebuild_root_summaries(self) -> None: + """Recompute ROOT_* summary column families from per-file CFs.""" + return None - def process( - self, - file_path: str, - index_dir: str = "", - checkpoint_size: int = 33554432, - force_rebuild: bool = False, - index_threshold: int = 8388608, - ) -> dict[str, object]: - """Return metadata for a DFTracer trace file.""" - return {} + def write_agg_global_config(self, time_interval_us: int, config_hash: int = 0) -> None: + """Write the aggregation global-config marker into the AGGREGATION CF. - class ReorganizationPlannerUtility(_BaseNative): - """Build a semantic reorganization plan for trace files.""" + Required for `Indexer.iter_arrow_dfanalyzer_all` on distributed + builds (which never materialise the key via worker SSTs) and + post-consolidate indices. + """ + return None - def __init__(self, runtime: Runtime | None = None) -> None: - """Create a reorganization planner bound to an optional runtime.""" - self.runtime = runtime + def write_agg_file_markers(self, file_ids: object) -> None: + """Write per-file aggregation completion markers into the AGGREGATION CF. - def process( - self, - source_files: list[str], - groups: list[dict[str, object]], - index_dir: str = "", - checkpoint_size: int = 33554432, - force_rebuild: bool = False, - index_threshold: int = 8388608, - ) -> dict[str, object]: - """Return a reorganization plan for the requested groups.""" - return {} + Each marker is ``\\xFF\\xFF + file_id_be32``. The index resolver uses + their presence to decide whether each file has aggregated data; if + missing, ``ensure_indexed()`` concludes the aggregation tier is + incomplete and re-runs the entire build. Distributed_index must + call this after ``bulk_ingest`` so subsequent ``read_trace`` calls + do not redundantly re-aggregate. 
+ """ + return None - class ReconstructionPlannerUtility(_BaseNative): - """Build a reconstruction plan from reorganized traces.""" + def write_aggregation_tracker(self, blobs: list[bytes]) -> None: + """Merge serialized AssociationTracker blobs and write the result + to the AGGREGATION CF under the ``__tracker__`` key.""" + return None - def __init__(self, runtime: Runtime | None = None) -> None: - """Create a reconstruction planner bound to an optional runtime.""" - self.runtime = runtime + class SstArtifactRegistry(_BaseNative): + """Thread-safe collector for SST artifact paths produced by workers.""" - def process( - self, - reorganized_files: list[str], - provenance_dir: str = "", - ) -> dict[str, object]: - """Return a reconstruction plan for reorganized trace files.""" - return {} + def __init__(self) -> None: + pass - ext.Indexer = Indexer - ext.IndexerCheckpoint = IndexerCheckpoint - ext.JSON = JSON - ext.Runtime = Runtime - ext.TaskHandle = TaskHandle - ext.TraceReader = TraceReader - ext.AggregatorUtility = AggregatorUtility - ext.ComparatorUtility = ComparatorUtility - ext.MetadataCollectorUtility = MetadataCollectorUtility - ext.ReconstructionPlannerUtility = ReconstructionPlannerUtility - ext.ReorganizationPlannerUtility = ReorganizationPlannerUtility - ext.StatisticsAggregatorUtility = StatisticsAggregatorUtility - ext.StatisticsQueryUtility = StatisticsQueryUtility + def append(self, artifacts_dict: dict[str, str | None]) -> None: + """Add a per-batch Artifacts dict as returned by `build_sst_batch`.""" + return None def get_default_runtime() -> Runtime: """Return the process-wide default runtime.""" @@ -643,41 +1153,176 @@ def set_default_runtime(runtime: Runtime | None = None) -> None: """Replace or clear the process-wide default runtime.""" return None - ext.get_default_runtime = get_default_runtime - ext.set_default_runtime = set_default_runtime - for name in [ - "AggregatorUtility", - "ComparatorUtility", - "Indexer", - "IndexerCheckpoint", - 
"JSON", - "MetadataCollectorUtility", - "ReconstructionPlannerUtility", - "ReorganizationPlannerUtility", - "Runtime", - "StatisticsAggregatorUtility", - "StatisticsQueryUtility", - "TaskHandle", - "TraceReader", - ]: - getattr(ext, name).__module__ = ext_name - ext.__all__ = [ + def read_arrow_files_parallel( + paths: list[str], + runtime: Runtime | None = None, + ) -> dict[str, object]: + """Read multiple Arrow IPC files in parallel using the Runtime. + + Args: + paths: List of file paths to read. + runtime: Optional Runtime object. Uses default if not provided. + + Returns: + dict with: + - file_results: List of per-file results, each with: + - path: File path + - success: True if read succeeded + - error: Error message if failed, else None + - total_rows: Number of rows in file + - batches: List of ArrowBatch objects + - total_rows: Total rows across all files + - total_batches: Total batches across all files + - files_read: Number of files read successfully + - files_failed: Number of files that failed + """ + return {} + + def build_sst_batch( + files: list[str], + file_ids: list[int], + staging_dir: str, + batch_id: str, + index_dir: str = "", + checkpoint_size: int = 33554432, + build_manifest: bool = False, + force_rebuild: bool = False, + bloom_dimensions: list[str] | None = None, + parallelism: int = 0, + flush_every_files: int = 0, + runtime: Runtime | object | None = None, + aggregation_config: object = None, + file_slices: object = None, + ) -> tuple[list[dict[str, str | None]], bytes]: + """Run the indexer pipeline with an SST sink. Returns + `(artifact_dicts, tracker_blob)`. `tracker_blob` is the serialized + merged AssociationTracker for the batch (empty bytes when + `aggregation_config` is None). 
`file_slices` enables intra-file + parallelism; entries are `None` (whole file) or + `(member_begin, member_end, checkpoint_idx_base, + skip_file_scoped_writes, members)`.""" + return ([], b"") + + def plan_lpt_partition( + entries: list[tuple[str, int]], num_workers: int + ) -> list[list[tuple[str, int]]]: + """Greedy LPT bin-packing of (path, size) tuples into num_workers + buckets, minimising the maximum per-worker total size.""" + return [] + + def scan_files( + directory: str, + patterns: list[str] | None = None, + recursive: bool = False, + runtime: Runtime | object | None = None, + ) -> list[tuple[str, int]]: + """Parallel directory scan returning (path, size) tuples for regular + files matching the patterns.""" + return [] + + def enable_aggregation_deterministic_ids() -> None: + """Flip the global aggregation StringIntern into deterministic-id mode + so the same string maps to the same 32-bit id in every worker process.""" + return None + + def move_artifacts( + artifacts: dict[str, str | None], dest_dir: str + ) -> dict[str, str | None]: + """Move every populated SST in `artifacts` into `dest_dir` via the + C++ rename/copy helper, returning a fresh dict with the new paths.""" + return {} + + def enumerate_gzip_members( + files: list[str], + runtime: Runtime | object | None = None, + ) -> list[list[tuple[int, int]]]: + """Cooperative async scan of gzip member offsets. Returns lists of + `(c_offset, c_size)` parallel to `files`; empty for non-gzip files.""" + return [] + + def plan_work_units( + member_map: list[list[tuple[int, int]]], + num_workers: int, + target_c_size: int = 0, + ) -> list[list[tuple[int, int, int, int]]]: + """Deterministic LPT assignment of intra-file gzip-member slices across + workers. 
Returns per-worker lists of + `(file_idx, member_begin, member_end, c_size)`.""" + return [] + + def scan_aggregation_manifest( + agg_ssts: list[str], + sys_ssts: list[str], + scratch_dir: str, + meta_index_path: str, + batch_size: int = 10000, + time_granularity: float = 1.0, + time_resolution: float = 1e6, + query: str | None = None, + group_by: list[str] | None = None, + shard_begin: int = 0, + shard_end: int = 4096, + runtime: Runtime | object | None = None, + file_hashes: dict[str, str] | None = None, + host_hashes: dict[str, str] | None = None, + ) -> dict[str, list["_ArrowBatchCapsule"]]: + """Scan a worker's slice of the distributed aggregation manifest. + + Ingests `agg_ssts` + `sys_ssts` into a scratch IndexDatabase at + `scratch_dir` (caller owns the directory lifecycle) and runs the + dfanalyzer aggregation scan over `[shard_begin, shard_end)`. + `meta_index_path` is the unified .dftindex used to resolve file / + host hashes. + + Returns the same dict shape as `Indexer.iter_arrow_dfanalyzer_all`: + `{"events": [...], "profiles": [...], "system": [...]}`. 
+ """ + return {"events": [], "profiles": [], "system": []} + + _class_symbols = [ + "_ArrowBatchCapsule", + "_ArrowBatchStream", "AggregatorUtility", + "CheckpointIndexer", "ComparatorUtility", + "IndexDatabase", "Indexer", "IndexerCheckpoint", - "JSON", + "JsonDictValue", "MetadataCollectorUtility", "ReconstructionPlannerUtility", "ReorganizationPlannerUtility", "Runtime", + "SstArtifactRegistry", "StatisticsAggregatorUtility", "StatisticsQueryUtility", "TaskHandle", "TraceReader", + ] + _function_symbols = [ + "build_sst_batch", + "enable_aggregation_deterministic_ids", + "enumerate_gzip_members", "get_default_runtime", + "move_artifacts", + "plan_lpt_partition", + "plan_work_units", + "read_arrow_files_parallel", + "scan_aggregation_manifest", + "scan_files", "set_default_runtime", ] + + _local = locals() + for _name in _class_symbols + _function_symbols: + setattr(ext, _name, _local[_name]) + + for _name in _class_symbols: + getattr(ext, _name).__module__ = ext_name + for _name in _function_symbols: + getattr(ext, _name).__module__ = ext_name + + ext.__all__ = sorted(_class_symbols + _function_symbols) sys.modules[ext_name] = ext diff --git a/docs/source/cpp_api/arrow.rst b/docs/source/cpp_api/arrow.rst index 5e9d946c..f09ae7e6 100644 --- a/docs/source/cpp_api/arrow.rst +++ b/docs/source/cpp_api/arrow.rst @@ -25,11 +25,20 @@ Guarded by ``DFTRACER_UTILS_ENABLE_ARROW`` (ON by default). subgraph Write["File Output"] IPC["IpcWriter"] + PW["PartitionWriter"] + PR["PartitionRouter"] + end + + subgraph Read["File Input"] + IRD["IpcReader"] end RBB -->|"finish()"| AER AER -->|"write_batch()"| IPC + AER -->|"route()"| PR + PR -->|"per-partition"| PW AER -->|"PyCapsule"| Python["Python ArrowBatch"] + IRD -->|"read_batch()"| AER RecordBatchBuilder ------------------ @@ -42,6 +51,12 @@ Type-safe columnar builder with two modes: ``end_row()`` backfills nulls for missing columns. Best for ``TraceReader.iter_arrow()`` with arbitrary JSON. 
+Once the first row has been finalized the schema is **locked**: subsequent +rows may only append values into the already-discovered columns, and +attempts to add new columns after the lock are rejected. This makes +batches produced by the dynamic path safe to concatenate across a +``TraceReader::read_arrow()`` stream without re-keying. + String columns store ``string_view`` into source data for zero-copy during build; bulk copy only at ``finish()``. Caller must keep source data alive until ``finish()`` returns. @@ -57,10 +72,38 @@ IpcWriter --------- Streaming Arrow IPC file writer. Writes ``.arrows`` files that can be -read by pyarrow, polars, DuckDB, and any Arrow-compatible tool. +read by pyarrow, polars, DuckDB, and any Arrow-compatible tool. Supports +buffer-level compression: when built with ``DFTRACER_UTILS_ENABLE_ZSTD``, +``IpcCompression::ZSTD`` is available and used by default for new files, +producing pyarrow-compatible compressed IPC streams. + +Guarded by ``DFTRACER_UTILS_ENABLE_ARROW_IPC``. + +IpcReader +--------- + +Streaming Arrow IPC file reader. Mirrors ``IpcWriter`` and yields one +``ArrowExportResult`` per record batch in the file. Supports buffer-level +ZSTD decompression compatible with pyarrow / polars / DuckDB outputs. Guarded by ``DFTRACER_UTILS_ENABLE_ARROW_IPC``. +PartitionWriter +--------------- + +Single-partition Arrow IPC sink with ``PartitionWriteStats`` tracking +(bytes, rows, batches). Used as the per-partition output of +``PartitionRouter`` and individually as a thin wrapper around +``IpcWriter`` when only one output stream is needed. + +PartitionRouter +--------------- + +Multi-partition Arrow router. Takes an inbound ``ArrowExportResult`` plus +a ``PartitionConfig`` (partition key columns, output template, target +batch rows) and dispatches rows into one ``PartitionWriter`` per +partition value. Aggregates ``RouterWriteStats`` across all partitions. 
+ Usage Example ------------- diff --git a/docs/source/cpp_api/coro.rst b/docs/source/cpp_api/coro.rst index 8156f59a..ae405bdd 100644 --- a/docs/source/cpp_api/coro.rst +++ b/docs/source/cpp_api/coro.rst @@ -11,6 +11,16 @@ C++20 coroutine primitives for asynchronous task execution. All classes are in t For usage examples and task scheduling, see :doc:`/pipeline` and :doc:`pipeline/tasks`. +.. note:: + + GCC 12 may corrupt large coroutine frames at ``-O2`` and above, especially + when frames contain references, ``string_view``, or captured lambdas. The + project mitigates this by heap-allocating per-task state in a + ``shared_ptr`` (or ``unique_ptr``) and capturing only the smart pointer in + coroutine lambdas, instead of capturing complex state by value. New + coroutines should follow the same pattern; see ``coroutine-caveats.md`` at + the repo root for the full discussion. + .. mermaid:: graph TD diff --git a/docs/source/cpp_api/dft_aggregators.rst b/docs/source/cpp_api/dft_aggregators.rst index d2dcc18e..ac1d8cc5 100644 --- a/docs/source/cpp_api/dft_aggregators.rst +++ b/docs/source/cpp_api/dft_aggregators.rst @@ -53,7 +53,7 @@ boundary event association, and Perfetto trace output. end subgraph Merge["Merge & Resolve"] - EA["EventAggregatorUtility"] + EA["EventAggregator"] AR["AssociationResolverUtility"] end @@ -180,13 +180,33 @@ predicates for early chunk skipping when available. Tagged ``Parallelizable`` — multiple instances run concurrently across chunks. -EventAggregatorUtility -~~~~~~~~~~~~~~~~~~~~~~ +EventAggregator +~~~~~~~~~~~~~~~ + +Unified event aggregator (formerly ``EventAggregatorUtility`` and the +internal ``RocksDbAggregator``, now merged into one class). Holds a +``RocksDatabase`` handle and merges per-chunk aggregation results into a +unified output, deduplicating file counts and collecting association +trackers for downstream resolution. 
+ +AggregationVisitor +~~~~~~~~~~~~~~~~~~ + +``DftEventVisitor`` subclass that accumulates ``AggregationMetrics`` per +``AggregationKey`` directly from parsed events during a scan, so the +aggregation pass can share a single parse with bloom and manifest +visitors via ``DftEventDispatcher``. Defined in +``dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h``. -Merges per-chunk aggregation results into a unified output. +DftEventDispatcher +~~~~~~~~~~~~~~~~~~ -Combines metrics from all chunks, deduplicates file counts, and -collects association trackers for downstream resolution. +Fan-out adapter that implements the ``IndexVisitor`` interface, parses +each line once, and dispatches the parsed ``DftEvent`` to a list of +registered ``DftEventVisitor`` instances (``BloomVisitor``, +``ManifestVisitor``, ``AggregationVisitor``, ...). This collapses +multiple visitor passes into a single read of the input. Defined in +``dftracer/utils/utilities/composites/dft/dft_event_dispatcher.h``. Association Tracking -------------------- diff --git a/docs/source/cpp_api/indexer.rst b/docs/source/cpp_api/indexer.rst index 407a55f6..035dcccc 100644 --- a/docs/source/cpp_api/indexer.rst +++ b/docs/source/cpp_api/indexer.rst @@ -143,17 +143,50 @@ calls visitors in order: 3. ``on_line(line, checkpoint_idx)`` -- called for every line in the file 4. ``finalize(db, file_id)`` -- called once after the scan to persist results +Indexer / CheckpointIndexer +--------------------------- + +The low-level checkpoint indexer is exposed as ``Indexer`` (formerly named +``BatchIndexer``); the previous ``Indexer`` class is now ``CheckpointIndexer`` +in the internal namespace. ``SingleFileIndexer`` has been removed; use +``IndexBuilderUtility`` or ``IndexBatchBuilderUtility`` instead. 
+ +IndexBatchBuilderUtility +------------------------ + +Batched variant of ``IndexBuilderUtility`` that processes a list of files in +parallel against a shared ``IndexDatabaseWriterContext``, yielding an +``IndexBuildBatchResult`` with aggregated metrics. Configured via +``IndexBuildBatchConfig`` (file list, parallelism, checkpoint size, bloom and +manifest toggles, shared sink). + +IndexBuildBatchConfig +~~~~~~~~~~~~~~~~~~~~~ + +Configuration struct for ``IndexBatchBuilderUtility``: file slices, output +directory, checkpoint size, bloom/manifest flags, and the shared +``IndexBatchSink`` (typically an ``IndexDatabaseWriterContext``) that +receives encoded batches from all workers. + +IndexDatabaseWriterContext +-------------------------- + +Implements ``IndexBatchSink`` and owns a thread-safe writer pipeline into a +RocksDB-backed ``IndexDatabase``. Workers in ``IndexBatchBuilderUtility`` +submit encoded index batches to this context, which serializes them into +checkpoint, bloom, manifest, and statistics column families. + BloomVisitor ------------ -Implements ``IndexVisitor`` to build per-chunk bloom filters and statistics -during the indexing scan. Each checkpoint chunk gets its own set of bloom -filters (one per configured dimension) plus per-chunk event counts and -timestamp/duration distributions. +Implements ``DftEventVisitor`` to build per-chunk bloom filters and +statistics during the indexing scan. Each checkpoint chunk gets its own set +of bloom filters (one per configured dimension) plus per-chunk event counts +and timestamp/duration distributions. .. code-block:: cpp - #include + #include BloomVisitor visitor(bloom_config, {"name", "cat", "pid"}); visitor.begin(num_checkpoints); @@ -168,14 +201,15 @@ timestamp/duration distributions. ManifestVisitor --------------- -Implements ``IndexVisitor`` to build per-checkpoint event routing manifests. -During the scan, it collects which lines belong to which ``(cat, name)`` event -pair within each checkpoint. 
The resulting manifests enable the reorganization -pipeline to selectively read only the lines needed for a given event group. +Implements ``DftEventVisitor`` to build per-checkpoint event routing +manifests. During the scan, it collects which lines belong to which +``(cat, name)`` event pair within each checkpoint. The resulting manifests +enable the reorganization pipeline to selectively read only the lines needed +for a given event group. .. code-block:: cpp - #include + #include ManifestVisitor visitor; visitor.begin(num_checkpoints); @@ -185,6 +219,14 @@ pipeline to selectively read only the lines needed for a given event group. // Later, query the manifest: auto ranges = db.query_event_ranges_for_checkpoint(file_id, checkpoint_idx); +IndexResolverUtility +-------------------- + +Resolves a directory or file list into a set of ``FileWorkItem`` entries by +opening or building per-file indexes and emitting line-range work items +suitable for parallel scan / aggregation / replay pipelines. Defined in +``dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h``. + ProvenanceDatabase ------------------ diff --git a/docs/source/cpp_api/io.rst b/docs/source/cpp_api/io.rst index 1abd2c0b..feced868 100644 --- a/docs/source/cpp_api/io.rst +++ b/docs/source/cpp_api/io.rst @@ -198,6 +198,39 @@ Example return 0; } +Parallel File Writers +--------------------- + +The ``dftracer/utils/utilities/fileio/parallel/`` module provides +high-throughput multi-stream file writers used by the reorganization and +aggregation pipelines. The unified ``ParallelWriter`` class implements +three on-disk layouts (selected via ``FileLayout`` in +``parallel/layout.h``): + +- **Striped** -- one output file split into Lustre-friendly stripes, + each fed by an independent producer coroutine. +- **Padded striped** -- striped layout with per-stripe alignment padding + for filesystems that prefer aligned writes. 
+- **Sharded** -- one output file per shard, used when downstream + consumers want independent shards rather than a single concatenated + file. + +Sizing is Lustre-aware: ``LayoutInfo`` and ``WriterSizing`` derive stripe +size and per-stripe buffer counts from the detected ``FilesystemKind`` +(Lustre vs generic POSIX). Internally, writes are coalesced via +``coro::Channel``-based queues so that producer coroutines can submit +small line-sized payloads without per-write ``write()`` syscalls. + +.. code-block:: cpp + + #include + #include + + WriterConfig cfg; + cfg.layout = FileLayout::STRIPED; + cfg.output_path = "merged.pfw"; + ParallelWriter writer(cfg); + Sync Fallback Behavior ~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/cpp_api/pipeline/executors.rst b/docs/source/cpp_api/pipeline/executors.rst index 12f66dd0..1d54b70b 100644 --- a/docs/source/cpp_api/pipeline/executors.rst +++ b/docs/source/cpp_api/pipeline/executors.rst @@ -101,7 +101,6 @@ timeout thresholds. All fields have sensible defaults. std::size_t io_pool_size = 4; io::IoBackendType io_backend_type = io::IoBackendType::AUTO; unsigned io_batch_threshold = 16; - std::size_t db_pool_size = 2; }; **Key fields:** @@ -116,7 +115,6 @@ timeout thresholds. All fields have sensible defaults. (``AUTO``, ``IO_URING``, ``THREAD_POOL``). - ``io_batch_threshold`` -- Minimum number of I/O operations to batch before submitting to the backend. -- ``db_pool_size`` -- Number of threads in the dedicated database work pool. **Example -- high-throughput configuration:** @@ -127,7 +125,6 @@ timeout thresholds. All fields have sensible defaults. .io_pool_size = 8, .io_backend_type = io::IoBackendType::IO_URING, .io_batch_threshold = 32, - .db_pool_size = 4 }; Progress Tracking diff --git a/docs/source/cpp_api/reader.rst b/docs/source/cpp_api/reader.rst index dee4669f..b51a562c 100644 --- a/docs/source/cpp_api/reader.rst +++ b/docs/source/cpp_api/reader.rst @@ -22,7 +22,25 @@ decompression. 
The reader also supports query-based event filtering: when a query string is provided and an index exists, non-matching chunks are pruned entirely, and -per-event filtering is applied to the remaining chunks. +per-event filtering is applied to the remaining chunks. Conjunctions of +equality predicates (``cat == 'io' AND name == 'read'``) are compiled into +a vectorized predicate evaluator that runs against the index bloom dimensions +before any line is decompressed. + +``TraceReader`` also accepts a **directory** as ``file_path``: when given a +directory, it enumerates trace files inside it, opens one indexed reader per +file, and yields lines / Arrow batches in file order. Batch chunk pruning is +delegated to ``ChunkPrunerUtility``, which evaluates the compiled query +against all candidate chunks in one pass and feeds the resulting line-range +work items back to the per-file readers. + +When ``DFTRACER_UTILS_ENABLE_ARROW`` is set, ``TraceReader::read_arrow()`` +exports record batches via the Arrow C Data Interface +(``ArrowExportResult``), which can be sent directly across the FFI boundary +to Python / DuckDB / Polars without a copy. The ``ReadConfig::flatten_objects`` +flag expands one level of nested JSON objects (e.g. ``args``) into +``parent.child`` columns with native Arrow types instead of serializing them +as JSON strings. Getting Started --------------- @@ -120,6 +138,13 @@ filtering. 
All fields have sensible defaults; pass a default-constructed - ``multi_line`` -- allow multiple lines per raw chunk (default true) - ``buffer_size`` -- internal read buffer size (default 4 MB) - ``query`` -- query DSL string for event filtering (empty = no filter) +- ``chunk_prune_only`` -- when true, the query is used only for chunk-level + pruning via the index; per-line filtering is skipped (caller handles it) +- ``skip_pruning`` -- skip the reader's own chunk pruner pass; the caller's + ``start_line``/``end_line`` window is trusted (used by the checkpoint-level + work-item dispatcher to avoid re-running ``ChunkPrunerUtility`` per item) +- ``flatten_objects`` -- expand one level of nested JSON objects into + ``parent.child`` columns with native Arrow types in ``read_arrow()`` Helper methods: ``has_line_range()`` and ``has_byte_range()`` test whether non-default range bounds have been set. @@ -135,7 +160,9 @@ indexed) based on whether an index exists and what range the caller requests. **Async generators:** - ``read_lines(config)`` -- yields ``Line`` structs (``content`` + ``line_number``) with optional query filtering and chunk pruning +- ``read_json(config)`` -- yields ``JsonLine`` records (parsed once with simdjson) for callers that would otherwise re-parse each line - ``read_raw(config)`` -- yields ``std::span`` byte chunks +- ``read_arrow(config, batch_size)`` -- yields ``ArrowExportResult`` record batches via the Arrow C Data Interface (requires ``DFTRACER_UTILS_ENABLE_ARROW``) **Metadata queries:** diff --git a/docs/source/cpp_api/rocksdb.rst b/docs/source/cpp_api/rocksdb.rst index 76a6f77a..35695841 100644 --- a/docs/source/cpp_api/rocksdb.rst +++ b/docs/source/cpp_api/rocksdb.rst @@ -8,10 +8,12 @@ RocksDB migration. 
It includes: - database wrappers and lifecycle management -- async awaitables for database work on executor-backed threads +- column-family and merge-operator registration for the ``.dftindex`` schema - key encoding helpers for typed prefix/range scans - manager utilities for sharing open database handles across readers, indexers, and higher-level composites +- bulk-ingest helpers (``SstFileWriter`` + ``IngestExternalFile``) used by + the distributed indexing pipeline Architecture ------------ @@ -23,10 +25,9 @@ Architecture Indexers["Indexer / provenance writers"] --> Manager Manager --> Database["RocksDatabase"] Database --> CFs["Column families"] - Database --> Async["DbAwaitable / rocks::run"] Database --> Codec["KeyCodec"] + Database --> Merge["MergeOperators (AGGREGATION, SYSTEM_METRICS)"] CFs --> Store[".dftindex / provenance store"] - Async --> Runtime["Executor-backed threads"] Codec --> Store See also: diff --git a/docs/source/developers.rst b/docs/source/developers.rst index 52583fb1..79c628be 100644 --- a/docs/source/developers.rst +++ b/docs/source/developers.rst @@ -331,27 +331,33 @@ For hot loops, reuse a single ``HasherUtility`` instance with ``reset()``: Anti-Patterns to Avoid ~~~~~~~~~~~~~~~~~~~~~~ -**Storing JsonValue beyond yyjson_doc lifetime** +**Storing JsonValue / simdjson views beyond the parser's lifetime** -``JsonValue`` is a non-owning view into a ``yyjson_doc``. Never store it across the document's lifetime. +``JsonValue`` (and the underlying ``simdjson::ondemand::value`` / +``simdjson::dom::element``) is a non-owning view into the parser's buffer. +Never store it across the parser's or the input buffer's lifetime. .. 
code-block:: cpp - // WRONG: doc destroyed, but view stored + #include + + // WRONG: parser/buffer destroyed, but view stored JsonValue stored_value; { - yyjson_doc* doc = yyjson_read_file("config.json", NULL); - stored_value = yyjson_get_obj(doc); - yyjson_doc_free(doc); + simdjson::ondemand::parser parser; + auto padded = simdjson::padded_string::load("config.json"); + auto doc = parser.iterate(padded); + stored_value = doc.find_field("root").value(); } - // stored_value now points to freed memory! - - // CORRECT: copy data before doc destruction + // stored_value now points into freed parser/buffer memory! + + // CORRECT: copy the data out before the parser goes out of scope { - yyjson_doc* doc = yyjson_read_file("config.json", NULL); - auto data = serialize_json_value(yyjson_get_obj(doc)); - yyjson_doc_free(doc); - // data is now safe + simdjson::ondemand::parser parser; + auto padded = simdjson::padded_string::load("config.json"); + auto doc = parser.iterate(padded); + auto data = serialize_json_value(doc.find_field("root").value()); + // data owns its copy; safe to use after the parser is destroyed } **Instantiating IOExecutor directly** diff --git a/docs/source/installation.rst b/docs/source/installation.rst index d3637b6d..609ae883 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -81,6 +81,51 @@ To install to a custom location: make make install +Build Options +~~~~~~~~~~~~~ + +The following CMake options control optional features and dependencies. All +options default to ``ON`` unless noted otherwise: + +- ``DFTRACER_UTILS_TESTS`` (default ``OFF``) - Build the test suite. +- ``DFTRACER_UTILS_COVERAGE`` (default ``OFF``) - Enable coverage reporting. +- ``DFTRACER_UTILS_DEBUG`` (default ``OFF``) - Enable debug mode with verbose + logging. +- ``DFTRACER_UTILS_BUILD_SHARED`` (default ``ON``) - Build the shared library. +- ``DFTRACER_UTILS_BUILD_STATIC`` (default ``ON``) - Build the static library. 
+- ``DFTRACER_UTILS_BUILD_BINARIES`` (default ``ON``) - Build command-line + binaries. +- ``DFTRACER_UTILS_BUILD_PYTHON`` (default ``OFF``) - Build Python bindings. +- ``DFTRACER_UTILS_ENABLE_PCH`` (default ``ON``) - Enable precompiled + headers. +- ``DFTRACER_UTILS_ENABLE_ASAN`` / ``_UBSAN`` / ``_TSAN`` (default ``OFF``) - + Address / undefined-behavior / thread sanitizers. +- ``DFTRACER_UTILS_ENABLE_MPI`` (default ``OFF``) - Enable MPI support; + required to build ``dftracer_aggregator_mpi`` and + ``dftracer_call_tree_mpi``. +- ``DFTRACER_USE_ZLIB_NG`` (default ``ON``) - Use ``zlib-ng`` (compat ABI) + for faster compression and decompression. Falls back to ``madler/zlib`` + if zlib-ng fetch or build fails. +- ``DFTRACER_UTILS_ENABLE_ARROW`` (default ``ON``) - Enable the Arrow C Data + Interface via nanoarrow (required for Python Arrow output). +- ``DFTRACER_UTILS_ENABLE_ARROW_IPC`` (default ``ON``) - Enable Arrow IPC + file read/write via nanoarrow. Required for ``dftracer_aggregator + --format arrow`` output and for the ``save_arrow`` / ``load_arrow`` + call-tree serialization paths. +- ``DFTRACER_UTILS_ENABLE_ZSTD`` (default ``ON``) - Enable ZSTD compression + for RocksDB SST blocks. +- ``DFTRACER_UTILS_ENABLE_LZ4`` (default ``OFF``) - Enable LZ4 compression + for RocksDB SST blocks. + +Example: + +.. code-block:: bash + + cmake .. \ + -DDFTRACER_UTILS_ENABLE_MPI=ON \ + -DDFTRACER_UTILS_ENABLE_ARROW_IPC=ON \ + -DDFTRACER_USE_ZLIB_NG=ON + Verifying Installation ---------------------- @@ -97,23 +142,21 @@ To verify your Python installation: C++ ~~~ -To verify your C++ installation, try compiling a simple example: +To verify your C++ installation, try compiling a simple example that +opens a trace through the public ``TraceReader`` API: .. 
code-block:: cpp - #include + #include #include int main() { - // Create an indexer to verify installation - auto indexer = dftracer::utils::IndexerFactory::create( - "test.pfw.gz", - "test.pfw.gz.idx", - false // Don't force rebuild - ); + using dftracer::utils::utilities::reader::TraceReader; + TraceReader reader("test.pfw.gz"); std::cout << "Library installed successfully!" << std::endl; - std::cout << "Archive format: " << indexer->get_format_name() << std::endl; + std::cout << "Has index: " << std::boolalpha + << reader.has_index() << std::endl; return 0; } diff --git a/docs/source/pipeline.rst b/docs/source/pipeline.rst index 43cbc422..05faad8a 100644 --- a/docs/source/pipeline.rst +++ b/docs/source/pipeline.rst @@ -1016,6 +1016,91 @@ The project has migrated from the old ``TaskContext``/``TaskScope`` API to the n - ``PipelineConfig`` configures the executor (threads, timeouts, watchdog) - ``Pipeline::execute()`` blocks until all work completes +Pipelined Replay +---------------- + +``dftracer_replay`` was refactored onto the same coroutine + channel model +documented above. The replay engine now expresses parsing, decoding, and +execution as three stages connected by bounded channels, eliminating the +old synchronous pre-load step. Three end-to-end improvements landed +together: + +- ``JsonParser`` is shared between the parse and execute stages; the + trace JSON is decoded incrementally instead of being slurped into a + ``std::vector`` before execution starts. +- Buffer reuse and zero-copy string handling are wired through the I/O + read path, removing per-line allocations in the hot loop. +- Stages communicate via ``Channel`` instances with backpressure, + so a slow execute stage no longer forces the parse stage to materialize + the entire trace. + +The replay binary is otherwise unchanged from a CLI perspective; see the +``dftracer_replay`` section in :doc:`cli` for flag documentation. 
+ +Memory Budget Control for Streaming Iterators +--------------------------------------------- + +The ``MemoryBudget`` helpers in +``dftracer/utils/core/common/memory_budget.h`` give utilities a single +place to size streaming channels and per-file batch counts based on +available system memory: + +.. code-block:: cpp + + #include + + using namespace dftracer::utils; + + // 50% of available RAM by default; clamped to >= 64 MiB + const std::size_t budget = compute_memory_budget(); + + // Or honor a user override (in bytes); 0 falls back to auto-detect + const std::size_t budget_user = + compute_memory_budget(/*user_override_bytes=*/4ULL << 30); + + // Per-file expansion factor + sample probing yields a per-file peak + const std::size_t per_file = + estimate_per_file_bytes(file_sizes_in_bytes); + + // Derive channel capacity and per-flush batch size + const std::size_t cap = + compute_channel_capacity(budget, estimated_batch_bytes, num_workers); + const std::size_t batch = + compute_file_batch_size(budget, per_file, /*min_files=*/4); + +The Python ``TraceReader`` exposes the same control as a ``memory_budget`` +keyword on its streaming iterators (``iter_lines``, ``iter_lines_json``, +``iter_raw``, ``iter_arrow``). Passing ``0`` keeps the auto-detect default; +passing a positive integer caps the in-flight bytes across the underlying +``Channel`` instances. + +``flush_every_files`` for Batched Index Writes +---------------------------------------------- + +``dftracer_organize`` exposes the underlying batched-index control via +``--memory-budget-mb``: the binary derives a ``flush_every_files`` value +from the budget and feeds it to ``IndexBuildBatchConfig``. Each batch of +``flush_every_files`` files is fully indexed and flushed before the next +batch begins, capping peak memory regardless of trace count. + +When constructing an ``IndexBuildBatchConfig`` directly from C++: + +.. 
code-block:: cpp + + auto batch_config = std::make_shared(); + batch_config->file_paths = files; + batch_config->index_dir = index_dir; + batch_config->checkpoint_size = checkpoint_size; + batch_config->parallelism = executor_threads; + batch_config->flush_every_files = compute_file_batch_size( + compute_memory_budget(), + estimate_per_file_bytes(file_sizes), + /*min_files=*/4); + +A ``flush_every_files`` of ``0`` (the default) disables sub-batching and +processes every file in one shot, which is fastest for small inputs but +not memory-safe at scale. + API Reference ------------- diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index 11cbee48..99b3e5e5 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -18,6 +18,10 @@ The most common use case is reading trace files: # Open a compressed trace file (auto-detects index sidecar) reader = TraceReader("trace.pfw.gz") + # ...or pass a directory; TraceReader scans for .pfw / .pfw.gz files + # and streams them transparently as a single logical input. + reader = TraceReader("./traces") + # Read all lines lines = reader.read_lines() for line in lines: diff --git a/docs/source/server.rst b/docs/source/server.rst index ccd82385..947df27a 100644 --- a/docs/source/server.rst +++ b/docs/source/server.rst @@ -410,6 +410,17 @@ The server uses coroutine-based concurrency to handle multiple simultaneous requ Event filtering streams through bloom indexes and partial reads, minimizing memory usage. Both ``/api/v1/events`` and ``/api/v1/events/stream`` use chunked transfer encoding with iovec scatter-gather I/O, streaming NDJSON results without buffering the full response in memory. +**Client Receive Timeouts:** + +The streaming endpoints (``/api/v1/events``, ``/api/v1/events/stream``, +``/api/v1/viz/events``) use HTTP/1.1 chunked transfer encoding and can hold +a connection open while the server is still scanning chunks before any +bytes are emitted. 
Clients should set a receive timeout of at least +**15 seconds** (the timeout used by the bundled integration tests, raised +from 2 s in earlier builds) to accommodate the worst-case index-warmup +path; the server itself does not impose a global request timeout +(``with_global_timeout(0)``). + **Query Optimization:** - Use narrow time ranges in ``/api/v1/viz/events`` queries diff --git a/docs/source/utilities/common.rst b/docs/source/utilities/common.rst index 7c71e953..3b571b29 100644 --- a/docs/source/utilities/common.rst +++ b/docs/source/utilities/common.rst @@ -7,26 +7,26 @@ statistics collection, and Arrow data interchange. JSON ---- -Lightweight zero-cost wrapper around `yyjson `_ for lazy JSON evaluation. +JSON parsing uses `simdjson `_ exclusively (DOM and On-Demand APIs). ``JsonValue`` is a lightweight wrapper around ``simdjson::dom::element``; ``JsonParser`` exposes the On-Demand API for zero-copy lazy field access. .. code-block:: cpp #include - #include + #include + #include JsonValue ~~~~~~~~~ -Non-owning view over parsed JSON data with fluent navigation and type-safe accessors. +Wrapper over ``simdjson::dom::element`` with fluent navigation and type-safe accessors. Non-owning: only valid while the backing ``simdjson::dom::document`` is alive. **Parse and navigate:** .. code-block:: cpp - yyjson_doc* doc = yyjson_read(json_str.c_str(), json_str.size(), 0); - JsonDocGuard guard(doc); // RAII cleanup - - JsonValue root(yyjson_doc_get_root(doc)); + simdjson::dom::parser parser; + simdjson::dom::element doc = parser.parse(json_str); + JsonValue root(doc); // Fluent navigation with defaults std::string name = root["metadata"]["name"].get("unknown"); @@ -61,24 +61,43 @@ Non-owning view over parsed JSON data with fluent navigation and type-safe acces if (val.is_array()) { /* ... */ } if (val.exists()) { /* not null */ } -.. warning:: - - ``JsonValue`` is a non-owning view. It is only valid while the ``yyjson_doc`` is alive. 
Use ``JsonDocGuard`` for RAII lifetime management. - JsonDocGuard ~~~~~~~~~~~~ -RAII guard for ``yyjson_doc*`` to prevent leaks on exceptions or early coroutine returns. +RAII helper that owns a ``simdjson::dom::parser``; ``parse(data, len)`` reuses +the parser buffer and ``root()`` returns the parsed element. Use across +short-lived parse sites; ``StringJsonParserUtility`` is preferred when the +document must outlive a ``co_await`` boundary. .. code-block:: cpp - { - yyjson_doc* doc = yyjson_read(data, len, 0); - JsonDocGuard guard(doc); - - JsonValue root(yyjson_doc_get_root(doc)); + JsonDocGuard guard; + if (guard.parse(data, len)) { + JsonValue root(guard.root()); // ... use root ... - } // guard destructor frees doc + } + +JsonParser (On-Demand) +~~~~~~~~~~~~~~~~~~~~~~ + +On-Demand parser for zero-copy lazy field access. Reuses an internal padded +buffer across rows; ``string_view`` results are valid until the next +``parse()`` call. Used by the indexing visitors and ``TraceReader::read_json``. + +.. code-block:: cpp + + JsonParser parser; + + for (auto& line : input_lines) { + if (!parser.parse(line)) continue; + auto name = parser.get_string("name"); + auto ts = parser.get_int64("ts"); + + parser.for_each_field("args", [](std::string_view k, + simdjson::ondemand::value v) { + // process nested fields + }); + } StringJsonParserUtility ~~~~~~~~~~~~~~~~~~~~~~~ @@ -100,6 +119,44 @@ Parses JSON strings with owned document lifetime. Safe for use across ``co_await parser.reset(); // Cleanup +ArgsMap and ArgsValueProxy +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Owned key/value map used for trace event ``args``. Replaces ``JsonValue`` for +event-args storage in the DFT composites: keys are interned with the global +:cpp:class:`dftracer::utils::StringIntern` and values are a typed +``std::variant`` (string, int64, uint64, double, bool). ``ArgsValueProxy`` +mirrors the ``JsonValue`` accessor surface (``get``, ``get_optional``, +``is_string()`` ...) 
so event visitors can be written generically. + +.. code-block:: cpp + + #include + + using dftracer::utils::utilities::composites::dft::ArgsMap; + + ArgsMap args; + args.insert("hhash", std::uint64_t{0x1234}); + args.set_valid(true); + + uint64_t h = args["hhash"].get(0); + args.for_each_member([](std::string_view k, auto v) { /* ... */ }); + +JsonDictValue (Python) +~~~~~~~~~~~~~~~~~~~~~~ + +Python-facing wrapper that exposes a parsed JSON object as a lazy +``Mapping``. Used by the ``TraceReader`` Python binding to surface parsed +events without materialising a ``dict`` per row. Defined in +``src/dftracer/utils/python/json.h``. + +.. note:: + + :cpp:class:`dftracer::utils::StringIntern` was reimplemented as a + lock-free open-chained hash table with a fast-path id table + (``FAST_CAPACITY = 1<<20``). Lookups are fully lock-free; only the rare + first insert of a string takes the insertion mutex. + Query ----- @@ -190,6 +247,7 @@ Percentile estimation and histogram utilities for trace analysis. #include #include + #include DDSketch ~~~~~~~~ @@ -274,6 +332,24 @@ Fixed 65-bin logarithmic histogram covering the ``uint64_t`` range. Bin 0 holds std::string json = a.to_json(); Log2Histogram restored = Log2Histogram::from_json(json); +TimestampHistogram +~~~~~~~~~~~~~~~~~~ + +Sparse fixed-width (100 ms) histogram over event timestamps. Used by the +chunk pruner to compute time-range selectivity and to weight sub-bucket +expansions for adaptive aggregation. + +.. 
code-block:: cpp + + TimestampHistogram th; + for (auto ts_us : timestamps) th.add(ts_us); + + std::uint64_t in_window = th.count_in_range(ts_lo, ts_hi); + double sel = th.selectivity(ts_lo, ts_hi); + + auto bytes = th.serialize(); + auto restored = TimestampHistogram::deserialize(bytes.data(), bytes.size()); + Arrow ----- diff --git a/docs/source/utilities/composites.rst b/docs/source/utilities/composites.rst index 4e1ea430..35e0736a 100644 --- a/docs/source/utilities/composites.rst +++ b/docs/source/utilities/composites.rst @@ -502,10 +502,44 @@ Complete example of gathering statistics from a DFTracer trace file: std::cout << "Duration p99: " << stats.merged.duration_sketch.quantile(0.99) << " us" << std::endl; +DFT Event Pipeline +------------------ + +DftEventDispatcher +~~~~~~~~~~~~~~~~~~ + +Adapter that turns a list of ``DftEventVisitor`` instances into a single +``IndexVisitor`` consumable by ``IndexBuilderUtility``. Owns a per-instance +``JsonParser`` and parses each decompressed line once before fanning out to +the configured visitors (``BloomVisitor``, ``ManifestVisitor``, +``AggregationVisitor``, ...). Supports a ``force_serial`` mode for +deterministic-order replays. + +.. code-block:: cpp + + #include + + std::vector> visitors; + visitors.push_back(std::make_unique(...)); + visitors.push_back(std::make_unique(...)); + DftEventDispatcher dispatcher(std::move(visitors)); + +AggregationVisitor +~~~~~~~~~~~~~~~~~~ + +Emits per-chunk aggregation + system-metric merge operands into the +distributed aggregation column families. Pairs with +``AggregationMergeOperator`` / ``SystemMetricsMergeOperator`` for +distributed reduction; lives in +``composites/dft/aggregators/aggregation_visitor.h``. + Reorganization Pipeline ----------------------- -Parallel event routing for reorganizing traces by query-based groups. +Parallel event routing for reorganizing traces by query-based groups. 
The +``organize`` flow is a streaming pipeline that fans events through visitor +groups, batches output, and periodically flushes group writers +(``GroupWriterTask``) to bound peak memory. ChunkWriter ~~~~~~~~~~~ @@ -562,6 +596,15 @@ Tracks source-to-output mapping during reorganization. Records which source file and line produced each output event, enabling reconstruction of original traces from reorganized files via ``dftracer_reconstruct``. +ReconstructorUtility +~~~~~~~~~~~~~~~~~~~~ + +Streaming reconstruction pipeline that inverts the organize pipeline: +plans a reconstruction over a ``.pidx`` provenance store, fans out per-source +read tasks through coroutines and channels, and merges results in +original order back into the requested output. Defined in +``composites/dft/reorganize/reconstructor_utility.h``. + Comparison ---------- diff --git a/docs/source/utilities/compression.rst b/docs/source/utilities/compression.rst index 7d5d5ca4..23ecacf6 100644 --- a/docs/source/utilities/compression.rst +++ b/docs/source/utilities/compression.rst @@ -4,6 +4,21 @@ Compression Streaming zlib compression and decompression utilities supporting GZIP, ZLIB, and DEFLATE formats. All compression operates in streaming mode using zero-copy ``ByteView`` chunks. +.. note:: + + The default gzip level used by the writer pipeline (``dftracer_aggregator``, + ``dftracer_organize``, parallel writers) is ``1`` (fastest); previous + releases defaulted to ``Z_DEFAULT_COMPRESSION`` (6). Override per-call with + the ``compression_level`` field on ``ManualStreamingCompressorUtility``. + +.. note:: + + The build defaults to zlib-ng (compat ABI) when the ``DFTRACER_USE_ZLIB_NG`` + CMake option is ``ON`` (the default), falling back to ``madler/zlib`` if + zlib-ng cannot be added. The compressor sources are unchanged: the same + ``deflate``/``inflate`` symbols are linked against whichever backend was + selected at configure time. + .. 
code-block:: cpp #include @@ -59,6 +74,18 @@ Yields compressed chunks as ``ByteView`` references into an internal buffer. std::size_t bytes_out = compressor.total_bytes_out(); double ratio = compressor.compression_ratio(); +Buffered Compression +-------------------- + +Writer pipelines (parallel writer, perfetto trace writer, organize group +writers) buffer compressed payloads and flush at a configurable +``flush_threshold``. The threshold is computed by +``compute_writer_sizing()`` from the detected filesystem layout +(``LayoutInfo``): on Lustre/GPFS the threshold is sized to the PFS stripe +so each compressed flush fits one stripe; on local FS it is ``max(default, +stripe_size)``. Buffer capacity is always ``flush_threshold + +buffer_headroom``. + StreamingDecompressorUtility ---------------------------- diff --git a/docs/source/utilities/fileio.rst b/docs/source/utilities/fileio.rst index 01e0db80..2d673601 100644 --- a/docs/source/utilities/fileio.rst +++ b/docs/source/utilities/fileio.rst @@ -248,6 +248,68 @@ Read lines from ``.gz`` files without building an index, using streaming decompr process(*line); } +Parallel Writers +---------------- + +Layout-aware parallel writers for multi-worker output. The ``ParallelWriter`` +interface is implemented by three concrete layouts under +``fileio/parallel/``: + +- **StripedWriter** — single output file, atomic-offset ``pwrite`` per + worker. Used on local FS and PFS without padded stripes. +- **PaddedStripedWriter** — single output file where each worker chunk is + padded to a full PFS stripe so per-stripe writes never cross workers. + Recommended for Lustre/GPFS when the stripe size is at least + ``MIN_PADDED_STRIPE_BYTES`` (1 MiB). +- **ShardedWriter** — N output files, one per worker, glob-named by + ordinal. Used on NFS where atomic-offset ``pwrite`` is not reliable. + +.. 
code-block:: cpp + + #include + #include + + using namespace dftracer::utils::utilities::fileio::parallel; + + auto info = detect_layout("/lustre/.../output.pfw.gz"); + auto sizing = compute_writer_sizing(info, /*baseline_workers=*/64, + /*default_flush=*/4 << 20, + /*headroom=*/1 << 20, + /*padded=*/true); + + WriterConfig cfg{ + .layout = info.layout, + .stripe_size = info.stripe_size, + .gzip = true, + }; + auto writer = make_writer(cfg); + co_await writer->open("output.pfw.gz", sizing.num_workers, + /*gzip_extension=*/true, scope); + + co_await writer->write_header(header_bytes); + co_await writer->write_chunk(worker_id, chunk_bytes); + auto member = writer->last_member(worker_id); // offset+length of the gzip member + co_await writer->write_footer(footer_bytes); + co_await writer->close(); + +The writer collects per-chunk ``MemberSpan`` entries (offset + length of +each independently decompressable gzip member) and exposes them via +``member_layout()`` after close. ``shard_base_offsets()`` remaps shard-local +offsets to merged-file offsets for sharded layouts. + +Layout detection (``detect_layout``) classifies a path's filesystem as +Lustre, GPFS, BeeGFS, NFS, or LOCAL and picks ``SHARDED`` on NFS, +``STRIPED`` elsewhere; ``compute_writer_sizing`` caps worker count at the +PFS stripe count and sets ``flush_threshold`` to the stripe size for +padded layouts so each compressed flush coalesces into one stripe. + +.. note:: + + Compressor generators consumed by the parallel writer are wrapped in + smart pointers (``std::unique_ptr``) + so they can be moved across coroutine frames without leaking the + underlying zlib stream. 
+ Async vs Synchronous -------------------- diff --git a/docs/source/utilities/indexer.rst b/docs/source/utilities/indexer.rst index 6306ecd8..60e8bf42 100644 --- a/docs/source/utilities/indexer.rst +++ b/docs/source/utilities/indexer.rst @@ -1,7 +1,7 @@ Indexer ================= -Unified indexing and reading infrastructure for compressed trace files. Builds sidecar ``.idx`` files that enable efficient random access, bloom-filter-accelerated queries, and event-level manifest routing — all in a single decompression pass. +Unified indexing and reading infrastructure for compressed trace files. Builds a sidecar ``.dftindex`` RocksDB store (and optional flat-file SSTs) that enables efficient random access, bloom-filter-accelerated queries, event-level manifest routing, and distributed aggregation, all from a single decompression pass. .. code-block:: cpp @@ -11,19 +11,25 @@ Unified indexing and reading infrastructure for compressed trace files. Builds s Overview -------- -The indexer builds sidecar ``.idx`` files with an additive SQLite schema: +The indexer writes column families into a shared ``.dftindex`` RocksDB store +(or, for distributed builds, a content-addressed SST staging directory that +is ingested into the store): - **Checkpoints** — byte offsets and decompression dictionaries for random access - **Bloom filters** — per-chunk bloom filters for fast event filtering (optional) - **Chunk statistics** — per-chunk event counts, duration distributions (optional) -- **Manifest** — per-chunk event-to-line routing for reorganization (optional) +- **Manifest** — per-chunk (cat, name) -> line numbers for sparse query routing (optional) +- **Aggregation / system metrics** — distributed aggregation CFs populated via + ``SstFileWriter::Merge`` operands -A separate ``.pidx`` file stores provenance data for reorganization tracking. 
+SST files staged on disk are **content-addressed** (FNV-1a 64-bit fingerprint +over the SST payload) so identical SSTs produced by different ranks collapse +to a single ingest, and re-ingesting is idempotent. String IDs in the +``names`` and ``cats`` CFs are deterministic FNV-1a hashes so the same name +maps to the same id across processes. -Sidecar files: - -- ``.idx`` — Unified content index (checkpoints + bloom filters + chunk statistics + manifest) -- ``.pidx`` — Provenance index (reorganization tracking) +A separate ``.pidx`` provenance store tracks source-to-output mapping for +reorganized files. IndexBuilder ------------ @@ -64,6 +70,54 @@ Single-pass index builder. Decompresses each file once and builds all requested // Later: all features present, skips entirely co_await builder.process(config2); // "Skipping already-indexed file" +IndexBatchBuilderUtility +------------------------ + +Builds many files in a single pipelined pass. Parses files in parallel +(``parallelism`` workers) and routes their parsed artifacts (bloom rows, +manifest entries, aggregation merge operands, extra-visitor SSTs) to a +write phase. Supports batched flushing (``flush_every_files``) to bound +peak memory, distributed SST sinks via ``sink_factory`` / ``sink_commit``, +preassigned file ids, and per-file gzip-member slicing for cross-rank file +splitting (the MPI driver pre-scans each ``.pfw.gz`` for member boundaries +and assigns disjoint ``[member_begin, member_end)`` ranges to ranks). + +.. 
code-block:: cpp + + #include + + IndexBuildBatchConfig cfg; + cfg.file_paths = {"a.pfw.gz", "b.pfw.gz", "c.pfw.gz"}; + cfg.index_dir = "/data/.dftindex"; + cfg.parallelism = 16; + cfg.build_manifest = true; + cfg.use_batch_write = true; + cfg.rebuild_root_summaries = true; + cfg.flush_every_files = 8; + + auto batch = co_await IndexBatchBuilderUtility::process(scope, + std::make_shared(std::move(cfg))); + +IndexDatabaseWriterContext +-------------------------- + +Implements ``IndexBatchSink`` over a coordinator-owned RocksDB store: each +batch's parsed artifacts are buffered, then committed atomically via +``WriteBatch``. ``IndexDatabaseSstWriterContext`` is the SST-staging +variant used by the distributed indexer; its outputs are content-addressed +SST files later ingested into the coordinator store. + +IndexResolverUtility +-------------------- + +Resolves the index directory for a given trace file, building the index on +demand when ``auto_build_index`` is set. Lives in +``composites/dft/indexing/`` because it depends on the DFT visitor set. + +.. code-block:: cpp + + #include + IndexDatabase ------------- @@ -192,10 +246,17 @@ Interface for processing decompressed lines during index building. 
Implementatio virtual void finalize(IndexDatabase& db, int file_id) = 0; }; -Built-in visitors: - -- **BloomVisitor** — parses JSON events, populates bloom filters and chunk statistics -- **ManifestVisitor** — tracks (category, name) to line number mappings per checkpoint +Built-in event visitors live in ``composites/dft/visitors/`` (they extend +``DftEventVisitor`` and are wrapped by ``DftEventDispatcher``, which +implements ``IndexVisitor``): + +- **BloomVisitor** (``composites/dft/visitors/bloom_visitor.h``) - parses + JSON events, populates bloom filters and chunk statistics +- **ManifestVisitor** (``composites/dft/visitors/manifest_visitor.h``) - + tracks (category, name) -> line numbers per checkpoint for sparse query + acceleration +- **AggregationVisitor** (``composites/dft/aggregators/aggregation_visitor.h``) + - emits per-chunk aggregation and system-metric merge operands Low-level IndexerFactory ------------------------ diff --git a/docs/source/utilities/reader.rst b/docs/source/utilities/reader.rst index 4bd39946..b8695e37 100644 --- a/docs/source/utilities/reader.rst +++ b/docs/source/utilities/reader.rst @@ -15,7 +15,39 @@ Streaming reader for compressed trace files with support for line-based and byte Overview -------- -The reader provides random access into indexed compressed files (``.pfw.gz`` + ``.idx``). It supports multiple stream types for different access patterns and both synchronous and asynchronous reads. +The reader provides random access into indexed compressed files +(``.pfw.gz`` + ``.dftindex``). It supports multiple stream types for +different access patterns and both synchronous and asynchronous reads. 
+ +The high-level :cpp:class:`dftracer::utils::utilities::reader::TraceReader` +exposes: + +- **Directory input** (Python binding): when constructed with a directory, + all matching ``.pfw.gz`` files share one ``.dftindex`` root and are + processed in parallel (each file becomes one or more checkpoint-level + work items routed across the runtime thread pool). +- **JSON streaming** (``read_json``): each line is parsed once with a + reused ``simdjson`` ondemand ``JsonParser``; the yielded ``JsonLine`` + borrows the parser until the next ``next()`` call. +- **Arrow streaming** (``read_arrow``, Python ``iter_arrow_stream``): + yields native ``ArrowExportResult`` record batches sized at + ``batch_size`` rows. The Python binding exposes this as an Arrow C + Data Interface stream (no Python-side row materialisation). +- **Query filtering**: an optional ``query`` DSL string is compiled into + AND-of-EQ probes when possible. The compiled probes evaluate directly + against simdjson fields, with a uniform-match shortcut when every + candidate chunk fully matches the predicate (no per-event re-evaluation). +- **Line-range work items**: the dispatcher splits a file's checkpoints + into independent line-range work items that the runtime executes in + parallel; ``ReadConfig::skip_pruning`` lets the dispatcher avoid + re-running the chunk pruner per work item. +- **Batch chunk pruning**: a single pruner pass per file feeds all work + items, using ``ChunkPrunerUtility`` over bloom filters, chunk + statistics, and the manifest CF. +- **flatten_objects**: when set, top-level JSON object values + (e.g. ``args``) are expanded one level into ``parent.child`` columns + with native Arrow types; deeper nesting still round-trips as a JSON + text column. 
ReaderFactory ------------- diff --git a/docs/source/utilities/replay.rst b/docs/source/utilities/replay.rst index 44556984..0302d170 100644 --- a/docs/source/utilities/replay.rst +++ b/docs/source/utilities/replay.rst @@ -3,6 +3,18 @@ Replay The replay utility replays DFTracer trace files by reading recorded events and executing them in a configurable replay mode. It supports plain text and gzipped traces, dry-run analysis, timing-aware replay, and filtered execution for focused testing. +.. note:: + + The engine is now pipelined with C++20 coroutines and channels: trace + reading, JSON parsing, filtering, and execution run as concurrent stages + communicating through bounded channels, so a slow executor no longer + blocks the reader. JSON parsing uses the shared + :cpp:class:`dftracer::utils::utilities::common::json::JsonParser` + (on-demand simdjson) which reuses one padded buffer per stage. String + handling and file I/O have been re-tuned with a fixed read buffer and + ``string_view`` line slicing; the public ``ReplayEngine`` / + ``ReplayConfig`` / ``ReplayResult`` API is unchanged. + .. 
code-block:: cpp #include diff --git a/examples/call_tree_example1.cpp b/examples/call_tree_example1.cpp index 490be316..b5544c75 100644 --- a/examples/call_tree_example1.cpp +++ b/examples/call_tree_example1.cpp @@ -4,9 +4,19 @@ */ #include +#include +#include +#include +#include +#include + #include using namespace dftracer::utils::call_tree; +using dftracer::utils::CoroScope; +using dftracer::utils::Pipeline; +using dftracer::utils::make_task; +namespace coro = dftracer::utils::coro; int main(int argc, char* argv[]) { printf("=== CallTree API Example 1: Basic Usage ===\n"); @@ -59,30 +69,29 @@ int main(int argc, char* argv[]) { } printf("\n"); - // Save to file - printf("Step 6: Serialize call tree to binary file\n"); - std::string output_file = tree.get_output_path(); - printf(" Default output path: %s\n", output_file.c_str()); - - if (tree.save_to_file()) { - printf(" Successfully saved!\n"); - } - printf("\n"); - - // Save to JSON format - printf("Step 7: Serialize call tree to JSON (Chrome Tracing format)\n"); - if (tree.save_to_json()) { - printf(" Successfully saved to JSON!\n"); + // Step 6: persist via the coroutine save APIs driven by a Pipeline. 
+ printf("Step 6: Save call tree (custom binary + Arrow IPC)\n"); + const std::string bin_path = "nodes-1_calltree.bin"; + const std::string arrow_path = "nodes-1_calltree.arrow"; + bool bin_ok = false, arrow_ok = false; + { + Pipeline pipeline; + auto save = make_task( + [&](CoroScope& scope) -> coro::CoroTask { + bin_ok = co_await save_binary(&scope, tree.internal_tree(), + bin_path); + arrow_ok = co_await save_arrow(&scope, tree.internal_tree(), + arrow_path); + }, + "save_call_tree"); + pipeline.set_source(save); + pipeline.set_destination(save); + pipeline.execute(); } - printf("\n"); - - // Print tree to text file - printf("Step 8: Export call tree to text file\n"); - std::string text_file = "nodes-1_calltree.txt"; - if (tree.print_depth_first_to_file(text_file)) { - printf(" Exported to: %s\n", text_file.c_str()); - } - + printf(" Binary: %s -> %s\n", bin_path.c_str(), bin_ok ? "ok" : "failed"); + printf(" Arrow: %s -> %s\n", arrow_path.c_str(), + arrow_ok ? "ok" : "failed"); + printf("\n=== Example completed successfully ===\n"); return 0; diff --git a/examples/call_tree_example2.cpp b/examples/call_tree_example2.cpp index ab3bdc70..0b4fb4d7 100644 --- a/examples/call_tree_example2.cpp +++ b/examples/call_tree_example2.cpp @@ -4,10 +4,20 @@ */ #include +#include +#include +#include +#include +#include + #include #include using namespace dftracer::utils::call_tree; +using dftracer::utils::CoroScope; +using dftracer::utils::Pipeline; +using dftracer::utils::make_task; +namespace coro = dftracer::utils::coro; int main(int argc, char* argv[]) { printf("=== CallTree API Example 2: Multi-Node Traces ===\n"); @@ -84,29 +94,29 @@ int main(int argc, char* argv[]) { } printf("\n"); - // Save outputs printf("--- Saving Outputs ---\n"); - - // Set custom output path - tree.set_output_path("nodes-4_calltree.bin"); - - if (tree.save_to_file()) { - printf("Binary format saved: nodes-4_calltree.bin\n"); + const std::string bin_path = "nodes-4_calltree.bin"; + const 
std::string arrow_path = "nodes-4_calltree.arrow"; + bool bin_ok = false, arrow_ok = false; + { + Pipeline pipeline; + auto save = make_task( + [&](CoroScope& scope) -> coro::CoroTask { + bin_ok = co_await save_binary(&scope, tree.internal_tree(), + bin_path); + arrow_ok = co_await save_arrow(&scope, tree.internal_tree(), + arrow_path); + }, + "save_call_tree"); + pipeline.set_source(save); + pipeline.set_destination(save); + pipeline.execute(); } - - // Save to JSON (Chrome Tracing format) - if (tree.save_to_json("nodes-4_calltree.pfw")) { - printf("JSON format saved: nodes-4_calltree.pfw (Chrome Tracing compatible)\n"); - } - - if (tree.print_depth_first_to_file("nodes-4_calltree_full.txt", 0)) { - printf("Full tree saved: nodes-4_calltree_full.txt\n"); - } - - if (tree.print_depth_first_to_file("nodes-4_calltree_summary.txt", 2)) { - printf("Summary (2 levels) saved: nodes-4_calltree_summary.txt\n"); - } - + printf("Binary format: %s -> %s\n", bin_path.c_str(), + bin_ok ? "saved" : "failed"); + printf("Arrow IPC: %s -> %s\n", arrow_path.c_str(), + arrow_ok ? "saved" : "failed"); + printf("\n=== Example completed successfully ===\n"); return 0; diff --git a/examples/call_tree_example3.cpp b/examples/call_tree_example3.cpp index c2cfb385..d6fd704a 100644 --- a/examples/call_tree_example3.cpp +++ b/examples/call_tree_example3.cpp @@ -4,12 +4,22 @@ */ #include +#include +#include +#include +#include +#include + #include #include #include #include using namespace dftracer::utils::call_tree; +using dftracer::utils::CoroScope; +using dftracer::utils::Pipeline; +using dftracer::utils::make_task; +namespace coro = dftracer::utils::coro; static void analyze_call_patterns(const std::vector& nodes) { printf("\n--- Call Pattern Analysis ---\n"); @@ -122,12 +132,26 @@ int main(int argc, char* argv[]) { // Also print the built-in statistics tree.print_statistics(); - // Save analysis results in JSON format for downstream processing + // Save analysis results. 
For Chrome Tracing JSON use the + // dftracer_call_tree binary; for fast C++ round-trip use save_binary; + // for Arrow tooling use save_arrow. Both run inside a Pipeline. printf("\nSaving analysis results...\n"); - if (tree.save_to_json("analysis_output.pfw")) { - printf("✓ JSON output saved to: analysis_output.pfw\n"); - printf(" This file can be imported into Chrome Tracing, Perfetto,\n"); - printf(" or analyzed with DFAnalyzer tools.\n"); + bool arrow_ok = false; + { + Pipeline pipeline; + auto save = make_task( + [&](CoroScope& scope) -> coro::CoroTask { + arrow_ok = co_await save_arrow(&scope, tree.internal_tree(), + "analysis_output.arrow"); + }, + "save_call_tree"); + pipeline.set_source(save); + pipeline.set_destination(save); + pipeline.execute(); + } + if (arrow_ok) { + printf("Arrow IPC output saved to: analysis_output.arrow\n"); + printf(" Readable by pyarrow / polars / dfanalyzer.\n"); } printf("\n=== Analysis complete ===\n"); diff --git a/flake.lock b/flake.lock deleted file mode 100644 index d9bd5088..00000000 --- a/flake.lock +++ /dev/null @@ -1,27 +0,0 @@ -{ - "nodes": { - "nixpkgs": { - "locked": { - "lastModified": 1758690382, - "narHash": "sha256-NY3kSorgqE5LMm1LqNwGne3ZLMF2/ILgLpFr1fS4X3o=", - "owner": "NixOS", - "repo": "nixpkgs", - "rev": "e643668fd71b949c53f8626614b21ff71a07379d", - "type": "github" - }, - "original": { - "owner": "NixOS", - "ref": "nixos-unstable", - "repo": "nixpkgs", - "type": "github" - } - }, - "root": { - "inputs": { - "nixpkgs": "nixpkgs" - } - } - }, - "root": "root", - "version": 7 -} diff --git a/flake.nix b/flake.nix deleted file mode 100644 index d0f8d887..00000000 --- a/flake.nix +++ /dev/null @@ -1,54 +0,0 @@ -{ - description = "DFTracer Utilities"; - - inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; - - outputs = { self, nixpkgs }: - let - systems = [ "x86_64-linux" "aarch64-linux" "x86_64-darwin" "aarch64-darwin" ]; - forAllSystems = f: - nixpkgs.lib.genAttrs systems (system: - f system (import 
nixpkgs { inherit system; })); - in - { - devShells = forAllSystems (system: pkgs: - let - gcc = pkgs.gcc14; - in { - default = pkgs.mkShell { - packages = [ gcc ] ++ (with pkgs; [ - cmake - ninja - pkg-config - pigz - lcov - openmpi - cmake-format - doxygen - graphviz - # valgrind-light - # sqlite - # zlib - # spdlog - gnutar - python311 - python312 - (python310.withPackages (p: [ - p.cython - p.setuptools - p.wheel - p.venvShellHook - ])) - ]); - - CC = "gcc"; - CXX = "g++"; - shellHook = '' - export CC=gcc - export CXX=g++ - unset SOURCE_DATE_EPOCH - ''; - }; - }); - }; -} diff --git a/include/dftracer/utils/call_tree/call_tree.h b/include/dftracer/utils/call_tree/call_tree.h index 8e88f915..cc79e6b2 100644 --- a/include/dftracer/utils/call_tree/call_tree.h +++ b/include/dftracer/utils/call_tree/call_tree.h @@ -62,8 +62,9 @@ struct CallTreeStats { // Forward declarations namespace internal { +class CallTree; class CallTreeImpl; -} +} // namespace internal /** * @brief Simple, clean API for working with call trees from DFTracer traces. @@ -114,65 +115,11 @@ class CallTree { */ bool generate(); - /** - * Print the call tree in depth-first order to stdout - * @param max_depth Maximum depth to print (0 = unlimited) - */ + /// Print depth-first tree to stdout. max_depth=0 means unlimited. 
void print_depth_first(int max_depth = 0) const; - /** - * Print the call tree to a file in depth-first order - * @param filename Output file path - * @param max_depth Maximum depth to print (0 = unlimited) - * @return true if successful, false otherwise - */ - bool print_depth_first_to_file(const std::string& filename, - int max_depth = 0) const; - - /** - * Get list of nodes in depth-first traversal order - * Returns simple node info structures (no complex internals) - * @return Vector of node information structures - */ std::vector get_nodes_depth_first() const; - /** - * Get the path where serialized tree would be saved - * @return Default output path based on input directory - */ - std::string get_output_path() const; - - /** - * Set custom output path for serialization - * @param path Custom output file path - */ - void set_output_path(const std::string& path); - - /** - * Serialize and save call tree to file in binary format - * @param filename Output file path (optional, uses get_output_path() if - * empty) - * @return true if successful, false otherwise - */ - bool save_to_file(const std::string& filename = "") const; - - /** - * Serialize and save call tree to file in JSON format (Chrome - * Tracing/Perfetto compatible) Follows DFTracer serialization format for - * compatibility with existing analysis tools - * @param filename Output file path (optional, uses get_output_path() with - * .pfw extension if empty) - * @return true if successful, false otherwise - */ - bool save_to_json(const std::string& filename = "") const; - - /** - * Load call tree from previously saved file - * @param filename Input file path - * @return true if successful, false otherwise - */ - bool load_from_file(const std::string& filename); - /** * Get aggregate statistics about the call tree * @return Statistics structure with aggregate information @@ -236,6 +183,12 @@ class CallTree { */ CallTreeNodeInfo get_node_by_id(std::uint64_t id) const; + /// Direct access to the underlying 
internal::CallTree. Use with the + /// save_binary / save_arrow coroutines in mpi/serializable.h. Returns a + /// reference; callers must keep the CallTree alive while it's in use. + internal::CallTree& internal_tree(); + const internal::CallTree& internal_tree() const; + private: std::unique_ptr impl_; }; diff --git a/include/dftracer/utils/call_tree/call_tree_mpi.h b/include/dftracer/utils/call_tree/call_tree_mpi.h index 8f112a3a..7f2b1d90 100644 --- a/include/dftracer/utils/call_tree/call_tree_mpi.h +++ b/include/dftracer/utils/call_tree/call_tree_mpi.h @@ -1,39 +1,16 @@ #ifndef DFTRACER_UTILS_CALL_TREE_MPI_H #define DFTRACER_UTILS_CALL_TREE_MPI_H -/** - * @file call_tree_mpi.h - * @brief Umbrella header for MPI-parallel call tree components - * - * This header provides convenient access to all MPI-related call tree - * functionality. Individual components can also be included separately - * from the mpi/ subdirectory for finer-grained control. - * - * Components included: - * - PIDIndexInfo: PID index information structure - * - SerializableCallNode/ProcessGraph: Serializable structures for MPI transfer - * - MPICallTreeConfig/Result: Configuration and result structures - * - CallGraphFileHeader: File header for call graph serialization - * - MPICallTreeBuilder: Main builder class for MPI-parallel call graph - * generation - * - MPIFilteredTraceReader: Filtered trace reader for specific PIDs - * - CallTreeBuildTask: Pipeline task for call tree building - * - serialization utilities: Read/write primitives for MPI serialization - */ +// MPI call-tree umbrella header. The engine is the coroutine-driven +// MPICallTreeBuilder; older per-component headers (file_header, build_task, +// filtered_reader, pid_index_info, serialization) were removed when the +// build/gather phases moved to ParallelWriter + merge_shards on Chrome +// Tracing JSON output. mpi/serializable.h remains (binary/Arrow save+load). 
-// Include all MPI call tree components -#include -#include -#include -#include -#include -#include -#include -#include - -// Include specific internal call tree components needed by MPI #include #include #include +#include +#include #endif // DFTRACER_UTILS_CALL_TREE_MPI_H diff --git a/include/dftracer/utils/call_tree/internal/call_tree.h b/include/dftracer/utils/call_tree/internal/call_tree.h index 35511ab9..791388f4 100644 --- a/include/dftracer/utils/call_tree/internal/call_tree.h +++ b/include/dftracer/utils/call_tree/internal/call_tree.h @@ -116,6 +116,12 @@ class CallTree { */ void add_call(const ProcessKey& key, std::shared_ptr call); + // Moves every ProcessCallTree out of `other` into this tree. When both + // sides share a ProcessKey, calls/call_sequence from `other` are appended. + // `other` is left empty; intended for joining per-file CallTree fragments + // built concurrently into a single merged tree. + void merge_from(CallTree&& other); + /** * Build parent-child relationships after all traces loaded * Called by TraceReader after all data is loaded diff --git a/include/dftracer/utils/call_tree/internal/factory.h b/include/dftracer/utils/call_tree/internal/factory.h index 8598b02d..3e3cdf37 100644 --- a/include/dftracer/utils/call_tree/internal/factory.h +++ b/include/dftracer/utils/call_tree/internal/factory.h @@ -5,8 +5,7 @@ #include #include -#include -#include +#include #include namespace dftracer::utils::call_tree { @@ -46,10 +45,12 @@ class CallTreeFactory { * Create a new CallTreeNode from trace event data * The factory manages the lifecycle of created nodes */ - std::shared_ptr create_node( - std::uint64_t id, const std::string& name, const std::string& category, - std::uint64_t start_time, std::uint64_t duration, int level, - const std::unordered_map& args = {}); + std::shared_ptr create_node(std::uint64_t id, + std::string_view name, + std::string_view category, + std::uint64_t start_time, + std::uint64_t duration, int level, + ArgsMap 
args = {}); /** * Get total number of nodes created by this factory diff --git a/include/dftracer/utils/call_tree/internal/node.h b/include/dftracer/utils/call_tree/internal/node.h index 69b9c800..440d6e09 100644 --- a/include/dftracer/utils/call_tree/internal/node.h +++ b/include/dftracer/utils/call_tree/internal/node.h @@ -1,102 +1,68 @@ #ifndef DFTRACER_UTILS_CALL_TREE_INTERNAL_NODE_H #define DFTRACER_UTILS_CALL_TREE_INTERNAL_NODE_H +#include + #include #include -#include +#include #include namespace dftracer::utils::call_tree { namespace internal { -/** - * CallTreeNode - Represents a single function call in the trace - * Follows initialization pattern: - * 1. Constructor: Initialize internal variables, pointers to defaults (no - * allocation) - * 2. initialize(): Initialize state and perform allocations - * 3. cleanup(): Deallocate memory and clean up state - * 4. Destructor: Clear all state and reset variables - */ +using ArgsMap = dftracer::utils::utilities::composites::dft::ArgsMap; + +// name_ and category_ are non-owning views into a process-wide StringIntern +// pool. ArgsMap interns its own keys. 
class CallTreeNode { public: - /** - * Constructor - initializes internal variables and pointers to defaults - * No memory allocation or recursion - */ CallTreeNode(); - - /** - * Parameterized constructor for setting basic properties - */ - CallTreeNode(std::uint64_t id, const std::string& name, - const std::string& category); - - /** - * Destructor - clears all state of variables and resets them - */ + CallTreeNode(std::uint64_t id, std::string_view name, + std::string_view category); ~CallTreeNode(); - // Disable copy operations to prevent unintended copies CallTreeNode(const CallTreeNode&) = delete; CallTreeNode& operator=(const CallTreeNode&) = delete; - // Enable move operations for efficient transfers CallTreeNode(CallTreeNode&& other) noexcept; CallTreeNode& operator=(CallTreeNode&& other) noexcept; - /** - * Initialize the state of class private variables and allocations - * Called after constructor to set up the node with specific values - */ - void initialize(std::uint64_t id, const std::string& name, - const std::string& category, std::uint64_t start_time, + void initialize(std::uint64_t id, std::string_view name, + std::string_view category, std::uint64_t start_time, std::uint64_t duration, int level); - /** - * Cleanup - deallocates memory and cleans up state - * Called only at the end, ensures no memory leaks - */ void cleanup(); - // Getters std::uint64_t get_id() const { return id_; } - const std::string& get_name() const { return name_; } - const std::string& get_category() const { return category_; } + std::string_view get_name() const { return name_; } + std::string_view get_category() const { return category_; } std::uint64_t get_start_time() const { return start_time_; } std::uint64_t get_duration() const { return duration_; } int get_level() const { return level_; } std::uint64_t get_parent_id() const { return parent_id_; } - const std::unordered_map& get_args() const { - return args_; - } + const ArgsMap& get_args() const { return args_; } + 
ArgsMap& mut_args() { return args_; } const std::vector& get_children() const { return children_; } - // Setters void set_parent_id(std::uint64_t parent_id) { parent_id_ = parent_id; } void add_child(std::uint64_t child_id) { children_.push_back(child_id); } - void add_arg(const std::string& key, const std::string& value) { - args_[key] = value; - } - void set_args(const std::unordered_map& args) { - args_ = args; - } + void set_args(ArgsMap args) { args_ = std::move(args); } private: std::uint64_t id_; - std::string name_; - std::string category_; + std::string_view name_; + std::string_view category_; std::uint64_t start_time_; std::uint64_t duration_; int level_; std::uint64_t parent_id_; - std::unordered_map args_; + ArgsMap args_; std::vector children_; bool initialized_; bool cleaned_up_; }; -// Keep FunctionCall as alias for backward compatibility using FunctionCall = CallTreeNode; } // namespace internal diff --git a/include/dftracer/utils/call_tree/internal/trace_reader.h b/include/dftracer/utils/call_tree/internal/trace_reader.h index 80af58e7..1f858e58 100644 --- a/include/dftracer/utils/call_tree/internal/trace_reader.h +++ b/include/dftracer/utils/call_tree/internal/trace_reader.h @@ -1,82 +1,56 @@ #ifndef DFTRACER_UTILS_CALL_TREE_INTERNAL_TRACE_READER_H #define DFTRACER_UTILS_CALL_TREE_INTERNAL_TRACE_READER_H +#include +#include + +#include +#include #include +#include #include #include namespace dftracer::utils::call_tree { namespace internal { -// Forward declaration class CallTree; -/** - * Callback function type for processing traces - * Returns true to continue processing, false to stop - */ using TraceCallback = std::function; -/** - * TraceReader - Handles reading and parsing trace files - * Separates I/O concerns from the CallTree data structure - * Supports reading from single files, multiple files, or directories - */ class TraceReader { public: TraceReader() = default; ~TraceReader() = default; - /** - * Read trace file and populate call 
graph - * @param trace_file Path to trace log file - * @param graph CallTree to populate - * @return true if successful, false otherwise - */ bool read(const std::string& trace_file, CallTree& graph); - - /** - * Read multiple trace files and populate call graph - * Each file may contain traces from different nodes/processes - * @param trace_files Vector of paths to trace files - * @param graph CallTree to populate - * @return true if all files read successfully, false otherwise - */ bool read_multiple(const std::vector& trace_files, CallTree& graph); - - /** - * Read all trace files matching pattern from a directory - * @param directory Path to directory containing trace files - * @param pattern Glob pattern for trace files (e.g., "*.pfw") - * @param graph CallTree to populate - * @return true if successful, false otherwise - */ bool read_directory(const std::string& directory, const std::string& pattern, CallTree& graph); - /** - * Process a single JSON trace line - * Made public for MPI-based filtered readers - * @param line JSON line from trace file - * @param graph CallTree to add data to - * @return true if successful, false otherwise - */ + bool process_trace_line( + dftracer::utils::utilities::common::json::JsonParser& parser, + CallTree& graph); bool process_trace_line(const std::string& line, CallTree& graph); +}; - private: - /** - * Detect file format and use appropriate reader - * Returns true if read with Reader API, false if need fallback - */ - bool read_with_reader(const std::string& trace_file, CallTree& graph); - - /** - * Fallback to direct file reading for plain text files - */ - bool read_direct(const std::string& trace_file, CallTree& graph); +struct ReadCounts { + std::size_t processed = 0; + std::size_t filtered = 0; }; +// allowed_pids == nullptr disables filtering. +ReadCounts read_trace_file( + const std::string& trace_file, CallTree& graph, + const std::set* allowed_pids = nullptr); + +// Coroutine entry point. 
Drives utilities::reader::TraceReader::read_json +// inline so callers can fan out via CoroScope::spawn over multiple files. +coro::CoroTask read_trace_file_async( + std::string trace_file, CallTree* graph, + const std::set* allowed_pids = nullptr); + } // namespace internal } // namespace dftracer::utils::call_tree diff --git a/include/dftracer/utils/call_tree/json_serializer.h b/include/dftracer/utils/call_tree/json_serializer.h index 74821370..34af60c2 100644 --- a/include/dftracer/utils/call_tree/json_serializer.h +++ b/include/dftracer/utils/call_tree/json_serializer.h @@ -7,7 +7,6 @@ #include #include #include -#include namespace dftracer::utils::call_tree { namespace internal { @@ -88,9 +87,7 @@ class JsonSerializer { * @param stream Output string stream * @return True if metadata was present, false otherwise */ - bool convert_args_to_json( - const std::unordered_map& args, - std::stringstream& stream); + bool convert_args_to_json(const ArgsMap& args, std::stringstream& stream); std::string hostname_hash_; }; diff --git a/include/dftracer/utils/call_tree/mpi/build_task.h b/include/dftracer/utils/call_tree/mpi/build_task.h deleted file mode 100644 index a9a44c65..00000000 --- a/include/dftracer/utils/call_tree/mpi/build_task.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef DFTRACER_UTILS_CALL_TREE_MPI_BUILD_TASK_H -#define DFTRACER_UTILS_CALL_TREE_MPI_BUILD_TASK_H - -/** - * @file build_task.h - * @brief Pipeline-based call graph builder task - */ - -#include -#include - -#include -#include -#include -#include - -namespace dftracer::utils::call_tree { - -/** - * Pipeline-based call graph builder task - * Input: vector of trace files - * Output: internal::ProcessCallTree for assigned PIDs - */ -struct CallTreeBuildTask { - std::set pids; - std::vector trace_files; - - internal::ProcessCallTree execute(internal::CallTree& tree); -}; - -} // namespace dftracer::utils::call_tree - -#endif // DFTRACER_UTILS_CALL_TREE_MPI_BUILD_TASK_H diff --git 
a/include/dftracer/utils/call_tree/mpi/builder.h b/include/dftracer/utils/call_tree/mpi/builder.h index 2ea69e1f..791d0936 100644 --- a/include/dftracer/utils/call_tree/mpi/builder.h +++ b/include/dftracer/utils/call_tree/mpi/builder.h @@ -1,21 +1,14 @@ #ifndef DFTRACER_UTILS_CALL_TREE_MPI_BUILDER_H #define DFTRACER_UTILS_CALL_TREE_MPI_BUILDER_H -/** - * @file builder.h - * @brief MPICallTreeBuilder - Main class for MPI-parallel call graph generation - */ - #include #include +#include #include -#include -#include -#include -#include +#include +#include #include -#include #include #include #include @@ -23,172 +16,66 @@ namespace dftracer::utils::call_tree { -// Type alias for indexer shared pointer -using IndexerPtr = - std::shared_ptr; - -/** - * MPICallTreeBuilder - Main class for MPI-parallel call graph generation - * - * Usage: - * 1. Create builder with config - * 2. Call discover_pids() to find all PIDs in trace files - * 3. Call build() to generate call graphs in parallel - * 4. Call gather() to collect all graphs to all ranks (all-to-all) - * 5. Call save() to write to file - * - * Follows initialization pattern: - * 1. Constructor: Initialize internal variables (no allocation) - * 2. initialize(): Set up MPI, index files, discover PIDs - * 3. build(): Generate call graphs using pipeline - * 4. gather(): All-to-all MPI communication - * 5. save()/load(): File I/O - * 6. cleanup(): Deallocate memory - */ +// MPI-parallel call tree engine. Each method is a coroutine driven by the +// caller's pipeline; this class owns no Pipeline of its own. Phases follow +// the dftracer_aggregator_mpi pattern: +// +// discover_pids : cooperative PID pre-scan + allgather + round-robin +// assign so each pid is owned by exactly one rank. +// build : per-file CoroScope fan-out, pid-filtered ingest, +// merge of per-file fragments into a local tree. +// hierarchy : per-process CoroScope fan-out (each PID lives on +// one rank so no cross-rank dependency). 
+// write : per-rank Chrome Tracing JSON shard via sharded +// ParallelWriter (io_backend driven). +// merge : rank 0 concatenates shards into the final output +// via fileio::parallel::merge_shards. class MPICallTreeBuilder { public: - /** - * Constructor - initializes with configuration - */ explicit MPICallTreeBuilder(const MPICallTreeConfig& config); - - /** - * Destructor - */ ~MPICallTreeBuilder(); - // Disable copy MPICallTreeBuilder(const MPICallTreeBuilder&) = delete; MPICallTreeBuilder& operator=(const MPICallTreeBuilder&) = delete; - - // Enable move MPICallTreeBuilder(MPICallTreeBuilder&&) noexcept; MPICallTreeBuilder& operator=(MPICallTreeBuilder&&) noexcept; - /** - * Initialize MPI and internal structures - * Must be called after MPI_Init - */ - void initialize(); - - /** - * Cleanup and release resources - */ - void cleanup(); - - /** - * Add trace files to process - * @param files Vector of file paths - */ void add_trace_files(const std::vector& files); - - /** - * Add trace files from directory - * @param directory Path to directory - * @param pattern File pattern (e.g., "*.pfw.gz") - */ void add_trace_directory(const std::string& directory, const std::string& pattern = "*.pfw.gz"); - /** - * Phase 1: Discover all PIDs and build index - * Each MPI rank discovers PIDs from the trace files - * Results are gathered and PIDs are distributed - * @return Map of PID to index info - */ - std::map discover_pids(); - - /** - * Phase 2: Build call graphs for assigned PIDs - * Uses pipeline for parallel processing within rank - * @return Result containing success status and statistics - */ - MPICallGraphResult build(); - - /** - * Phase 3: All-to-all communication to share graphs - * After this, all ranks have identical copies of all call graphs - * @return true if successful - */ - bool gather(); - - /** - * Save the global call graph to file - * @param filename Output file path - * @return true if successful - */ - bool save(const std::string& 
filename) const; + coro::CoroTask discover_pids(CoroScope* scope); + coro::CoroTask build(CoroScope* scope); + coro::CoroTask hierarchy(CoroScope* scope); + coro::CoroTask write(CoroScope* scope, std::string output_path, + std::string staging_dir, bool gzip); + coro::CoroTask merge(std::string output_path, std::string staging_dir, + bool gzip, bool keep_staging); - /** - * Load call tree from file (static method) - * @param filename Input file path - * @return Loaded call tree or nullptr on error - */ - static std::unique_ptr load( - const std::string& filename); + int rank() const { return rank_; } + int world_size() const { return world_size_; } - /** - * Get the generated call tree - * @return Reference to the call tree - */ - internal::CallTree& get_call_tree() { return *call_tree_; } - const internal::CallTree& get_call_tree() const { return *call_tree_; } - - /** - * Get MPI rank (delegates to MPIUtils singleton) - */ - int get_rank() const { return mpi::MPIUtils::instance().get_rank(); } - - /** - * Get MPI world size (delegates to MPIUtils singleton) - */ - int get_world_size() const { - return mpi::MPIUtils::instance().get_world_size(); - } - - /** - * Get PIDs assigned to this rank - */ - const std::set& get_assigned_pids() const { + const std::vector& trace_files() const { return trace_files_; } + const std::set& all_pids() const { return all_pids_; } + const std::set& assigned_pids() const { return assigned_pids_; } - - /** - * Print summary statistics - */ - void print_summary() const; + internal::CallTree& local_tree() { return *call_tree_; } + const internal::CallTree& local_tree() const { return *call_tree_; } private: MPICallTreeConfig config_; std::unique_ptr call_tree_; - // File tracking - std::vector trace_files_; - std::map indexers_; + int rank_ = 0; + int world_size_ = 1; - // PID management - std::map pid_index_map_; + std::vector trace_files_; + std::set all_pids_; std::set assigned_pids_; - std::vector all_pids_; - - // State flags - bool 
initialized_ = false; - bool pids_discovered_ = false; - bool graphs_built_ = false; - bool graphs_gathered_ = false; - - // Internal methods - void create_indexer(const std::string& trace_file); - std::set scan_file_for_pids(const std::string& trace_file); - bool read_traces_for_pids(const std::vector& files, - const std::set& pids); - SerializableProcessGraph convert_to_serializable( - const internal::ProcessCallTree& graph) const; - void merge_from_serializable(const SerializableProcessGraph& serializable); + std::vector my_process_keys_; - // Internal MPI helpers - void distribute_pids(); - bool alltoall_graphs(); + std::string my_shard_path_; }; } // namespace dftracer::utils::call_tree diff --git a/include/dftracer/utils/call_tree/mpi/file_header.h b/include/dftracer/utils/call_tree/mpi/file_header.h deleted file mode 100644 index fbf459f1..00000000 --- a/include/dftracer/utils/call_tree/mpi/file_header.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef DFTRACER_UTILS_CALL_TREE_MPI_FILE_HEADER_H -#define DFTRACER_UTILS_CALL_TREE_MPI_FILE_HEADER_H - -/** - * @file file_header.h - * @brief File header structure for persisted call graph files - */ - -#include -#include - -namespace dftracer::utils::call_tree { - -/** - * File header for persisted call graph - */ -struct CallGraphFileHeader { - static constexpr char MAGIC[8] = {'D', 'F', 'T', 'C', 'G', 'R', 'P', 'H'}; - static constexpr std::uint32_t VERSION = 1; - - char magic[8]; - std::uint32_t version; - std::uint32_t num_process_graphs; - std::uint64_t data_offset; - std::uint64_t total_events; - - CallGraphFileHeader() - : version(VERSION), - num_process_graphs(0), - data_offset(0), - total_events(0) { - std::memcpy(magic, MAGIC, sizeof(MAGIC)); - } - - bool is_valid() const { - return std::memcmp(magic, MAGIC, sizeof(MAGIC)) == 0 && - version == VERSION; - } -}; - -} // namespace dftracer::utils::call_tree - -#endif // DFTRACER_UTILS_CALL_TREE_MPI_FILE_HEADER_H diff --git 
a/include/dftracer/utils/call_tree/mpi/filtered_reader.h b/include/dftracer/utils/call_tree/mpi/filtered_reader.h deleted file mode 100644 index 249db978..00000000 --- a/include/dftracer/utils/call_tree/mpi/filtered_reader.h +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef DFTRACER_UTILS_CALL_TREE_MPI_FILTERED_READER_H -#define DFTRACER_UTILS_CALL_TREE_MPI_FILTERED_READER_H - -/** - * @file filtered_reader.h - * @brief Filtered trace reader that only processes events for specific PIDs - */ - -#include - -#include -#include -#include -#include - -namespace dftracer::utils::call_tree { - -/** - * Filtered trace reader that only processes events for specific PIDs - * Uses the indexer to efficiently skip to relevant sections - */ -class MPIFilteredTraceReader { - public: - explicit MPIFilteredTraceReader( - const std::set& allowed_pids); - - /** - * Read trace file and populate call graph - * Only processes events for allowed PIDs - */ - bool read(const std::string& trace_file, internal::CallTree& graph); - - /** - * Read with indexer for efficient access - */ - bool read_with_indexer(const std::string& trace_file, - const std::string& index_file, - internal::CallTree& graph); - - /** - * Read multiple files - */ - bool read_multiple(const std::vector& trace_files, - internal::CallTree& graph); - - /** - * Get count of processed events - */ - std::size_t get_processed_count() const { return processed_count_; } - - /** - * Get count of filtered (skipped) events - */ - std::size_t get_filtered_count() const { return filtered_count_; } - - private: - std::set allowed_pids_; - std::size_t processed_count_ = 0; - std::size_t filtered_count_ = 0; -}; - -} // namespace dftracer::utils::call_tree - -#endif // DFTRACER_UTILS_CALL_TREE_MPI_FILTERED_READER_H diff --git a/include/dftracer/utils/call_tree/mpi/pid_index_info.h b/include/dftracer/utils/call_tree/mpi/pid_index_info.h deleted file mode 100644 index 0539744c..00000000 --- a/include/dftracer/utils/call_tree/mpi/pid_index_info.h 
+++ /dev/null @@ -1,37 +0,0 @@ -#ifndef DFTRACER_UTILS_CALL_TREE_MPI_PID_INDEX_INFO_H -#define DFTRACER_UTILS_CALL_TREE_MPI_PID_INDEX_INFO_H - -/** - * @file pid_index_info.h - * @brief Structure to hold PID index information from gzip indexer - */ - -#include -#include - -namespace dftracer::utils::call_tree { - -/** - * Structure to hold PID index information from gzip indexer - * Maps each PID to its starting line in the trace file - */ -struct PIDIndexInfo { - std::uint32_t pid; - std::uint64_t start_line; - std::uint64_t end_line; - std::uint64_t event_count; - std::string source_file; - - PIDIndexInfo() : pid(0), start_line(0), end_line(0), event_count(0) {} - PIDIndexInfo(std::uint32_t p, std::uint64_t sl, std::uint64_t el, - std::uint64_t ec, const std::string& sf) - : pid(p), - start_line(sl), - end_line(el), - event_count(ec), - source_file(sf) {} -}; - -} // namespace dftracer::utils::call_tree - -#endif // DFTRACER_UTILS_CALL_TREE_MPI_PID_INDEX_INFO_H diff --git a/include/dftracer/utils/call_tree/mpi/serializable.h b/include/dftracer/utils/call_tree/mpi/serializable.h index 8b1ed600..5e0f69a2 100644 --- a/include/dftracer/utils/call_tree/mpi/serializable.h +++ b/include/dftracer/utils/call_tree/mpi/serializable.h @@ -1,53 +1,44 @@ #ifndef DFTRACER_UTILS_CALL_TREE_MPI_SERIALIZABLE_H #define DFTRACER_UTILS_CALL_TREE_MPI_SERIALIZABLE_H -/** - * @file serializable.h - * @brief Serializable structures for MPI transfer of call graph data - */ - -#include +// Two save/load formats for in-memory call trees: +// +// save_binary / load_binary -- compact custom format with a string +// dictionary (name/category/arg keys/string values share storage) and +// typed args (preserves int/uint/double/bool vs flattening to strings). +// Header is fixed-size; body lays out a global string table followed +// by ProcessCallTree records. +// +// save_arrow / load_arrow -- Arrow IPC (.arrow) with zstd buffer-level +// compression. 
Columnar layout with dictionary-encoded name/category; +// readable by pyarrow / polars / nanoarrow. Best for analysis tooling +// that already speaks Arrow. + +#include +#include +#include #include +#include #include -#include -#include namespace dftracer::utils::call_tree { -/** - * Serializable call graph node for MPI transfer - */ -struct SerializableCallNode { - std::uint64_t id; - std::string name; - std::string category; - std::uint64_t start_time; - std::uint64_t duration; - int level; - std::uint64_t parent_id; - std::vector children; - std::unordered_map args; - - // Serialization to bytes - std::vector serialize() const; - static SerializableCallNode deserialize(const char* data, size_t& offset); -}; - -/** - * Serializable process call graph for MPI transfer - */ -struct SerializableProcessGraph { - internal::ProcessKey key; - std::vector nodes; - std::vector root_calls; - std::vector call_sequence; - - // Serialization to bytes - std::vector serialize() const; - static SerializableProcessGraph deserialize(const char* data, - size_t& offset); -}; +inline constexpr char CALLTREE_BINARY_MAGIC[8] = {'D', 'F', 'T', 'C', + 'G', 'R', 'P', '2'}; +inline constexpr std::uint32_t CALLTREE_BINARY_VERSION = 2; + +coro::CoroTask save_binary(CoroScope* scope, + const internal::CallTree& tree, + std::string output_path); +coro::CoroTask> load_binary( + CoroScope* scope, std::string input_path); + +coro::CoroTask save_arrow(CoroScope* scope, + const internal::CallTree& tree, + std::string output_path); +coro::CoroTask> load_arrow( + CoroScope* scope, std::string input_path); } // namespace dftracer::utils::call_tree diff --git a/include/dftracer/utils/call_tree/mpi/serialization.h b/include/dftracer/utils/call_tree/mpi/serialization.h deleted file mode 100644 index b766f9d7..00000000 --- a/include/dftracer/utils/call_tree/mpi/serialization.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef DFTRACER_UTILS_CALL_TREE_MPI_SERIALIZATION_H -#define 
DFTRACER_UTILS_CALL_TREE_MPI_SERIALIZATION_H - -/** - * @file serialization.h - * @brief Utility functions for serialization of MPI data - */ - -#include -#include -#include - -namespace dftracer::utils::call_tree { -namespace serialization { - -// Write primitives -void write_uint32(std::vector& buffer, std::uint32_t value); -void write_uint64(std::vector& buffer, std::uint64_t value); -void write_int(std::vector& buffer, int value); -void write_string(std::vector& buffer, const std::string& str); - -// Read primitives -std::uint32_t read_uint32(const char* data, size_t& offset); -std::uint64_t read_uint64(const char* data, size_t& offset); -int read_int(const char* data, size_t& offset); -std::string read_string(const char* data, size_t& offset); - -} // namespace serialization -} // namespace dftracer::utils::call_tree - -#endif // DFTRACER_UTILS_CALL_TREE_MPI_SERIALIZATION_H diff --git a/include/dftracer/utils/core/common/buffer_pool.h b/include/dftracer/utils/core/common/buffer_pool.h index d265f037..591ea3c1 100644 --- a/include/dftracer/utils/core/common/buffer_pool.h +++ b/include/dftracer/utils/core/common/buffer_pool.h @@ -1,11 +1,11 @@ #ifndef DFTRACER_UTILS_CORE_COMMON_BUFFER_POOL_H #define DFTRACER_UTILS_CORE_COMMON_BUFFER_POOL_H +#include + #include #include -#include #include -#include namespace dftracer::utils { @@ -24,10 +24,6 @@ struct NoOpReset { /** * @brief Thread-safe typed buffer pool. Zero allocations after warmup. * - * Buffers are never dropped. Released buffers are always kept for reuse. - * The init factory is only called when the pool is empty (during warmup - * or under unexpected load). - * * @tparam T Buffer type. Must support move semantics. */ template @@ -38,27 +34,19 @@ class BufferPool { virtual void release(T buf) = 0; }; -/** - * @brief Concrete buffer pool with typed Init and Reset callables. - * - * Init and Reset are stored by value to avoid std::function overhead. 
- */ template class BufferPoolImpl : public BufferPool { public: BufferPoolImpl(std::size_t capacity, Init init, Reset reset = Reset{}) - : init_(std::move(init)), reset_(std::move(reset)) { - pool_.reserve(capacity); + : queue_(capacity), init_(std::move(init)), reset_(std::move(reset)) { for (std::size_t i = 0; i < capacity; ++i) { - pool_.push_back(init_()); + queue_.enqueue(init_()); } } T acquire() override { - std::lock_guard lock(mu_); - if (!pool_.empty()) { - T item = std::move(pool_.back()); - pool_.pop_back(); + T item; + if (queue_.try_dequeue(item)) { return item; } return init_(); @@ -66,13 +54,11 @@ class BufferPoolImpl : public BufferPool { void release(T buf) override { reset_(buf); - std::lock_guard lock(mu_); - pool_.push_back(std::move(buf)); + queue_.enqueue(std::move(buf)); } private: - std::mutex mu_; - std::vector pool_; + moodycamel::ConcurrentQueue queue_; Init init_; Reset reset_; }; diff --git a/include/dftracer/utils/core/common/constants.h b/include/dftracer/utils/core/common/constants.h index 72e27dcf..44233c08 100644 --- a/include/dftracer/utils/core/common/constants.h +++ b/include/dftracer/utils/core/common/constants.h @@ -19,8 +19,6 @@ static constexpr std::size_t INFLATE_BUFFER_SIZE = 262144; // 256KB #endif static constexpr std::uint64_t DEFAULT_CHECKPOINT_SIZE = 32 * 1024 * 1024; // 32MB -static constexpr std::size_t DEFAULT_INDEX_SIZE_THRESHOLD = - 1 * 1024 * 1024; // 1MB extern const char* const& SQL_SCHEMA; inline const char* EXTENSION = ".dftindex"; } // namespace indexer @@ -41,7 +39,6 @@ static constexpr std::size_t FILE_IO_BUFFER_SIZE = #define DFTRACER_UTILS_ZLIB_WINDOW_SIZE 32768 #define DFTRACER_UTILS_ZLIB_GZIP_WINDOW_BITS 31 #define DFTRACER_UTILS_DEFAULT_CHECKPOINT_SIZE (32 * 1024 * 1024) -#define DFTRACER_UTILS_DEFAULT_INDEX_SIZE_THRESHOLD (1 * 1024 * 1024) #define DFTRACER_UTILS_DEFAULT_BUFFER_SIZE 65536 #define DFTRACER_UTILS_SKIP_BUFFER_SIZE 131072 #define DFTRACER_UTILS_FILE_IO_BUFFER_SIZE 262144 diff --git 
a/include/dftracer/utils/core/common/memory_budget.h b/include/dftracer/utils/core/common/memory_budget.h new file mode 100644 index 00000000..beca660a --- /dev/null +++ b/include/dftracer/utils/core/common/memory_budget.h @@ -0,0 +1,32 @@ +#ifndef DFTRACER_UTILS_CORE_COMMON_MEMORY_BUDGET_H +#define DFTRACER_UTILS_CORE_COMMON_MEMORY_BUDGET_H + +#include +#include + +namespace dftracer::utils { + +static constexpr std::size_t DEFAULT_MEMORY_BUDGET_FRACTION_PERCENT = 50; +static constexpr std::size_t MIN_MEMORY_BUDGET_BYTES = 64 * 1024 * 1024; + +static constexpr std::size_t PER_FILE_EXPANSION_FACTOR = 24; +static constexpr std::size_t MIN_PER_FILE_PEAK_BYTES = 64ULL * 1024 * 1024; +static constexpr std::size_t MAX_PER_FILE_PEAK_BYTES = + 16ULL * 1024 * 1024 * 1024; +static constexpr std::size_t PER_FILE_SAMPLE_LIMIT = 1024; + +std::size_t detect_available_memory(); +std::size_t compute_memory_budget(std::size_t user_override_bytes = 0); +std::size_t compute_channel_capacity(std::size_t memory_budget_bytes, + std::size_t estimated_batch_bytes, + std::size_t num_workers); +std::size_t compute_file_batch_size(std::size_t memory_budget_bytes, + std::size_t estimated_file_bytes, + std::size_t min_files = 4); + +std::size_t estimate_per_file_bytes(const std::vector& file_sizes, + std::size_t user_override_bytes = 0); + +} // namespace dftracer::utils + +#endif // DFTRACER_UTILS_CORE_COMMON_MEMORY_BUDGET_H diff --git a/include/dftracer/utils/core/common/object_pool.h b/include/dftracer/utils/core/common/object_pool.h index 8ceefa0e..d0de1157 100644 --- a/include/dftracer/utils/core/common/object_pool.h +++ b/include/dftracer/utils/core/common/object_pool.h @@ -7,28 +7,11 @@ #include #include #include -#include #include -#include namespace dftracer::utils { -/** - * @brief Lock-free LIFO stack (Treiber stack) with ABA-safe tagged pointers. 
- * - * Intrusive: the `next` pointer is stored in the first 8 bytes of the - * block itself (valid since blocks are at least sizeof(void*) bytes). - * - * ABA protection: x86-64 uses 48-bit virtual addresses. A 16-bit - * generation counter is packed into the upper bits of a 64-bit atomic. - * - * Reference: Treiber, R.K. (1986) "Systems Programming: Coping with - * Parallelism", IBM Technical Report. - */ class TreiberStack { - // Intrusive next-pointer stored in the first sizeof(void*) bytes of - // freed blocks. Accessed via atomic_ref so TSAN can track the - // happens-before relationship through the CAS on head_. static void store_next(void* block, void* next) noexcept { std::atomic_ref(*reinterpret_cast(block)) .store(next, std::memory_order_release); @@ -81,7 +64,6 @@ class TreiberStack { static void* unpack_ptr(std::uint64_t packed) noexcept { auto raw = static_cast(packed & PTR_MASK); - // Sign-extend bit 47 for canonical x86-64 addresses if (raw & (1ULL << 47)) { raw |= ~PTR_MASK; } @@ -93,18 +75,6 @@ class TreiberStack { } }; -/** - * @brief Thread-safe, lock-free object pool with size-bucketed freelists. - * - * Uses TreiberStack (LIFO) per size class. After warmup, allocations are - * zero-malloc: freed blocks are recycled immediately. 
- * - * Usage: - * @code - * void* p = ObjectPool::instance().allocate(256); - * ObjectPool::instance().deallocate(p, 256); - * @endcode - */ class ObjectPool { public: static ObjectPool& instance() { @@ -113,15 +83,20 @@ class ObjectPool { } void* allocate(std::size_t size) { - auto& stack = get_stack(size); - void* block = stack.pop(); + auto* stack = get_stack(size); + if (!stack) return ::operator new(size); + void* block = stack->pop(); if (block) return block; return ::operator new(size); } void deallocate(void* block, std::size_t size) { - auto& stack = get_stack(size); - stack.push(block); + auto* stack = get_stack(size); + if (!stack) { + ::operator delete(block); + return; + } + stack->push(block); } ObjectPool(const ObjectPool&) = delete; @@ -131,16 +106,16 @@ class ObjectPool { ObjectPool() = default; ~ObjectPool() { - // Drain all fast buckets for (auto& stack : fast_buckets_) { while (void* block = stack.pop()) { ::operator delete(block); } } - // Drain all slow buckets - for (auto& [_, stack] : slow_buckets_) { - while (void* block = stack.pop()) { - ::operator delete(block); + for (auto& slot : slow_table_) { + if (slot.bucket.load(std::memory_order_relaxed) != 0) { + while (void* block = slot.stack.pop()) { + ::operator delete(block); + } } } } @@ -149,18 +124,45 @@ class ObjectPool { static constexpr std::size_t MAX_FAST_SIZE = 4096; static constexpr std::size_t NUM_FAST_BUCKETS = MAX_FAST_SIZE / ALIGNMENT; + static constexpr std::size_t SLOW_TABLE_SIZE = 256; + static constexpr std::size_t SLOW_TABLE_MASK = SLOW_TABLE_SIZE - 1; + std::array fast_buckets_; - std::mutex slow_mutex_; - std::unordered_map slow_buckets_; + struct SlowSlot { + std::atomic bucket{0}; + TreiberStack stack; + }; + std::array slow_table_; - TreiberStack& get_stack(std::size_t size) { + TreiberStack* get_stack(std::size_t size) { std::size_t bucket = (size + ALIGNMENT - 1) / ALIGNMENT; if (bucket > 0 && bucket <= NUM_FAST_BUCKETS) { - return fast_buckets_[bucket - 1]; + 
return &fast_buckets_[bucket - 1]; + } + return find_slow_stack(bucket); + } + + TreiberStack* find_slow_stack(std::size_t bucket) { + auto h = bucket; + for (std::size_t i = 0; i < SLOW_TABLE_SIZE; ++i) { + auto idx = (h + i) & SLOW_TABLE_MASK; + auto& slot = slow_table_[idx]; + auto existing = slot.bucket.load(std::memory_order_acquire); + if (existing == bucket) return &slot.stack; + if (existing == 0) { + std::size_t expected = 0; + if (slot.bucket.compare_exchange_strong( + expected, bucket, std::memory_order_release, + std::memory_order_acquire)) { + return &slot.stack; + } + if (slot.bucket.load(std::memory_order_acquire) == bucket) { + return &slot.stack; + } + } } - std::lock_guard lock(slow_mutex_); - return slow_buckets_[bucket]; + return nullptr; } }; diff --git a/include/dftracer/utils/core/common/string_intern.h b/include/dftracer/utils/core/common/string_intern.h index 5cbfa740..976c2edc 100644 --- a/include/dftracer/utils/core/common/string_intern.h +++ b/include/dftracer/utils/core/common/string_intern.h @@ -1,98 +1,241 @@ #ifndef DFTRACER_UTILS_CORE_COMMON_STRING_INTERN_H #define DFTRACER_UTILS_CORE_COMMON_STRING_INTERN_H -#include - +#include #include +#include +#include #include -#include #include #include -#include -#include namespace dftracer::utils { -/** - * @brief Thread-safe string interning table. - * - * Stores each unique string once and returns an integer ID. - * Lookups by string_view avoid allocation on cache hit. - * IDs are stable for the lifetime of the table. 
- * - * Usage: - * @code - * StringIntern intern; - * uint32_t id = intern.get_or_insert("POSIX"); // first call: stores string - * uint32_t id2 = intern.get_or_insert("POSIX"); // cache hit: no alloc - * assert(id == id2); - * assert(intern.resolve(id) == "POSIX"); - * @endcode - */ class StringIntern { public: - StringIntern() = default; + static constexpr std::size_t FAST_CAPACITY = 1u << 20; + + StringIntern() + : buckets_(std::make_unique[]>(BUCKET_COUNT)), + fast_(std::make_unique[]>( + FAST_CAPACITY)) { + for (std::size_t i = 0; i < BUCKET_COUNT; ++i) { + buckets_[i].store(nullptr, std::memory_order_relaxed); + } + } + + ~StringIntern() { + for (std::size_t i = 0; i < BUCKET_COUNT; ++i) { + auto* node = buckets_[i].load(std::memory_order_relaxed); + while (node) { + auto* next = node->next.load(std::memory_order_relaxed); + delete node; + node = next; + } + } + } - // Non-copyable, non-movable (shared_mutex is not movable) StringIntern(const StringIntern&) = delete; StringIntern& operator=(const StringIntern&) = delete; StringIntern(StringIntern&&) = delete; StringIntern& operator=(StringIntern&&) = delete; - /** - * @brief Intern a string. Returns its unique ID. - * Thread-safe. Uses shared_mutex: concurrent reads, exclusive writes. - * Lookups use string_view (no allocation on cache hit). 
- */ std::uint32_t get_or_insert(std::string_view sv) { - // Fast path: read lock, check if already interned - { - std::shared_lock lock(mutex_); - auto it = str_to_id_.find(sv); - if (it != str_to_id_.end()) return it->second; + const auto h = hash(sv); + const auto bucket = h & BUCKET_MASK; + + // Lock-free lookup + auto* node = buckets_[bucket].load(std::memory_order_acquire); + while (node) { + if (node->hash == h && node->str == sv) { + return node->id; + } + node = node->next.load(std::memory_order_acquire); + } + + // Rare: new string; take mutex + std::lock_guard lock(insert_mutex_); + + // Re-check under lock (another thread may have inserted) + node = buckets_[bucket].load(std::memory_order_acquire); + while (node) { + if (node->hash == h && node->str == sv) { + return node->id; + } + node = node->next.load(std::memory_order_acquire); + } + + std::uint32_t id; + if (deterministic_ids_.load(std::memory_order_acquire)) { + // Deterministic-hash id so the same string maps to the same id + // across processes. Mask off top bit + clamp into FAST_CAPACITY + // so `resolve()` hits the fast path. Collisions (different + // strings -> same id) are bucket-chained on insert but + // `resolve(id)` returns the first-inserted string for that id. + id = static_cast(h & (FAST_CAPACITY - 1)); + } else { + id = static_cast( + num_strings_.load(std::memory_order_relaxed)); + } + auto* new_node = new Node{std::string(sv), h, id, {}}; + new_node->next.store(buckets_[bucket].load(std::memory_order_relaxed), + std::memory_order_relaxed); + + if (id < FAST_CAPACITY) { + // Respect "first-inserted wins" when a collision maps two + // different strings to the same deterministic id: only set + // fast_[id] if the slot is still empty. 
+ const std::string* expected = nullptr; + fast_[id].compare_exchange_strong(expected, &new_node->str, + std::memory_order_release, + std::memory_order_relaxed); } - // Slow path: write lock, insert new string - std::unique_lock lock(mutex_); - // Double-check after acquiring write lock - auto [it, inserted] = str_to_id_.try_emplace( - std::string(sv), static_cast(id_to_str_.size())); - if (inserted) { - id_to_str_.push_back(it->first); + + // Publish to bucket; all prior stores (node fields, fast_[]) + // are visible to readers via this release. + buckets_[bucket].store(new_node, std::memory_order_release); + + if (!deterministic_ids_.load(std::memory_order_acquire)) { + // Sequential id path: advance counter past the id we just + // handed out so size() stays monotonic. + num_strings_.store(static_cast(id) + 1, + std::memory_order_release); + } else { + // Deterministic id path: `size()` is a weak estimate of + // distinct strings; bump if this id is the highest seen. + std::size_t cur = num_strings_.load(std::memory_order_relaxed); + const std::size_t need = static_cast(id) + 1; + while (cur < need && !num_strings_.compare_exchange_weak( + cur, need, std::memory_order_release, + std::memory_order_relaxed)) { + } + } + + return id; + } + + /// Insert or look up a string at a specific id (for loading a persisted + /// dictionary where ids must be preserved). If the id already holds a + /// different string, the existing binding wins (caller error -> ignored). + /// Safe to call concurrently with other inserts; must be called before + /// any `resolve(id)` at that id. + void insert_at_id(std::uint32_t id, std::string_view sv) { + const auto h = hash(sv); + const auto bucket = h & BUCKET_MASK; + + std::lock_guard lock(insert_mutex_); + + // If the id already has a string in fast_, nothing to do. 
+ if (id < FAST_CAPACITY) { + if (fast_[id].load(std::memory_order_acquire) != nullptr) { + return; + } + } + + // Also avoid inserting a second node for the same string (would leave + // the older node referenced by the bucket chain pointing at a stale + // id, confusing get_or_insert which returns the first match). + auto* node = buckets_[bucket].load(std::memory_order_acquire); + while (node) { + if (node->hash == h && node->str == sv) { + // String already interned under a different id; point fast_[id] + // at it so resolve(id) returns something valid. + if (id < FAST_CAPACITY) { + fast_[id].store(&node->str, std::memory_order_release); + } + if (static_cast(id) + 1 > + num_strings_.load(std::memory_order_relaxed)) { + num_strings_.store(static_cast(id) + 1, + std::memory_order_release); + } + return; + } + node = node->next.load(std::memory_order_acquire); + } + + auto* new_node = new Node{std::string(sv), h, id, {}}; + new_node->next.store(buckets_[bucket].load(std::memory_order_relaxed), + std::memory_order_relaxed); + + if (id < FAST_CAPACITY) { + fast_[id].store(&new_node->str, std::memory_order_release); + } + + buckets_[bucket].store(new_node, std::memory_order_release); + + // Advance num_strings_ past the highest id ever inserted so future + // get_or_insert calls don't collide with a loaded id. + std::size_t cur = num_strings_.load(std::memory_order_relaxed); + const std::size_t need = static_cast(id) + 1; + while (cur < need && !num_strings_.compare_exchange_weak( + cur, need, std::memory_order_release, + std::memory_order_relaxed)) { } - return it->second; } - /** - * @brief Resolve an ID back to its string. Thread-safe. - */ std::string_view resolve(std::uint32_t id) const { - std::shared_lock lock(mutex_); - return id_to_str_[id]; + if (id >= FAST_CAPACITY) return {}; + auto* p = fast_[id].load(std::memory_order_acquire); + return p ? std::string_view(*p) : std::string_view{}; } - /** - * @brief Intern a string and return a stable string_view. 
- * Convenience wrapper: inserts if new, then resolves to string_view. - */ std::string_view intern(std::string_view sv) { return resolve(get_or_insert(sv)); } - /** - * @brief Number of unique strings interned. - */ std::size_t size() const { - std::shared_lock lock(mutex_); - return id_to_str_.size(); + return num_strings_.load(std::memory_order_acquire); + } + + /// Shift the next-to-assign id counter to `base`. Subsequent + /// `get_or_insert` calls allocate ids starting at `base`. + /// Must be called before any `get_or_insert` on this instance. + /// Lock-free: caller ensures no concurrent inserts. + void reserve_id_base(std::uint32_t base) noexcept { + num_strings_.store(base, std::memory_order_release); + } + + /// Enable deterministic-hash id assignment. When set, `get_or_insert` + /// returns a stable id derived from the string's content rather than a + /// sequential counter. Same string -> same id in every process, + /// regardless of insertion order. Intended for multi-process workflows + /// (e.g. MPI ranks) where keys that include string ids must be + /// identical across ranks so RocksDB merge operators can combine + /// operands for the same logical key. + /// + /// Collision handling: the 32-bit id is `hash(str) & 0x7FFFFFFF` to + /// stay within FAST_CAPACITY-reachable range on lookup when + /// `id < FAST_CAPACITY`. Different strings with the same id are + /// chained in the bucket and lookup resolves by string equality, but + /// `resolve(id)` can only return one of them. For the typical + /// dftracer workload (cat/name/hhash/fhash dictionaries with O(1000) + /// entries) birthday collisions are negligible. + /// + /// Must be called before any `get_or_insert`. + void enable_deterministic_ids() noexcept { + deterministic_ids_.store(true, std::memory_order_release); } private: - mutable std::shared_mutex mutex_; - std::unordered_map - str_to_id_; - // Stores references to the map's owned strings for O(1) resolve. 
- std::vector id_to_str_; + static constexpr std::size_t BUCKET_COUNT = 1u << 12; // 4096 + static constexpr std::size_t BUCKET_MASK = BUCKET_COUNT - 1; + + struct Node { + const std::string str; + const std::size_t hash; + const std::uint32_t id; + std::atomic next; + }; + + static std::size_t hash(std::string_view sv) { + return std::hash{}(sv); + } + + std::unique_ptr[]> buckets_; + std::unique_ptr[]> fast_; + std::atomic num_strings_{0}; + std::atomic deterministic_ids_{false}; + std::mutex insert_mutex_; }; } // namespace dftracer::utils diff --git a/include/dftracer/utils/core/common/transparent_string_hash.h b/include/dftracer/utils/core/common/transparent_string_hash.h index eef6c77c..d45eab62 100644 --- a/include/dftracer/utils/core/common/transparent_string_hash.h +++ b/include/dftracer/utils/core/common/transparent_string_hash.h @@ -1,27 +1,25 @@ #ifndef DFTRACER_UTILS_CORE_COMMON_TRANSPARENT_STRING_HASH_H #define DFTRACER_UTILS_CORE_COMMON_TRANSPARENT_STRING_HASH_H +#include + #include -#include +#include #include namespace dftracer::utils { -/** - * @brief Transparent hash for std::unordered_map that - * accepts std::string_view lookups without constructing std::string. 
- * - * Usage: - * @code - * std::unordered_map map; - * map[some_string_view]; // no std::string construction for lookup - * @endcode - */ struct TransparentStringHash { using is_transparent = void; + using is_avalanching = void; std::size_t operator()(std::string_view sv) const noexcept { - return std::hash{}(sv); + return ankerl::unordered_dense::hash{}(sv); + } + std::size_t operator()(const std::string& s) const noexcept { + return ankerl::unordered_dense::hash{}(s); + } + std::size_t operator()(const char* s) const noexcept { + return ankerl::unordered_dense::hash{}(s); } }; @@ -32,6 +30,15 @@ struct TransparentStringEqual { } }; +template +using StringViewMap = + ankerl::unordered_dense::map; + +using StringViewSet = + ankerl::unordered_dense::set; + } // namespace dftracer::utils #endif // DFTRACER_UTILS_CORE_COMMON_TRANSPARENT_STRING_HASH_H diff --git a/include/dftracer/utils/core/coro/channel.h b/include/dftracer/utils/core/coro/channel.h index 8eeb52a6..5f541891 100644 --- a/include/dftracer/utils/core/coro/channel.h +++ b/include/dftracer/utils/core/coro/channel.h @@ -23,6 +23,9 @@ namespace dftracer::utils::coro { template class ChannelProducer; +template +class ChannelConsumer; + /** * Channel - Producer-consumer queue for streaming data * @@ -850,8 +853,24 @@ class Channel : public std::enable_shared_from_this> { return ChannelProducer(this); } + /** + * Get a receive-side handle for capturing into a coroutine lambda: + * + * [ch = channel->consumer()](...) -> CoroTask<...> { + * while (auto item = co_await ch.receive()) { ... 
} + * } + */ + ChannelConsumer consumer() { + auto sp = this->weak_from_this().lock(); + if (sp) { + return ChannelConsumer(std::move(sp)); + } + return ChannelConsumer(this); + } + private: friend class ChannelProducer; + friend class ChannelConsumer; ProducerGuard adopt_producer() { return ProducerGuard(this, typename ProducerGuard::Adopt{}); @@ -977,6 +996,35 @@ class Channel : public std::enable_shared_from_this> { ReceiveAwaitable receive() { return ReceiveAwaitable(this); } + std::optional blocking_receive() { + T item; + if (queue_.try_dequeue(item)) { + mark_item_consumed(); + if (!wake_one_send_waiter_after_receive()) { + cv_writable_.notify_one(); + } + maybe_notify_terminal(); + return std::optional(std::move(item)); + } + + std::unique_lock lock(state_mutex_); + while (true) { + if (try_receive_locked(item)) { + release_slot_if_bounded_locked(); + lock.unlock(); + if (!wake_one_send_waiter_after_receive()) { + cv_writable_.notify_one(); + } + maybe_notify_terminal(); + return std::optional(std::move(item)); + } + if (is_terminal_locked()) { + return std::nullopt; + } + cv_readable_.wait(lock); + } + } + SendAwaitable send(const T& item) { return SendAwaitable(this, item); } SendAwaitable send(T&& item) { @@ -1132,6 +1180,69 @@ class ChannelProducer { auto send(T&& item) { return raw_->send(std::move(item)); } }; +/** + * ChannelConsumer - Receive-side handle for Channel + * + * Holds a raw pointer for operations and optionally a shared_ptr to keep + * the channel alive when created from a shared_ptr channel. 
+ * + * Usage: + * @code + * [ch = channel->consumer()](CoroScope& ctx) + * -> CoroTask { + * while (auto item = co_await ch.receive()) { + * process(*item); + * } + * } + * @endcode + */ +template +class ChannelConsumer { + Channel* raw_{nullptr}; + std::shared_ptr> shared_; + + public: + explicit ChannelConsumer(Channel* ch) : raw_(ch) {} + + explicit ChannelConsumer(std::shared_ptr> ch) + : raw_(ch.get()), shared_(std::move(ch)) {} + + ~ChannelConsumer() = default; + + ChannelConsumer(ChannelConsumer&& other) noexcept + : raw_(other.raw_), shared_(std::move(other.shared_)) { + other.raw_ = nullptr; + } + + ChannelConsumer& operator=(ChannelConsumer&& other) noexcept { + if (this != &other) { + raw_ = other.raw_; + shared_ = std::move(other.shared_); + other.raw_ = nullptr; + } + return *this; + } + + ChannelConsumer(const ChannelConsumer& other) + : raw_(other.raw_), shared_(other.shared_) {} + + ChannelConsumer& operator=(const ChannelConsumer& other) { + if (this != &other) { + raw_ = other.raw_; + shared_ = other.shared_; + } + return *this; + } + + auto receive() const { return raw_->receive(); } + + std::optional blocking_receive() const { + return raw_->blocking_receive(); + } + + bool is_closed() const { return raw_->is_closed(); } +}; + /** * Helper to create shared_ptr Channel */ diff --git a/include/dftracer/utils/core/pipeline/executor.h b/include/dftracer/utils/core/pipeline/executor.h index f2daf5e2..38eb568e 100644 --- a/include/dftracer/utils/core/pipeline/executor.h +++ b/include/dftracer/utils/core/pipeline/executor.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -25,10 +24,6 @@ #include #include -namespace dftracer::utils::io { -class IoThreadPool; -} // namespace dftracer::utils::io - namespace dftracer::utils { class Task; @@ -36,13 +31,12 @@ class CoroScope; class Scheduler; struct ExecutorConfig { - std::size_t num_threads = 0; // 0 = hardware_concurrency + std::size_t num_threads = 0; // 0 = 
hardware_concurrency std::chrono::seconds idle_timeout{5}; std::chrono::seconds deadlock_timeout{10}; - std::size_t io_pool_size = 4; + std::size_t io_pool_size = 0; // 0 = hardware_concurrency io::IoBackendType io_backend_type = io::IoBackendType::AUTO; unsigned io_batch_threshold = 16; - std::size_t db_pool_size = 2; }; /** @@ -154,11 +148,6 @@ class Executor { // Aligned to avoid false sharing between adjacent workers. struct alignas(DFTRACER_OPTIMAL_ALIGNMENT) WorkerContext { std::size_t worker_id; - // queue_mutex + cv: used for worker sleep/wake protocol. - // Workers sleep on cv; wake_one_worker/wake_all_workers - // lock+unlock this mutex before notifying to prevent lost wakeups. - mutable std::mutex queue_mutex; - std::condition_variable cv; // Health monitoring for watchdog std::atomic is_idle{false}; @@ -168,7 +157,7 @@ class Executor { // Current task info (for debugging/watchdog) std::atomic current_task_id{-1}; std::string current_task_name; - std::mutex task_name_mutex; // Protects current_task_name + std::mutex task_name_mutex; // Worker thread std::thread thread; @@ -197,8 +186,7 @@ class Executor { alignas(DFTRACER_OPTIMAL_ALIGNMENT) std::atomic total_tasks_submitted_{0}; - std::chrono::steady_clock::time_point last_activity_time_; - mutable std::mutex activity_mutex_; + std::atomic last_activity_ns_; // Shutdown coordination std::atomic shutdown_requested_{false}; @@ -229,14 +217,10 @@ class Executor { // I/O backend (owned by executor, created by factory) std::unique_ptr io_backend_; - // Dedicated thread pool for blocking DB operations. 
- std::unique_ptr db_pool_; - // Configuration (stored from ExecutorConfig) - std::size_t io_pool_size_ = 4; + std::size_t io_pool_size_ = 0; io::IoBackendType io_backend_type_ = io::IoBackendType::AUTO; unsigned io_batch_threshold_ = 16; - std::size_t db_pool_size_ = 2; public: /** @@ -294,6 +278,8 @@ class Executor { */ std::size_t get_num_threads() const { return num_threads_; } + std::size_t get_io_pool_size() const { return io_pool_size_; } + /** * Check if an I/O backend is available */ @@ -305,11 +291,6 @@ class Executor { io::IoBackend& io_backend() { return *io_backend_; } const io::IoBackend& io_backend() const { return *io_backend_; } - /** - * Get the dedicated DB thread pool (nullptr if not started). - */ - io::IoThreadPool* db_pool() noexcept; - /** * Get the executor running on the current worker thread (nullptr * if the calling thread is not a worker). Thread-local. diff --git a/include/dftracer/utils/core/pipeline/pipeline_config.h b/include/dftracer/utils/core/pipeline/pipeline_config.h index 5f727620..76ae3708 100644 --- a/include/dftracer/utils/core/pipeline/pipeline_config.h +++ b/include/dftracer/utils/core/pipeline/pipeline_config.h @@ -69,11 +69,10 @@ struct PipelineConfig { 600}; // Executor deadlock timeout (10 minutes) std::chrono::microseconds timeslice_duration{ 10'000}; // Coroutine yield timeslice (10ms, 0 = disabled) - std::size_t io_thread_count = 4; // I/O thread pool size + std::size_t io_thread_count = 0; // 0 = hardware_concurrency io::IoBackendType io_backend_type = io::IoBackendType::AUTO; // Backend selection unsigned io_batch_threshold = 16; // SQE batch threshold (0 = per-op) - std::size_t db_pool_size = 2; // Blocking DB async thread pool size /** * Set pipeline name @@ -200,14 +199,6 @@ struct PipelineConfig { return *this; } - /** - * Set blocking DB async thread pool size (default 2) - */ - PipelineConfig& with_db_pool_size(std::size_t size) { - db_pool_size = size; - return *this; - } - /** * Create sequential 
execution configuration (1 thread) */ diff --git a/include/dftracer/utils/core/rocksdb/async.h b/include/dftracer/utils/core/rocksdb/async.h deleted file mode 100644 index 3ff71a07..00000000 --- a/include/dftracer/utils/core/rocksdb/async.h +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef DFTRACER_UTILS_CORE_ROCKSDB_ASYNC_H -#define DFTRACER_UTILS_CORE_ROCKSDB_ASYNC_H - -#include -#include -#include -#include -#include - -namespace dftracer::utils::io { -class IoThreadPool; -} // namespace dftracer::utils::io - -namespace dftracer::utils::rocksdb { - -io::IoThreadPool* get_db_pool(); -void db_async_submit(io::IoThreadPool* pool, std::function fn); -void db_async_resume_on(void* executor, std::coroutine_handle<> h); -void* get_current_executor_opaque(); - -template -class DbAwaitable { - io::IoThreadPool* pool_; - void* executor_; - std::function fn_; - std::optional result_; - std::exception_ptr error_; - std::coroutine_handle<> handle_; - - public: - DbAwaitable(io::IoThreadPool* pool, void* executor, std::function fn) - : pool_(pool), executor_(executor), fn_(std::move(fn)) {} - - bool await_ready() noexcept { - if (pool_ == nullptr) { - try { - auto fn = std::move(fn_); - fn_ = {}; - result_.emplace(fn()); - } catch (...) { - error_ = std::current_exception(); - } - return true; - } - return false; - } - - void await_suspend(std::coroutine_handle<> h) { - handle_ = h; - auto* self = this; - db_async_submit(pool_, [self] { - try { - auto fn = std::move(self->fn_); - self->fn_ = {}; - self->result_.emplace(fn()); - } catch (...) 
{ - self->error_ = std::current_exception(); - } - db_async_resume_on(self->executor_, self->handle_); - }); - } - - T await_resume() { - if (error_ != nullptr) { - std::rethrow_exception(error_); - } - return std::move(*result_); - } -}; - -template <> -class DbAwaitable { - io::IoThreadPool* pool_; - void* executor_; - std::function fn_; - std::exception_ptr error_; - std::coroutine_handle<> handle_; - - public: - DbAwaitable(io::IoThreadPool* pool, void* executor, - std::function fn) - : pool_(pool), executor_(executor), fn_(std::move(fn)) {} - - bool await_ready() noexcept { - if (pool_ == nullptr) { - try { - auto fn = std::move(fn_); - fn_ = {}; - fn(); - } catch (...) { - error_ = std::current_exception(); - } - return true; - } - return false; - } - - void await_suspend(std::coroutine_handle<> h) { - handle_ = h; - auto* self = this; - db_async_submit(pool_, [self] { - try { - auto fn = std::move(self->fn_); - self->fn_ = {}; - fn(); - } catch (...) { - self->error_ = std::current_exception(); - } - db_async_resume_on(self->executor_, self->handle_); - }); - } - - void await_resume() { - if (error_ != nullptr) { - std::rethrow_exception(error_); - } - } -}; - -template -auto run(F&& fn) -> DbAwaitable { - using R = decltype(fn()); - auto* pool = get_db_pool(); - auto* executor = get_current_executor_opaque(); - return DbAwaitable(pool, executor, std::forward(fn)); -} - -} // namespace dftracer::utils::rocksdb - -#endif // DFTRACER_UTILS_CORE_ROCKSDB_ASYNC_H diff --git a/include/dftracer/utils/core/rocksdb/column_families.h b/include/dftracer/utils/core/rocksdb/column_families.h new file mode 100644 index 00000000..c0b9c0ab --- /dev/null +++ b/include/dftracer/utils/core/rocksdb/column_families.h @@ -0,0 +1,65 @@ +#ifndef DFTRACER_UTILS_CORE_ROCKSDB_COLUMN_FAMILIES_H +#define DFTRACER_UTILS_CORE_ROCKSDB_COLUMN_FAMILIES_H + +#include +#include + +namespace dftracer::utils::rocksdb::cf { + +inline constexpr std::string_view DEFAULT = "default"; +inline 
constexpr std::string_view CHECKPOINTS = "checkpoints"; +inline constexpr std::string_view METADATA = "metadata"; +inline constexpr std::string_view CHUNK_BLOOM = "chunk_bloom"; +inline constexpr std::string_view FILE_BLOOM = "file_bloom"; +inline constexpr std::string_view CHUNK_STATS = "chunk_stats"; +inline constexpr std::string_view DIMENSIONS = "dimensions"; +inline constexpr std::string_view CHUNK_DIM_STATS = "chunk_dim_stats"; +inline constexpr std::string_view FILE_SCALAR_STATS = "file_scalar_stats"; +inline constexpr std::string_view FILE_CAT_COUNTS = "file_cat_counts"; +inline constexpr std::string_view FILE_NAME_COUNTS = "file_name_counts"; +inline constexpr std::string_view FILE_PID_TID_COUNTS = "file_pid_tid_counts"; +inline constexpr std::string_view ROOT_SCALAR_STATS = "root_scalar_stats"; +inline constexpr std::string_view ROOT_CAT_COUNTS = "root_cat_counts"; +inline constexpr std::string_view ROOT_NAME_COUNTS = "root_name_counts"; +inline constexpr std::string_view ROOT_PID_TID_COUNTS = "root_pid_tid_counts"; +inline constexpr std::string_view NAME_DICTIONARY = "name_dictionary"; +inline constexpr std::string_view NAME_FILE_POSTINGS = "name_file_postings"; +inline constexpr std::string_view NAME_CHUNK_POSTINGS = "name_chunk_postings"; +inline constexpr std::string_view MANIFEST = "manifest"; +inline constexpr std::string_view PROVENANCE = "provenance"; +inline constexpr std::string_view ARCHIVES = "archives"; +inline constexpr std::string_view TAR_FILES = "tar_files"; +inline constexpr std::string_view AGGREGATION = "aggregation"; +inline constexpr std::string_view SYSTEM_METRICS = "system_metrics"; +inline constexpr std::string_view HASH_TABLES = "hash_tables"; +inline constexpr auto ALL = + std::to_array({DEFAULT, + CHECKPOINTS, + METADATA, + CHUNK_BLOOM, + FILE_BLOOM, + CHUNK_STATS, + DIMENSIONS, + CHUNK_DIM_STATS, + FILE_SCALAR_STATS, + FILE_CAT_COUNTS, + FILE_NAME_COUNTS, + FILE_PID_TID_COUNTS, + ROOT_SCALAR_STATS, + ROOT_CAT_COUNTS, + 
ROOT_NAME_COUNTS, + ROOT_PID_TID_COUNTS, + NAME_DICTIONARY, + NAME_FILE_POSTINGS, + NAME_CHUNK_POSTINGS, + MANIFEST, + PROVENANCE, + ARCHIVES, + TAR_FILES, + AGGREGATION, + SYSTEM_METRICS, + HASH_TABLES}); + +} // namespace dftracer::utils::rocksdb::cf + +#endif // DFTRACER_UTILS_CORE_ROCKSDB_COLUMN_FAMILIES_H diff --git a/include/dftracer/utils/core/rocksdb/database.h b/include/dftracer/utils/core/rocksdb/database.h index e4d70216..ed004959 100644 --- a/include/dftracer/utils/core/rocksdb/database.h +++ b/include/dftracer/utils/core/rocksdb/database.h @@ -1,12 +1,15 @@ #ifndef DFTRACER_UTILS_CORE_ROCKSDB_DATABASE_H #define DFTRACER_UTILS_CORE_ROCKSDB_DATABASE_H +#include #include #include #include +#include #include #include +#include #include #include #include @@ -43,24 +46,47 @@ class RocksDatabase { ::rocksdb::DB* get() const noexcept; ::rocksdb::Status put(std::string_view key, std::string_view value, - std::string_view column_family = "default"); + std::string_view column_family = cf::DEFAULT); ::rocksdb::Status get(std::string_view key, std::string* value, - std::string_view column_family = "default") const; + std::string_view column_family = cf::DEFAULT) const; ::rocksdb::Status del(std::string_view key, - std::string_view column_family = "default"); + std::string_view column_family = cf::DEFAULT); + ::rocksdb::Status delete_range( + std::string_view begin_key, std::string_view end_key, + std::string_view column_family = cf::DEFAULT); ::rocksdb::Status put(Batch& batch, std::string_view column_family, std::string_view key, std::string_view value); ::rocksdb::Status del(Batch& batch, std::string_view column_family, std::string_view key); + ::rocksdb::Status merge(std::string_view key, std::string_view value, + std::string_view column_family = cf::DEFAULT); + ::rocksdb::Status merge(Batch& batch, std::string_view column_family, + std::string_view key, std::string_view value); + Batch begin_batch() const; ::rocksdb::Status commit_batch(Batch& batch); 
std::unique_ptr<::rocksdb::Iterator> new_iterator( - std::string_view column_family = "default") const; + std::string_view column_family = cf::DEFAULT) const; + + ::rocksdb::Status compact(std::string_view column_family = cf::DEFAULT); + + /// Bulk-ingest externally built SST files into the named column family. + /// Keys across the SSTs must be sorted and non-overlapping unless the + /// caller requests `ingest_behind`, which pushes entries to the bottom + /// level and silently drops duplicate keys (for content-addressed CFs). + ::rocksdb::Status ingest_external_files( + std::string_view column_family, + const std::vector& external_files, + bool ingest_behind = false); + + using CfOptionsOverride = std::function; + void set_cf_options_override(CfOptionsOverride override); - static std::vector default_column_families(); + static const decltype(cf::ALL)& default_column_families(); static ::rocksdb::Options default_options(); static ::rocksdb::ColumnFamilyOptions default_column_family_options(); @@ -75,6 +101,7 @@ class RocksDatabase { ::rocksdb::DB* db_ = nullptr; std::unordered_map column_families_; + CfOptionsOverride cf_options_override_; }; } // namespace dftracer::utils::rocksdb diff --git a/include/dftracer/utils/core/rocksdb/db_manager.h b/include/dftracer/utils/core/rocksdb/db_manager.h index c45eec95..9b0a6abf 100644 --- a/include/dftracer/utils/core/rocksdb/db_manager.h +++ b/include/dftracer/utils/core/rocksdb/db_manager.h @@ -22,7 +22,8 @@ class RocksDBManager { std::shared_ptr get_or_open( const std::string& db_path, - RocksDatabase::OpenMode open_mode = RocksDatabase::OpenMode::ReadWrite); + RocksDatabase::OpenMode open_mode = RocksDatabase::OpenMode::ReadWrite, + RocksDatabase::CfOptionsOverride cf_override = nullptr); void reset(const std::string& db_path); void shutdown(); diff --git a/include/dftracer/utils/core/runtime.h b/include/dftracer/utils/core/runtime.h index 0e617a93..790a1974 100644 --- a/include/dftracer/utils/core/runtime.h +++ 
b/include/dftracer/utils/core/runtime.h @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include @@ -19,6 +21,23 @@ namespace dftracer::utils { +namespace detail { + +template +coro::CoroTask run_scoped_utility(CoroScope& scope, UtilityT* utility, + InputT input) { + utility->set_context(scope); + try { + co_await utility->process(std::move(input)); + utility->clear_context(); + } catch (...) { + utility->clear_context(); + throw; + } +} + +} // namespace detail + /// Lightweight wrapper around Executor + Watchdog for running coroutines /// on a thread pool without Pipeline/Scheduler/DAG overhead. /// Intended for Python bindings and other non-DAG consumers. @@ -41,6 +60,44 @@ class Runtime { template TypedTaskHandle submit(coro::CoroTask task, std::string name = ""); + /// Submit a scoped task (provides CoroScope to the lambda). + /// Returns immediately, task runs on executor. + /// + /// Usage: + /// @code + /// auto handle = rt->scope("my_task", [](CoroScope& scope) -> + /// CoroTask { + /// scope.spawn([](CoroScope& s) -> CoroTask { co_return; }); + /// co_await scope.join(); + /// }); + /// handle.get(); // wait when needed + /// @endcode + template + requires std::is_invocable_r_v, Func, CoroScope&> + TaskHandle scope(std::string name, Func&& func) { + return submit(run_coro_scope(executor_.get(), std::forward(func)), + std::move(name)); + } + + /// Submit a NeedsContext utility with automatic context injection. + /// + /// Usage: + /// @code + /// AggregatorUtility util; + /// rt->scope("aggregator", util, input).get(); + /// @endcode + template > + requires utilities::has_tag_v + TaskHandle scope(std::string name, UtilityT& utility, InputT input) { + return submit( + run_coro_scope(executor_.get(), + detail::run_scoped_utility, + &utility, std::move(input)), + std::move(name)); + } + /// Wait for all outstanding tasks to complete. 
void wait_all(); @@ -52,6 +109,7 @@ class Runtime { void shutdown(); std::size_t threads() const; + std::size_t io_threads() const; Executor* executor() { return executor_.get(); } Watchdog* watchdog() { return watchdog_.get(); } @@ -107,6 +165,14 @@ TypedTaskHandle Runtime::submit(coro::CoroTask task, std::string name) { vp->set_value(); }; + // Set the executor on the task's promise so awaitables (e.g. channels) + // that capture `get_root_promise()->get_executor()` can schedule + // resumption. Without this, awaiters end up with executor=nullptr because + // the wrapping `coro::Coro` doesn't extend PromiseBase and the + // root-promise chain stops at the user's CoroTask. + if (task.handle()) { + task.handle().promise().set_executor(executor_.get()); + } auto coro = wrapper(std::move(task), typed_promise, void_promise, executor_.get(), tid); TaskIndex id = executor_->enqueue_tracked(std::move(coro), name, tid); diff --git a/include/dftracer/utils/core/tasks/coro_scope.h b/include/dftracer/utils/core/tasks/coro_scope.h index ddb095da..adc50b51 100644 --- a/include/dftracer/utils/core/tasks/coro_scope.h +++ b/include/dftracer/utils/core/tasks/coro_scope.h @@ -10,6 +10,9 @@ #include #include #include +#include +#include +#include #include #include @@ -246,6 +249,32 @@ class CoroScope { return coro::SpawnFuture(std::move(state)); } + template , + typename R = typename DecayedUtility::Output, + std::enable_if_t< + utilities::detail::has_process_v, + int> = 0> + coro::SpawnFuture spawn(UtilityT& utility, InputT input) { + return spawn([utility_ptr = &utility, input = std::move(input)]( + CoroScope& child_scope) mutable -> coro::CoroTask { + if constexpr (utilities::has_tag_v) { + utility_ptr->set_context(child_scope); + try { + R result = co_await utility_ptr->process(input); + utility_ptr->clear_context(); + co_return result; + } catch (...) 
{ + utility_ptr->clear_context(); + throw; + } + } else { + co_return co_await utility_ptr->process(input); + } + }); + } + // ==================================================================== // Channel Operations // ==================================================================== @@ -395,9 +424,9 @@ class CoroScope { void spawn_consumers(std::shared_ptr> channel, std::size_t count, Func&& consumer_func) { for (std::size_t i = 0; i < count; i++) { - spawn([channel, func = consumer_func]( + spawn([ch = channel->consumer(), func = consumer_func]( CoroScope& scope) -> coro::CoroTask { - while (auto item = co_await channel->receive()) { + while (auto item = co_await ch.receive()) { co_await func(scope, std::move(*item)); } co_return; @@ -530,14 +559,15 @@ class CoroScope { // Creates a child CoroScope, runs the lambda, and auto-joins. // ======================================================================== -template - requires std::is_invocable_r_v, Func, CoroScope&> -inline coro::CoroTask run_coro_scope(Executor* executor, - Func scope_func) { +template + requires std::is_invocable_r_v, Func, CoroScope&, + Args...> +inline coro::CoroTask run_coro_scope(Executor* executor, Func scope_func, + Args... args) { CoroScope scope(executor); std::exception_ptr error; try { - co_await scope_func(scope); + co_await scope_func(scope, std::move(args)...); } catch (...) 
{ error = std::current_exception(); } diff --git a/include/dftracer/utils/core/utilities/streaming_utility.h b/include/dftracer/utils/core/utilities/streaming_utility.h index 17f46f07..188a37d8 100644 --- a/include/dftracer/utils/core/utilities/streaming_utility.h +++ b/include/dftracer/utils/core/utilities/streaming_utility.h @@ -2,7 +2,9 @@ #define DFTRACER_UTILS_CORE_UTILITIES_STREAMING_UTILITY_H #include +#include #include +#include namespace dftracer::utils::utilities { @@ -46,6 +48,25 @@ class StreamingUtility : public UtilityBase { static constexpr std::string_view get_name() { return sig_; } virtual coro::AsyncGenerator process(const I& input) = 0; + + /// Bind context for streaming utilities with NeedsContext tag. + /// Unlike Utility::process which is wrapped by CoroScope::spawn, + /// streaming utilities need explicit context binding since their + /// AsyncGenerator cannot be spawned directly. + void bind_context(CoroScope& ctx) { + static_assert( + has_tag_v>, + "bind_context requires NeedsContext tag"); + this->set_context(ctx); + } + + /// Unbind context after streaming completes. + void unbind_context() { + static_assert( + has_tag_v>, + "unbind_context requires NeedsContext tag"); + this->clear_context(); + } }; } // namespace dftracer::utils::utilities diff --git a/include/dftracer/utils/core/utilities/utility.h b/include/dftracer/utils/core/utilities/utility.h index 127f14e0..7c8f332c 100644 --- a/include/dftracer/utils/core/utilities/utility.h +++ b/include/dftracer/utils/core/utilities/utility.h @@ -5,10 +5,12 @@ #include #include +#include #include #include #include #include +#include namespace dftracer::utils { class CoroScope; @@ -105,6 +107,8 @@ class UtilityBase { static constexpr std::string_view get_name() { return sig_; } protected: + bool has_context() const noexcept { return ctx_ != nullptr; } + /** * @brief Access CoroScope (only valid when NeedsContext tag is present). 
*/ @@ -124,6 +128,8 @@ class UtilityBase { void set_context(CoroScope& ctx) { ctx_ = &ctx; } void clear_context() { ctx_ = nullptr; } + + friend class ::dftracer::utils::CoroScope; }; /** @@ -157,6 +163,30 @@ class Utility : public UtilityBase { friend class behaviors::UtilityExecutor; virtual coro::CoroTask process(const I& input) = 0; + + // Rvalue overload picked automatically for braced-init / std::move / + // other prvalue call expressions. Moves the input into wrapper storage + // so the inner virtual receives a stable reference that outlives every + // internal suspension point. Lvalue call sites still bind to + // process(const I&) directly, so hot loops that reuse a named local + // pay zero overhead. + coro::CoroTask process(I&& input) { +#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 14) + // GCC 12/13 miscalculate frame offsets for non-trivial locals in + // coroutine frames (coroutine-caveats.md §3). Heap-allocate so only + // a trivial unique_ptr slot lives in the wrapper frame, isolating + // the input object from frame-layout corruption. Drop this branch + // once the GCC 12/13 baseline is retired. + auto owned = std::make_unique(std::move(input)); + co_return co_await this->process(static_cast(*owned)); +#else + // GCC 14+, Clang 14+, MSVC: frame-local is safe per the language + // rules, the local lives in the wrapper coroutine frame and the + // inner co_await holds a reference to it across suspension. + I local(std::move(input)); + co_return co_await this->process(static_cast(local)); +#endif + } }; } // namespace dftracer::utils::utilities diff --git a/include/dftracer/utils/server/trace_index.h b/include/dftracer/utils/server/trace_index.h index d131cd00..f89071b3 100644 --- a/include/dftracer/utils/server/trace_index.h +++ b/include/dftracer/utils/server/trace_index.h @@ -19,20 +19,11 @@ namespace dftracer::utils::server { /// paths and check index availability. 
class TraceIndex { public: - // Files below this compressed size are streamed directly without - // building a `.dftindex` database. At 8 MB compressed - // (~160 MB uncompressed with typical 20x JSON compression), a file - // has only a handful of 32 MB checkpoints -- the indexing overhead - // exceeds the benefit of bloom-filter skip. - static constexpr std::size_t INDEX_SIZE_THRESHOLD = - constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD; - struct FileInfo { std::string path; std::string index_path; bool has_bloom_data = false; bool has_checkpoint_index = false; - bool is_small = false; std::uint64_t min_timestamp_us = 0; std::uint64_t max_timestamp_us = 0; std::uint64_t compressed_size = 0; diff --git a/include/dftracer/utils/utilities/common/arrow/arrow.h b/include/dftracer/utils/utilities/common/arrow/arrow.h index 3350baf8..fb7f15fc 100644 --- a/include/dftracer/utils/utilities/common/arrow/arrow.h +++ b/include/dftracer/utils/utilities/common/arrow/arrow.h @@ -5,7 +5,11 @@ #include #ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC +#include #include +#include +#include +#include #endif #endif // DFTRACER_UTILS_UTILITIES_COMMON_ARROW_ARROW_H diff --git a/include/dftracer/utils/utilities/common/arrow/arrow_export.h b/include/dftracer/utils/utilities/common/arrow/arrow_export.h index d9631c4e..e8739984 100644 --- a/include/dftracer/utils/utilities/common/arrow/arrow_export.h +++ b/include/dftracer/utils/utilities/common/arrow/arrow_export.h @@ -1,6 +1,7 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMMON_ARROW_ARROW_EXPORT_H #define DFTRACER_UTILS_UTILITIES_COMMON_ARROW_ARROW_EXPORT_H +#include #ifdef DFTRACER_UTILS_ENABLE_ARROW #include diff --git a/include/dftracer/utils/utilities/common/arrow/column_builder.h b/include/dftracer/utils/utilities/common/arrow/column_builder.h index e29d8717..dd54f694 100644 --- a/include/dftracer/utils/utilities/common/arrow/column_builder.h +++ b/include/dftracer/utils/utilities/common/arrow/column_builder.h @@ -1,12 +1,15 @@ #ifndef 
DFTRACER_UTILS_UTILITIES_COMMON_ARROW_COLUMN_BUILDER_H #define DFTRACER_UTILS_UTILITIES_COMMON_ARROW_COLUMN_BUILDER_H +#include #ifdef DFTRACER_UTILS_ENABLE_ARROW +#include #include #include #include +#include #include #include #include @@ -16,7 +19,7 @@ namespace dftracer::utils::utilities::common::arrow { -enum class ColumnType { INT64, UINT64, DOUBLE, STRING, BOOL }; +enum class ColumnType { INT64, UINT64, DOUBLE, STRING, BOOL, DICT_STRING }; struct ColumnSpec { std::string name; @@ -26,14 +29,21 @@ struct ColumnSpec { struct ColumnData { std::string name; ColumnType type; - std::vector int64_values; - std::vector uint64_values; + std::vector int64_values; + std::vector uint64_values; std::vector double_values; - std::vector string_values; - std::vector bool_values; - std::vector validity; // 1 = valid, 0 = null - size_t count = 0; + std::vector string_offsets; + std::vector string_data; + std::vector bool_values; + std::vector validity; + std::size_t count = 0; bool has_nulls = false; + + // Dictionary encoding support (for DICT_STRING) + std::vector dict_indices; // indices into dict_values + std::deque dict_values; // unique strings (dictionary) + std::unordered_map + dict_map; // string -> index }; /** @@ -44,8 +54,8 @@ struct ColumnData { * Dynamic: add_or_get_column() on first encounter; backfills nulls for * columns not touched in a given row via end_row(). * - * String columns store std::string_view — caller must keep source data - * alive until finish() returns. + * String columns copy and own their data — no lifetime requirements on + * the source strings passed to append_string(). * * NOT thread-safe. One builder per worker/coroutine. */ @@ -62,23 +72,24 @@ class RecordBatchBuilder { // Returns existing index if column already exists; type is ignored for // existing columns — callers must use find_column() to check type before // appending, and fall back to append_null() on mismatch. 
- size_t add_or_get_column(std::string_view name, ColumnType type); + std::size_t add_or_get_column(std::string_view name, ColumnType type); // Returns the index of an existing column, or std::nullopt if not found. // Use before appending null values to avoid creating STRING-typed columns // that may later receive typed values. - std::optional find_column(std::string_view name) const; + std::optional find_column(std::string_view name) const; // Returns the type of column at col_idx. - ColumnType column_type(size_t col_idx) const noexcept; + ColumnType column_type(std::size_t col_idx) const noexcept; // Append typed values by column index. - void append_int64(size_t col_idx, int64_t value); - void append_uint64(size_t col_idx, uint64_t value); - void append_double(size_t col_idx, double value); - void append_string(size_t col_idx, std::string_view value); - void append_bool(size_t col_idx, bool value); - void append_null(size_t col_idx); + void append_int64(std::size_t col_idx, std::int64_t value); + void append_uint64(std::size_t col_idx, std::uint64_t value); + void append_double(std::size_t col_idx, double value); + void append_string(std::size_t col_idx, std::string_view value); + void append_dict_string(std::size_t col_idx, std::string_view value); + void append_bool(std::size_t col_idx, bool value); + void append_null(std::size_t col_idx); // End current row. In dynamic mode, backfills nulls for untouched // columns. In static mode, validates all columns were appended. @@ -86,29 +97,40 @@ class RecordBatchBuilder { void end_row(); // Pre-allocate internal buffers for num_rows rows. - void reserve(size_t num_rows); + void reserve(std::size_t num_rows); // Bulk-convert internal vectors to Arrow and return a self-contained // result. Builder is in an undefined state until reset() is called. ArrowExportResult finish(); // Clear data. If keep_schema is true, column structure is preserved - // for the next batch (static mode only; dynamic mode always clears). 
+ // for the next batch (requires schema to be locked first). void reset(bool keep_schema = true); - size_t num_rows() const noexcept { return num_rows_; } - size_t num_columns() const noexcept { return columns_.size(); } + // Lock the current schema. After locking: + // - Existing columns maintain their positions + // - New columns discovered via add_or_get_column() are appended at end + // - reset(true) preserves the schema structure + // Call after emitting the first batch to ensure consistent column ordering. + void lock_schema() noexcept { schema_locked_ = true; } + + // Check if schema is locked. + bool is_schema_locked() const noexcept { return schema_locked_; } + + std::size_t num_rows() const noexcept { return num_rows_; } + std::size_t num_columns() const noexcept { return columns_.size(); } private: std::vector columns_; - std::unordered_map name_to_index_; - size_t num_rows_ = 0; + StringViewMap name_to_index_; + std::size_t num_rows_ = 0; + std::size_t row_touched_count_ = 0; bool schema_declared_ = false; - // Tracks which columns were touched in the current row (dynamic mode). 
+ bool schema_locked_ = false; std::vector touched_; void init_column(ColumnData& col, ColumnType type, std::string_view name); - void backfill_nulls(ColumnData& col, size_t target_count); + void backfill_nulls(ColumnData& col, std::size_t target_count); }; } // namespace dftracer::utils::utilities::common::arrow diff --git a/include/dftracer/utils/utilities/common/arrow/ipc_reader.h b/include/dftracer/utils/utilities/common/arrow/ipc_reader.h new file mode 100644 index 00000000..2d88326b --- /dev/null +++ b/include/dftracer/utils/utilities/common/arrow/ipc_reader.h @@ -0,0 +1,99 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMMON_ARROW_IPC_READER_H +#define DFTRACER_UTILS_UTILITIES_COMMON_ARROW_IPC_READER_H + +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + +#include + +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::common::arrow::detail { +// Store block info separately from decoder state +struct IpcBlock { + std::int64_t offset; + std::int32_t metadata_length; + std::int64_t body_length; +}; +} // namespace dftracer::utils::utilities::common::arrow::detail + +namespace dftracer::utils::utilities::common::arrow { + +/** + * RAII reader for Arrow IPC file format (.arrow). + * + * Optimized with: + * - Memory-mapped I/O for zero-copy file access + * - Shared schema (no deep copy per batch) + * - Buffer reuse for decompression + * + * Supports buffer-level ZSTD decompression compatible with + * pyarrow, polars, and this library's IpcWriter. + * + * Sequence: open() -> num_batches() -> read_batch(i) [or read_all()] + * + * Move-only. Not thread-safe. + */ +class IpcReader { + public: + IpcReader() = default; + ~IpcReader(); + + IpcReader(const IpcReader&) = delete; + IpcReader& operator=(const IpcReader&) = delete; + IpcReader(IpcReader&& other) noexcept; + IpcReader& operator=(IpcReader&& other) noexcept; + + // Open file for reading. Returns 0 on success. + int open(const std::string& path); + + // Close the file. 
+ void close(); + + bool is_open() const noexcept { return mapped_data_ != nullptr; } + + // Number of record batches in the file. + std::size_t num_batches() const noexcept { return num_batches_; } + + // Total rows across all batches. + std::int64_t total_rows() const noexcept { return total_rows_; } + + // Read a single batch by index. Returns empty result on error. + ArrowExportResult read_batch(std::size_t index); + + // Read all batches and return as a vector. + std::vector read_all(); + + // Iterate over all batches, calling callback for each. + // Returns 0 on success, non-zero if callback returns non-zero or on error. + int for_each_batch(std::function callback); + + private: + // Memory-mapped file data + void* mapped_data_ = nullptr; + std::size_t mapped_size_ = 0; + int fd_ = -1; + + // Decoder state + void* decoder_ = nullptr; // ArrowIpcDecoder* + + // Shared schema (not deep-copied per batch) + std::shared_ptr shared_schema_; // ArrowSchema*, ref-counted + + // Block metadata + std::vector blocks_; + std::size_t num_batches_ = 0; + std::int64_t total_rows_ = 0; + + void reset_state() noexcept; + int read_footer(); +}; + +} // namespace dftracer::utils::utilities::common::arrow + +#endif // DFTRACER_UTILS_ENABLE_ARROW_IPC +#endif // DFTRACER_UTILS_UTILITIES_COMMON_ARROW_IPC_READER_H diff --git a/include/dftracer/utils/utilities/common/arrow/ipc_writer.h b/include/dftracer/utils/utilities/common/arrow/ipc_writer.h index 2e0dc572..41319e1a 100644 --- a/include/dftracer/utils/utilities/common/arrow/ipc_writer.h +++ b/include/dftracer/utils/utilities/common/arrow/ipc_writer.h @@ -1,21 +1,74 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMMON_ARROW_IPC_WRITER_H #define DFTRACER_UTILS_UTILITIES_COMMON_ARROW_IPC_WRITER_H +#include #ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC +#include #include +#include #include +#include #include +#include namespace dftracer::utils::utilities::common::arrow { /** - * RAII wrapper for writing Arrow IPC file format (.arrows). 
+ * Compression type for Arrow IPC buffer-level compression. * - * Sequence: open() -> write_batch() [1..N] -> close() - * The first write_batch() call writes the schema; subsequent calls append - * record batches. close() finalizes the file footer. + * Buffer-level compression means each buffer in a record batch is compressed + * independently. This is the standard Arrow IPC compression format and is + * readable by pyarrow, polars, and other Arrow implementations. + */ +enum class IpcCompression { + NONE, // Uncompressed (maximum compatibility) +#ifdef DFTRACER_UTILS_ENABLE_ZSTD + ZSTD, // zstd compression (best ratio/speed) +#endif +}; + +#ifdef DFTRACER_UTILS_ENABLE_ZSTD +constexpr IpcCompression DEFAULT_ARROW_IPC_COMPRESSION = IpcCompression::ZSTD; +#else +constexpr IpcCompression DEFAULT_ARROW_IPC_COMPRESSION = IpcCompression::NONE; +#endif + +class BufferPool { + public: + static constexpr std::size_t DEFAULT_BUFFER_CAPACITY = 4 * 1024 * 1024; + + struct Slot { + std::vector data; + std::atomic in_use{false}; + }; + + explicit BufferPool(std::size_t num_slots = 4, + std::size_t initial_capacity = DEFAULT_BUFFER_CAPACITY); + ~BufferPool() = default; + + BufferPool(const BufferPool&) = delete; + BufferPool& operator=(const BufferPool&) = delete; + BufferPool(BufferPool&&) = default; + BufferPool& operator=(BufferPool&&) = default; + + Slot* acquire(std::size_t min_capacity = 0); + void release(Slot* slot); + std::size_t size() const { return slots_.size(); } + + private: + std::vector> slots_; +}; + +/** + * Async Arrow IPC file writer (.arrow). + * + * Uses Executor::current() for async I/O - must be called from within executor. + * Supports buffer-level compression (zstd) compatible with pyarrow, polars, + * nanoarrow, and other Arrow IPC readers. + * + * Usage: open() -> write_batch() [1..N] -> close() * * Move-only. Not thread-safe. 
*/ @@ -29,28 +82,40 @@ class IpcWriter { IpcWriter(IpcWriter&& other) noexcept; IpcWriter& operator=(IpcWriter&& other) noexcept; - // Open path for writing. Returns 0 on success. - int open(const std::string& path); - - // Write one record batch. First call also writes the schema. - // Returns 0 on success. - int write_batch(ArrowExportResult& batch); + coro::CoroTask open( + const std::string& path, + IpcCompression compression = DEFAULT_ARROW_IPC_COMPRESSION, + std::size_t pool_slots = 4); - // Finalize footer and close. Returns 0 on success. - int close(); + coro::CoroTask write_batch(ArrowExportResult& batch); + coro::CoroTask write_batches(std::vector& batches); + coro::CoroTask close(); - bool is_open() const noexcept { return file_ != nullptr; } + bool is_open() const noexcept { return fd_ >= 0; } private: - std::FILE* file_ = nullptr; + int fd_ = -1; + off_t write_offset_ = 0; + BufferPool buffer_pool_; bool schema_written_ = false; - // Heap-allocated nanoarrow structs stored as void* to avoid pulling - // nanoarrow_ipc.h into every translation unit that includes this header. 
- void* writer_ = nullptr; // ArrowIpcWriter* - void* stream_ = - nullptr; // ArrowIpcOutputStream* (owned by writer_ after init) + IpcCompression compression_ = DEFAULT_ARROW_IPC_COMPRESSION; + void* batch_blocks_ = nullptr; + void* schema_copy_ = nullptr; void reset_state() noexcept; + + struct CompressedBatch { + std::vector header; + BufferPool::Slot* body_slot; + std::size_t body_size; + std::int32_t metadata_length; + std::int64_t body_length; + }; + + coro::CoroTask compress_batch(ArrowExportResult& batch); + coro::CoroTask write_compressed(CompressedBatch& cb); + coro::CoroTask write_schema(ArrowExportResult& batch); + coro::CoroTask write_footer(); }; } // namespace dftracer::utils::utilities::common::arrow diff --git a/include/dftracer/utils/utilities/common/arrow/parallel_reader.h b/include/dftracer/utils/utilities/common/arrow/parallel_reader.h new file mode 100644 index 00000000..356ef27b --- /dev/null +++ b/include/dftracer/utils/utilities/common/arrow/parallel_reader.h @@ -0,0 +1,95 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARALLEL_READER_H +#define DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARALLEL_READER_H + +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + +#include +#include + +#include +#include +#include +#include + +namespace dftracer::utils { +class CoroScope; +} + +namespace dftracer::utils::utilities::common::arrow { + +using dftracer::utils::CoroScope; + +/** + * Result from reading a single Arrow IPC file. + * Batches are stored in shared_ptr to allow copying through std::shared_future. + */ +struct ArrowFileReadResult { + std::string path; + std::shared_ptr> batches; + std::int64_t total_rows = 0; + std::string error; + bool success = true; + + ArrowFileReadResult() + : batches(std::make_shared>()) {} +}; + +/** + * Result from reading multiple Arrow IPC files in parallel. 
+ */ +struct ParallelReadResult { + std::vector file_results; + std::int64_t total_rows = 0; + std::int64_t total_batches = 0; + std::size_t files_read = 0; + std::size_t files_failed = 0; +}; + +/** + * Read a single Arrow IPC file as a coroutine. + * + * @param path Path to the Arrow IPC file. + * @return ArrowFileReadResult with batches or error. + */ +coro::CoroTask read_arrow_file_async(std::string path); + +/** + * Read multiple Arrow IPC files in parallel. + * + * Collects all results before returning. For streaming results as they + * complete, use read_arrow_files_streaming instead. + * + * @param paths List of file paths to read. + * @return ParallelReadResult with all results. + */ +coro::CoroTask read_arrow_files_parallel( + std::vector paths); + +/** + * Callback type for streaming file results. + * Return false to cancel remaining reads. + */ +using FileResultCallback = std::function; + +/** + * Read multiple Arrow IPC files in parallel, streaming results via callback. + * + * Results are delivered in completion order (whichever file finishes first). + * This is more memory-efficient for large numbers of files. + * + * Must be run within a CoroScope (via runtime.scope() or run_coro_scope). + * + * @param scope CoroScope for spawning parallel tasks. + * @param paths List of file paths to read. + * @param callback Called for each file result. Return false to cancel. + * @return Summary stats (files_read, files_failed, total_rows, total_batches). 
+ */ +coro::CoroTask read_arrow_files_streaming( + CoroScope& scope, std::vector paths, + FileResultCallback callback); + +} // namespace dftracer::utils::utilities::common::arrow + +#endif // DFTRACER_UTILS_ENABLE_ARROW_IPC +#endif // DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARALLEL_READER_H diff --git a/include/dftracer/utils/utilities/common/arrow/partition_router.h b/include/dftracer/utils/utilities/common/arrow/partition_router.h new file mode 100644 index 00000000..41f4352c --- /dev/null +++ b/include/dftracer/utils/utilities/common/arrow/partition_router.h @@ -0,0 +1,94 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARTITION_ROUTER_H +#define DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARTITION_ROUTER_H + +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::common::arrow { + +struct RouterWriteStats { + std::unordered_map partitions; + int64_t total_rows = 0; + int64_t total_uncompressed_bytes = 0; +}; + +struct PartitionConfig { + enum class Mode { + NONE, + COLUMN, + BUCKETED, + VIEW, + }; + + Mode mode = Mode::NONE; + std::vector partition_columns; + int num_buckets = 0; + std::vector>> views; +}; + +using PredicateEvaluator = + std::function&)>; + +/** + * Routes Arrow record batches to partitioned output directories. + * Supports column-based, bucketed, and view-based partitioning. 
+ */ +class PartitionRouter { + public: + PartitionRouter() = default; + ~PartitionRouter(); + + PartitionRouter(const PartitionRouter&) = delete; + PartitionRouter& operator=(const PartitionRouter&) = delete; + PartitionRouter(PartitionRouter&& other) noexcept; + PartitionRouter& operator=(PartitionRouter&& other) noexcept; + + int open(const std::string& output_dir, const PartitionConfig& config, + int64_t chunk_size_bytes, + IpcCompression compression = DEFAULT_ARROW_IPC_COMPRESSION); + + void register_predicate(const std::string& view_name, + PredicateEvaluator evaluator); + + coro::CoroTask write_batch(ArrowExportResult& batch); + coro::CoroTask close(); + + bool is_open() const noexcept { return is_open_; } + + private: + std::string output_dir_; + PartitionConfig config_; + int64_t chunk_size_bytes_ = 0; + IpcCompression compression_ = DEFAULT_ARROW_IPC_COMPRESSION; + bool is_open_ = false; + + std::unordered_map> writers_; + std::unordered_map predicates_; + + coro::CoroTask get_or_create_writer( + const std::string& partition_key); + std::string partition_path(const std::string& partition_key) const; + int compute_bucket(const std::vector& values) const; + + coro::CoroTask route_none(ArrowExportResult& batch); + coro::CoroTask route_column(ArrowExportResult& batch); + coro::CoroTask route_bucketed(ArrowExportResult& batch); + coro::CoroTask route_view(ArrowExportResult& batch); +}; + +} // namespace dftracer::utils::utilities::common::arrow + +#endif // DFTRACER_UTILS_ENABLE_ARROW_IPC +#endif // DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARTITION_ROUTER_H diff --git a/include/dftracer/utils/utilities/common/arrow/partition_writer.h b/include/dftracer/utils/utilities/common/arrow/partition_writer.h new file mode 100644 index 00000000..4c3cfe50 --- /dev/null +++ b/include/dftracer/utils/utilities/common/arrow/partition_writer.h @@ -0,0 +1,76 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARTITION_WRITER_H +#define 
DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARTITION_WRITER_H + +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + +#include +#include +#include + +#include +#include +#include + +namespace dftracer::utils::utilities::common::arrow { + +struct PartitionWriteStats { + std::vector files; + std::vector row_counts; + int64_t total_rows = 0; + int64_t total_uncompressed_bytes = 0; +}; + +/** + * Async wrapper around IpcWriter with automatic file rotation. + * Writes part-NNNNN.arrow files, rotating when size threshold is exceeded. + */ +class PartitionWriter { + public: + PartitionWriter() = default; + ~PartitionWriter(); + + PartitionWriter(const PartitionWriter&) = delete; + PartitionWriter& operator=(const PartitionWriter&) = delete; + PartitionWriter(PartitionWriter&& other) noexcept; + PartitionWriter& operator=(PartitionWriter&& other) noexcept; + + coro::CoroTask open( + const std::string& output_dir, int64_t chunk_size_bytes, + IpcCompression compression = DEFAULT_ARROW_IPC_COMPRESSION); + + coro::CoroTask write_batch(ArrowExportResult& batch); + coro::CoroTask close(); + + bool is_open() const noexcept { return is_open_; } + int64_t current_file_bytes() const noexcept { return current_file_bytes_; } + int64_t total_bytes() const noexcept { return total_bytes_; } + int64_t total_rows() const noexcept { return total_rows_; } + size_t file_count() const noexcept { return file_index_; } + + private: + std::string output_dir_; + int64_t chunk_size_bytes_ = 0; + IpcCompression compression_ = DEFAULT_ARROW_IPC_COMPRESSION; + + IpcWriter writer_; + bool is_open_ = false; + size_t file_index_ = 0; + + int64_t current_file_bytes_ = 0; + int64_t current_file_rows_ = 0; + int64_t total_bytes_ = 0; + int64_t total_rows_ = 0; + + std::vector files_; + std::vector row_counts_; + + std::string generate_filename() const; + coro::CoroTask rotate_file(); + int64_t calculate_uncompressed_size(ArrowExportResult& batch); +}; + +} // namespace dftracer::utils::utilities::common::arrow + 
+#endif // DFTRACER_UTILS_ENABLE_ARROW_IPC +#endif // DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARTITION_WRITER_H diff --git a/include/dftracer/utils/utilities/common/json/json.h b/include/dftracer/utils/utilities/common/json/json.h index 1ece20b3..1dfa622b 100644 --- a/include/dftracer/utils/utilities/common/json/json.h +++ b/include/dftracer/utils/utilities/common/json/json.h @@ -5,7 +5,8 @@ * @file json.h * @brief Common JSON utilities for the dftracer-utils library. * - * Provides JsonValue - a lightweight zero-cost wrapper around yyjson_val*. + * Provides JsonValue - a lightweight zero-cost wrapper around simdjson DOM + * elements. */ #include @@ -14,10 +15,9 @@ namespace dftracer::utils::utilities::common::json { -/// Stack buffer size for yyjson_alc_pool used in per-line JSON parsing. -/// 4KB is sufficient for typical trace events (few hundred bytes each). -/// If a line exceeds this, yyjson silently falls back to malloc. -inline constexpr std::size_t YYJSON_LINE_POOL_SIZE = 4096; +/// Default capacity for simdjson parser buffer. +/// 1MB is sufficient for most JSON documents. +inline constexpr std::size_t SIMDJSON_DEFAULT_CAPACITY = 1 << 20; } // namespace dftracer::utils::utilities::common::json diff --git a/include/dftracer/utils/utilities/common/json/json_doc_guard.h b/include/dftracer/utils/utilities/common/json/json_doc_guard.h index 3debb463..2c5a0431 100644 --- a/include/dftracer/utils/utilities/common/json/json_doc_guard.h +++ b/include/dftracer/utils/utilities/common/json/json_doc_guard.h @@ -1,37 +1,60 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMMON_JSON_JSON_DOC_GUARD_H #define DFTRACER_UTILS_UTILITIES_COMMON_JSON_JSON_DOC_GUARD_H -#include +#include + +#include namespace dftracer::utils::utilities::common::json { -/// RAII guard for yyjson_doc to prevent leaks on exceptions or -/// early co_return from coroutines. +/// RAII guard that owns a simdjson DOM parser and document. +/// With simdjson, the parser manages document lifetime internally. 
struct JsonDocGuard { - yyjson_doc* doc = nullptr; + simdjson::dom::parser parser; + bool valid = false; - explicit JsonDocGuard(yyjson_doc* d) : doc(d) {} - ~JsonDocGuard() { - if (doc) yyjson_doc_free(doc); - } + JsonDocGuard() = default; - JsonDocGuard(const JsonDocGuard&) = delete; - JsonDocGuard& operator=(const JsonDocGuard&) = delete; - JsonDocGuard(JsonDocGuard&& other) noexcept : doc(other.doc) { - other.doc = nullptr; - } - JsonDocGuard& operator=(JsonDocGuard&& other) noexcept { - if (this != &other) { - if (doc) yyjson_doc_free(doc); - doc = other.doc; - other.doc = nullptr; - } - return *this; + bool parse(const char* data, std::size_t len) { + auto result = parser.parse(data, len); + valid = !result.error(); + return valid; } - explicit operator bool() const { return doc != nullptr; } + simdjson::dom::element root() const { return parser.doc.root(); } + + explicit operator bool() const { return valid; } }; +/// Convert an On-Demand value to string for bloom filter insertion. +/// Handles strings, integers, floats, bools. +inline std::string ondemand_value_to_string(simdjson::ondemand::value& val) { + auto type_result = val.type(); + if (type_result.error()) return {}; + + switch (type_result.value()) { + case simdjson::ondemand::json_type::string: { + auto s = val.get_string(); + return s.error() ? std::string{} : std::string(s.value()); + } + case simdjson::ondemand::json_type::number: { + auto u = val.get_uint64(); + if (!u.error()) return std::to_string(u.value()); + auto i = val.get_int64(); + if (!i.error()) return std::to_string(i.value()); + auto d = val.get_double(); + if (!d.error()) return std::to_string(d.value()); + return {}; + } + case simdjson::ondemand::json_type::boolean: { + auto b = val.get_bool(); + return b.error() ? std::string{} : (b.value() ? 
"true" : "false"); + } + default: + return {}; + } +} + } // namespace dftracer::utils::utilities::common::json #endif // DFTRACER_UTILS_UTILITIES_COMMON_JSON_JSON_DOC_GUARD_H diff --git a/include/dftracer/utils/utilities/common/json/json_value.h b/include/dftracer/utils/utilities/common/json/json_value.h index 8f6d5489..8f051c0a 100644 --- a/include/dftracer/utils/utilities/common/json/json_value.h +++ b/include/dftracer/utils/utilities/common/json/json_value.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include @@ -18,35 +18,45 @@ namespace dftracer::utils::utilities::common::json { /** - * Lightweight zero-cost wrapper around yyjson_val* with convenient accessors. + * Lightweight wrapper around simdjson::dom::element with convenient accessors. * - * Provides pure lazy evaluation: + * Provides: * - Fluent chaining: json["args"]["hhash"] * - Template get() with auto-casting * - Default values for missing/null fields - * - Zero overhead - just pointer navigation + * - Zero overhead - just element navigation * - * IMPORTANT: JsonValue is only valid while the yyjson_doc is alive. + * IMPORTANT: JsonValue is only valid while the simdjson::dom::document is + * alive. 
*/ class JsonValue { private: - yyjson_val* val_; + simdjson::dom::element elem_; + bool valid_ = false; public: - explicit JsonValue(yyjson_val* val = nullptr) : val_(val) {} - - bool is_null() const { return !val_ || yyjson_is_null(val_); } - bool is_bool() const { return val_ && yyjson_is_bool(val_); } - bool is_string() const { return val_ && yyjson_is_str(val_); } - bool is_uint() const { return val_ && yyjson_is_uint(val_); } - bool is_int() const { return val_ && yyjson_is_int(val_); } - bool is_number() const { return val_ && yyjson_is_num(val_); } - bool is_object() const { return val_ && yyjson_is_obj(val_); } - bool is_array() const { return val_ && yyjson_is_arr(val_); } - bool exists() const { return val_ != nullptr; } + JsonValue() : valid_(false) {} + explicit JsonValue(simdjson::dom::element elem) + : elem_(elem), valid_(true) {} + + bool is_null() const { return !valid_ || elem_.is_null(); } + bool is_bool() const { return valid_ && elem_.is_bool(); } + bool is_string() const { return valid_ && elem_.is_string(); } + bool is_uint() const { return valid_ && elem_.is_uint64(); } + bool is_int() const { return valid_ && elem_.is_int64(); } + bool is_number() const { + return valid_ && + (elem_.is_int64() || elem_.is_uint64() || elem_.is_double()); + } + bool is_object() const { return valid_ && elem_.is_object(); } + bool is_array() const { return valid_ && elem_.is_array(); } + bool exists() const { return valid_; } JsonValue operator[](const char* key) const { - return JsonValue(val_ ? 
yyjson_obj_get(val_, key) : nullptr); + if (!valid_ || !elem_.is_object()) return JsonValue(); + auto result = elem_[key]; + if (result.error()) return JsonValue(); + return JsonValue(result.value_unsafe()); } JsonValue operator[](const std::string& key) const { @@ -54,8 +64,10 @@ class JsonValue { } JsonValue operator[](std::string_view key) const { - std::string key_str(key); - return (*this)[key_str.c_str()]; + if (!valid_ || !elem_.is_object()) return JsonValue(); + auto result = elem_[key]; + if (result.error()) return JsonValue(); + return JsonValue(result.value_unsafe()); } JsonValue at(const char* path) const; @@ -64,49 +76,43 @@ class JsonValue { template T get(const T& default_val = T{}) const { + if (!valid_) return default_val; + if constexpr (std::is_same_v) { - return val_ && yyjson_is_bool(val_) ? yyjson_get_bool(val_) - : default_val; + auto r = elem_.get_bool(); + return r.error() ? default_val : r.value_unsafe(); } else if constexpr (std::is_same_v) { - return (val_ && yyjson_is_str(val_)) - ? std::string(yyjson_get_str(val_)) - : default_val; + auto r = elem_.get_string(); + return r.error() ? default_val : std::string(r.value_unsafe()); } else if constexpr (std::is_same_v) { - if (val_ && yyjson_is_str(val_)) { - const char* str = yyjson_get_str(val_); - std::size_t len = yyjson_get_len(val_); - return std::string_view(str, len); - } - return default_val; + auto r = elem_.get_string(); + return r.error() ? default_val : r.value_unsafe(); } else if constexpr (std::is_same_v) { - return (val_ && yyjson_is_str(val_)) ? yyjson_get_str(val_) - : default_val; + auto r = elem_.get_c_str(); + return r.error() ? default_val : r.value_unsafe(); } else if constexpr (std::is_same_v) { - if (!val_) return default_val; - if (yyjson_is_uint(val_)) return yyjson_get_uint(val_); - if (yyjson_is_int(val_)) { - auto v = yyjson_get_int(val_); - return v >= 0 ? 
static_cast(v) : default_val; - } + auto r = elem_.get_uint64(); + if (!r.error()) return r.value_unsafe(); + auto ri = elem_.get_int64(); + if (!ri.error() && ri.value_unsafe() >= 0) + return static_cast(ri.value_unsafe()); return default_val; } else if constexpr (std::is_same_v) { - if (!val_) return default_val; - if (yyjson_is_int(val_)) return yyjson_get_int(val_); - if (yyjson_is_uint(val_)) { - auto v = yyjson_get_uint(val_); - return v <= static_cast( - std::numeric_limits::max()) - ? static_cast(v) - : default_val; - } + auto r = elem_.get_int64(); + if (!r.error()) return r.value_unsafe(); + auto ru = elem_.get_uint64(); + if (!ru.error() && + ru.value_unsafe() <= + static_cast(std::numeric_limits::max())) + return static_cast(ru.value_unsafe()); return default_val; } else if constexpr (std::is_same_v) { - if (!val_) return default_val; - if (yyjson_is_real(val_)) return yyjson_get_real(val_); - if (yyjson_is_int(val_)) - return static_cast(yyjson_get_int(val_)); - if (yyjson_is_uint(val_)) - return static_cast(yyjson_get_uint(val_)); + auto r = elem_.get_double(); + if (!r.error()) return r.value_unsafe(); + auto ri = elem_.get_int64(); + if (!ri.error()) return static_cast(ri.value_unsafe()); + auto ru = elem_.get_uint64(); + if (!ru.error()) return static_cast(ru.value_unsafe()); return default_val; } else if constexpr (std::is_same_v) { return static_cast( @@ -126,56 +132,61 @@ class JsonValue { template std::optional get_optional() const { - if (!val_) return std::nullopt; + if (!valid_) return std::nullopt; if constexpr (std::is_same_v) { - return yyjson_is_str(val_) - ? std::optional(std::string(yyjson_get_str(val_))) - : std::nullopt; + auto r = elem_.get_string(); + return r.error() ? 
std::nullopt + : std::optional(std::string(r.value_unsafe())); } else if constexpr (std::is_same_v) { - if (yyjson_is_str(val_)) { - const char* str = yyjson_get_str(val_); - std::size_t len = yyjson_get_len(val_); - return std::optional(std::string_view(str, len)); - } - return std::nullopt; + auto r = elem_.get_string(); + return r.error() ? std::nullopt : std::optional(r.value_unsafe()); } else if constexpr (std::is_same_v) { - return yyjson_is_str(val_) ? std::optional(yyjson_get_str(val_)) - : std::nullopt; + auto r = elem_.get_c_str(); + return r.error() ? std::nullopt : std::optional(r.value_unsafe()); } else if constexpr (std::is_same_v) { - if (yyjson_is_uint(val_)) return yyjson_get_uint(val_); - if (yyjson_is_int(val_)) { - auto v = yyjson_get_int(val_); - return v >= 0 ? std::optional(static_cast(v)) - : std::nullopt; - } + auto r = elem_.get_uint64(); + if (!r.error()) return r.value_unsafe(); + auto ri = elem_.get_int64(); + if (!ri.error() && ri.value_unsafe() >= 0) + return static_cast(ri.value_unsafe()); return std::nullopt; } else if constexpr (std::is_same_v) { - if (yyjson_is_int(val_)) return yyjson_get_int(val_); - return std::nullopt; + auto r = elem_.get_int64(); + return r.error() ? std::nullopt : std::optional(r.value_unsafe()); } else if constexpr (std::is_same_v) { - if (yyjson_is_real(val_)) return yyjson_get_real(val_); - if (yyjson_is_int(val_)) - return static_cast(yyjson_get_int(val_)); - if (yyjson_is_uint(val_)) - return static_cast(yyjson_get_uint(val_)); + auto r = elem_.get_double(); + if (!r.error()) return r.value_unsafe(); + auto ri = elem_.get_int64(); + if (!ri.error()) return static_cast(ri.value_unsafe()); + auto ru = elem_.get_uint64(); + if (!ru.error()) return static_cast(ru.value_unsafe()); return std::nullopt; } else if constexpr (std::is_same_v) { - return yyjson_is_bool(val_) ? std::optional(yyjson_get_bool(val_)) - : std::nullopt; + auto r = elem_.get_bool(); + return r.error() ? 
std::nullopt : std::optional(r.value_unsafe()); } else { static_assert(!sizeof(T), "Unsupported type for JsonValue::get_optional()"); } } - yyjson_val* raw() const { return val_; } - explicit operator yyjson_val*() const { return val_; } + template + void for_each_member(Fn&& fn) const { + if (!valid_ || !elem_.is_object()) return; + auto obj = elem_.get_object(); + if (obj.error()) return; + for (auto field : obj.value_unsafe()) { + fn(field.key, JsonValue(field.value)); + } + } + + simdjson::dom::element raw() const { return elem_; } explicit operator bool() const { return exists(); } }; -using JsonParserInput = yyjson_val*; using JsonParserOutput = JsonValue; +using JsonParserInput = simdjson::dom::element; class JsonParserUtility : public utilities::Utility { @@ -199,7 +210,8 @@ class StringJsonParserUtility : public utilities::Utility { private: utilities::text::Text content_; - std::shared_ptr owned_doc_; + simdjson::dom::parser parser_; + simdjson::dom::document doc_; public: coro::CoroTask process( diff --git a/include/dftracer/utils/utilities/common/json/parser.h b/include/dftracer/utils/utilities/common/json/parser.h new file mode 100644 index 00000000..3907bd7f --- /dev/null +++ b/include/dftracer/utils/utilities/common/json/parser.h @@ -0,0 +1,241 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMMON_JSON_PARSER_H +#define DFTRACER_UTILS_UTILITIES_COMMON_JSON_PARSER_H + +#include + +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::common::json { + +/** + * @brief On-Demand JSON parser for zero-copy parsing. + * + * Key design principles: + * 1. On-Demand API for lazy field access - only parses what you use + * 2. Parser is reused across rows (internal buffer management) + * 3. Zero-copy: string_view points directly into the padded JSON buffer + * 4. 
Forward-only iteration: once a field is accessed, it's consumed + * + * Usage pattern for batch processing: + * @code + * JsonParser parser; + * + * for (auto& line : input_lines) { + * // parse() copies to internal padded buffer + * if (!parser.parse(line)) continue; + * + * // Access fields directly from parser + * auto name = parser.get_string("name"); + * auto ts = parser.get_int64("ts"); + * + * // Iterate over 'args' object + * parser.for_each_field("args", [](std::string_view key, auto& val) { + * // process nested fields + * }); + * } + * @endcode + * + * @note string_view values are only valid until the next parse() call. + */ +class JsonParser { + public: + static constexpr std::size_t DEFAULT_CAPACITY = 1 << 20; // 1MB + + explicit JsonParser(std::size_t capacity = DEFAULT_CAPACITY); + + JsonParser(const JsonParser&) = delete; + JsonParser& operator=(const JsonParser&) = delete; + JsonParser(JsonParser&&) = default; + JsonParser& operator=(JsonParser&&) = default; + + /** + * @brief Parse a JSON line. + * + * Copies the input to an internal padded buffer for SIMD processing. + * Previous parse results become invalid after this call. + * + * @param json_line The JSON string to parse. + * @return true on success, false on parse error. + */ + bool parse(std::string_view json_line); + + /** + * @brief Parse from pre-padded string (avoids copy). + */ + bool parse_padded(simdjson::padded_string_view json); + + /** + * @brief Check if current document is valid (last parse succeeded). + */ + bool is_valid() const { return valid_; } + + // Direct field access from root object + // Returns nullopt if field missing or wrong type + + std::optional get_int64(std::string_view key); + std::optional get_uint64(std::string_view key); + std::optional get_double(std::string_view key); + std::optional get_bool(std::string_view key); + std::optional get_string(std::string_view key); + + /** + * @brief Iterate over all fields in the root object. 
+ * + * @param fn Callback: void(std::string_view key, simdjson::ondemand::value + * val) + * + * @note This consumes the document. After calling, field access methods + * will return nullopt. Call parse() again to re-parse. + */ + template + void for_each_field(Fn&& fn); + + /** + * @brief Iterate over fields of a nested object. + * + * @param object_key The field containing the nested object. + * @param fn Callback: void(std::string_view key, simdjson::ondemand::value + * val) + * @return true if object found and iterated, false otherwise. + */ + template + bool for_each_field(std::string_view object_key, Fn&& fn); + + /** + * @brief Rewind document for re-iteration. + * + * After accessing fields, the document position advances. Call this + * to reset to the beginning for another pass. + */ + void rewind(); + + /** + * @brief Get raw document for advanced usage. + */ + simdjson::ondemand::document& raw_document() { return doc_; } + + /** + * @brief Borrow an externally-owned parsed document. + * + * After this call, for_each_field/rewind/get_* operate on the borrowed + * reference. The caller must keep the underlying document alive until + * another parse() / set_borrowed_document() call. Intended for bridging + * iterate_many output (document_reference) to consumers that accept a + * JsonParser&. 
+ */ + void set_borrowed_document( + simdjson::ondemand::document_reference ref) noexcept { + active_ = ref; + valid_ = true; + } + + private: + simdjson::ondemand::parser parser_; + simdjson::padded_string padded_json_; + simdjson::ondemand::document doc_; + simdjson::ondemand::document_reference active_; + bool valid_ = false; +}; + +// Template implementations + +template +void JsonParser::for_each_field(Fn&& fn) { + if (!valid_) return; + + auto obj_result = active_.get_object(); + if (obj_result.error()) return; + + for (auto field : obj_result.value()) { + if (field.error()) continue; + + auto key_result = field.unescaped_key(); + if (key_result.error()) continue; + + auto val_result = field.value(); + if (val_result.error()) continue; + + fn(key_result.value(), val_result.value()); + } +} + +template +bool JsonParser::for_each_field(std::string_view object_key, Fn&& fn) { + if (!valid_) return false; + + auto nested_result = active_[object_key].get_object(); + if (nested_result.error()) return false; + + for (auto field : nested_result.value()) { + if (field.error()) continue; + + auto key_result = field.unescaped_key(); + if (key_result.error()) continue; + + auto val_result = field.value(); + if (val_result.error()) continue; + + fn(key_result.value(), val_result.value()); + } + return true; +} + +/** + * @brief Helper to extract typed value from simdjson::ondemand::value. + * + * Use in for_each_field callbacks to safely extract values. + */ +struct JsonValueHelper { + static std::optional get_int64( + simdjson::ondemand::value& val) { + auto r = val.get_int64(); + return r.error() ? std::nullopt : std::optional(r.value()); + } + + static std::optional get_uint64( + simdjson::ondemand::value& val) { + auto r = val.get_uint64(); + return r.error() ? std::nullopt : std::optional(r.value()); + } + + static std::optional get_double(simdjson::ondemand::value& val) { + auto r = val.get_double(); + return r.error() ? 
std::nullopt : std::optional(r.value()); + } + + static std::optional get_bool(simdjson::ondemand::value& val) { + auto r = val.get_bool(); + return r.error() ? std::nullopt : std::optional(r.value()); + } + + static std::optional get_string( + simdjson::ondemand::value& val) { + auto r = val.get_string(); + return r.error() ? std::nullopt : std::optional(r.value()); + } + + static bool is_null(simdjson::ondemand::value& val) { + auto r = val.is_null(); + return r.error() ? false : r.value(); + } + + static std::optional get_type( + simdjson::ondemand::value& val) { + auto r = val.type(); + return r.error() ? std::nullopt : std::optional(r.value()); + } + + static std::optional to_json_string( + simdjson::ondemand::value& val) { + auto r = simdjson::to_json_string(val); + return r.error() ? std::nullopt : std::optional(std::string(r.value())); + } +}; + +} // namespace dftracer::utils::utilities::common::json + +#endif // DFTRACER_UTILS_UTILITIES_COMMON_JSON_PARSER_H diff --git a/include/dftracer/utils/utilities/common/query/ast.h b/include/dftracer/utils/utilities/common/query/ast.h index 2f8e3c03..b76aa97d 100644 --- a/include/dftracer/utils/utilities/common/query/ast.h +++ b/include/dftracer/utils/utilities/common/query/ast.h @@ -1,6 +1,8 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMMON_QUERY_AST_H #define DFTRACER_UTILS_UTILITIES_COMMON_QUERY_AST_H +#include + #include #include #include @@ -92,6 +94,9 @@ const char* compare_op_str(CompareOp op); /// Serialize an AST back to query DSL string. std::string to_string(const QueryNode& node); +/// Collect all field names referenced in a query AST. 
+dftracer::utils::StringViewSet collect_fields(const QueryNode& node); + } // namespace dftracer::utils::utilities::common::query #endif // DFTRACER_UTILS_UTILITIES_COMMON_QUERY_AST_H diff --git a/include/dftracer/utils/utilities/common/query/evaluator.h b/include/dftracer/utils/utilities/common/query/evaluator.h index f4249a9a..aa33c682 100644 --- a/include/dftracer/utils/utilities/common/query/evaluator.h +++ b/include/dftracer/utils/utilities/common/query/evaluator.h @@ -1,12 +1,10 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMMON_QUERY_EVALUATOR_H #define DFTRACER_UTILS_UTILITIES_COMMON_QUERY_EVALUATOR_H +#include #include #include -#include -#include - namespace dftracer::utils::utilities::common::query { using json::JsonValue; @@ -15,7 +13,7 @@ using json::JsonValue; bool evaluate(const QueryNode& node, const JsonValue& event); /// Typed key-value map for non-JSON evaluation contexts. -using ValueMap = std::unordered_map; +using ValueMap = dftracer::utils::StringViewMap; /// Evaluate against a typed key-value map. /// Missing fields evaluate to false. diff --git a/include/dftracer/utils/utilities/common/query/query.h b/include/dftracer/utils/utilities/common/query/query.h index 2c9938bf..d60af516 100644 --- a/include/dftracer/utils/utilities/common/query/query.h +++ b/include/dftracer/utils/utilities/common/query/query.h @@ -1,6 +1,7 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMMON_QUERY_QUERY_H #define DFTRACER_UTILS_UTILITIES_COMMON_QUERY_QUERY_H +#include #include #include #include @@ -33,13 +34,21 @@ class Query { const std::string& source() const { return source_; } /// Serialize AST back to query DSL string. std::string to_string() const; + /// Fields referenced by this query, precomputed at construction. 
// =============================================================================
// Raw Pointer Writer Utilities (for pre-sized buffers)
// =============================================================================
//
// Every write_* stores its value starting at `p` and returns the address one
// past the last byte written. No bounds checking is possible here: the caller
// must have sized the destination buffer beforehand.

namespace detail {

/// Validate a string length against the 16-bit length prefix used by
/// put_str/write_str.
/// @throws std::length_error if `n` does not fit in 16 bits.
inline std::uint16_t checked_str_len(std::size_t n) {
    if (n > 0xFFFFu) {
        throw std::length_error(
            "binary_codec: string exceeds 16-bit length prefix");
    }
    return static_cast<std::uint16_t>(n);
}

/// Validate a blob length against the 32-bit length prefix used by put_blob.
/// @throws std::length_error if `n` does not fit in 32 bits.
inline std::uint32_t checked_blob_len(std::size_t n) {
    if (static_cast<std::uint64_t>(n) > 0xFFFFFFFFull) {
        throw std::length_error(
            "binary_codec: blob exceeds 32-bit length prefix");
    }
    return static_cast<std::uint32_t>(n);
}

}  // namespace detail

/// LEB128-style unsigned varint: 7 payload bits per byte, least significant
/// group first, high bit set on every byte except the last. At most 10 bytes.
inline char* write_varint(char* p, std::uint64_t v) {
    while (v >= 0x80) {
        *p++ = static_cast<char>(v | 0x80);
        v >>= 7;
    }
    *p++ = static_cast<char>(v);
    return p;
}

/// 16-bit big-endian integer (2 bytes).
inline char* write_be16(char* p, std::uint16_t v) {
    p[0] = static_cast<char>(v >> 8);
    p[1] = static_cast<char>(v);
    return p + 2;
}

/// 32-bit big-endian integer (4 bytes).
inline char* write_be32(char* p, std::uint32_t v) {
    p[0] = static_cast<char>(v >> 24);
    p[1] = static_cast<char>(v >> 16);
    p[2] = static_cast<char>(v >> 8);
    p[3] = static_cast<char>(v);
    return p + 4;
}

/// 64-bit big-endian integer (8 bytes).
inline char* write_be64(char* p, std::uint64_t v) {
    p[0] = static_cast<char>(v >> 56);
    p[1] = static_cast<char>(v >> 48);
    p[2] = static_cast<char>(v >> 40);
    p[3] = static_cast<char>(v >> 32);
    p[4] = static_cast<char>(v >> 24);
    p[5] = static_cast<char>(v >> 16);
    p[6] = static_cast<char>(v >> 8);
    p[7] = static_cast<char>(v);
    return p + 8;
}

/// Object representation of a double, written as a big-endian 64-bit integer.
inline char* write_double(char* p, double v) {
    static_assert(sizeof(double) == sizeof(std::uint64_t),
                  "binary_codec assumes a 64-bit double");
    std::uint64_t bits;
    std::memcpy(&bits, &v, sizeof(bits));
    return write_be64(p, bits);
}

/// Length-prefixed string: 16-bit big-endian byte count followed by the bytes.
/// @throws std::length_error if `s` is longer than 65535 bytes. Nothing is
///         written in that case; previously the prefix was silently truncated
///         while the full payload was still emitted, corrupting the stream.
inline char* write_str(char* p, std::string_view s) {
    p = write_be16(p, detail::checked_str_len(s.size()));
    std::memcpy(p, s.data(), s.size());
    return p + s.size();
}

// =============================================================================
// Binary Writer Utilities
// =============================================================================
//
// Appending counterparts of the write_* functions. Each encodes into a small
// stack buffer through the matching write_* helper, so the byte layout is
// defined in exactly one place.

/// Append a single byte.
inline void put_u8(std::string& out, std::uint8_t v) {
    out.push_back(static_cast<char>(v));
}

/// Append a 16-bit big-endian integer.
inline void put_be16(std::string& out, std::uint16_t v) {
    char buf[2];
    write_be16(buf, v);
    out.append(buf, sizeof(buf));
}

/// Append a 32-bit big-endian integer.
inline void put_be32(std::string& out, std::uint32_t v) {
    char buf[4];
    write_be32(buf, v);
    out.append(buf, sizeof(buf));
}

/// Append a 64-bit big-endian integer.
inline void put_be64(std::string& out, std::uint64_t v) {
    char buf[8];
    write_be64(buf, v);
    out.append(buf, sizeof(buf));
}

/// Append a double as its 64-bit pattern in big-endian order.
inline void put_double(std::string& out, double v) {
    char buf[8];
    write_double(buf, v);
    out.append(buf, sizeof(buf));
}

/// Append a string with a 16-bit big-endian length prefix.
/// @throws std::length_error if `s` is longer than 65535 bytes; `out` is left
///         untouched in that case.
inline void put_str(std::string& out, std::string_view s) {
    put_be16(out, detail::checked_str_len(s.size()));
    out.append(s.data(), s.size());
}

/// Append an unsigned varint (see write_varint for the layout).
inline void put_varint(std::string& out, std::uint64_t v) {
    char buf[10];  // ceil(64 / 7) bytes is the longest possible encoding
    const char* end = write_varint(buf, v);
    out.append(buf, static_cast<std::size_t>(end - buf));
}

// std::span is a C++20 facility; the guard keeps the other helpers usable from
// translation units built against an older standard library.
#if defined(__cpp_lib_span)
/// Append a byte blob with a 32-bit big-endian length prefix.
/// @throws std::length_error if `data` is larger than 2^32 - 1 bytes; `out`
///         is left untouched in that case.
inline void put_blob(std::string& out, std::span<const std::uint8_t> data) {
    put_be32(out, detail::checked_blob_len(data.size()));
    out.append(reinterpret_cast<const char*>(data.data()), data.size());
}
#endif
// =============================================================================
// Binary Reader Class
// =============================================================================

/**
 * Sequential decoder over a borrowed byte range.
 *
 * Mirrors the put_* and write_* encoders: fixed-width integers are big-endian,
 * str() uses a 16-bit length prefix, blob() a 32-bit one, and varint() reads
 * 7 payload bits per byte, least significant group first.
 *
 * The reader only stores a std::string_view, so the underlying buffer must
 * outlive the reader and every view it hands out.
 *
 * Every decoding method throws std::runtime_error when the input ends early.
 */
class BinaryReader {
   public:
    explicit BinaryReader(std::string_view data) : data_(data) {}

    /// Read one byte.
    std::uint8_t u8() { return static_cast<std::uint8_t>(take(1)[0]); }

    /// Read a 16-bit big-endian integer.
    std::uint16_t be16() { return static_cast<std::uint16_t>(read_be(2)); }

    /// Read a 32-bit big-endian integer.
    std::uint32_t be32() { return static_cast<std::uint32_t>(read_be(4)); }

    /// Read a 64-bit big-endian integer.
    std::uint64_t be64() { return read_be(8); }

    /// Read a double stored as its 64-bit pattern in big-endian order.
    double f64() {
        static_assert(sizeof(double) == sizeof(std::uint64_t),
                      "binary_codec assumes a 64-bit double");
        const std::uint64_t bits = be64();
        double v;
        std::memcpy(&v, &bits, sizeof(v));
        return v;
    }

    /// Read a blob: 32-bit big-endian length followed by that many bytes.
    /// The returned view aliases the input buffer.
    std::string_view blob() {
        const auto len = be32();
        return take(len);
    }

    /// Read a string: 16-bit big-endian length followed by that many bytes.
    /// The returned view aliases the input buffer.
    std::string_view str() {
        const auto len = be16();
        return take(len);
    }

    /// Read an unsigned varint.
    /// @throws std::runtime_error if the input ends before the terminating
    ///         byte, or if the encoding runs past the 10 bytes a 64-bit value
    ///         can occupy.
    std::uint64_t varint() {
        std::uint64_t v = 0;
        unsigned shift = 0;
        while (off_ < data_.size()) {
            // Shifting a 64-bit value by 64 or more is undefined behaviour,
            // so an over-long encoding is rejected instead of decoded.
            if (shift >= 64) {
                throw std::runtime_error("binary_codec: varint too long");
            }
            const auto b = static_cast<unsigned char>(data_[off_++]);
            v |= static_cast<std::uint64_t>(b & 0x7F) << shift;
            if ((b & 0x80) == 0) return v;
            shift += 7;
        }
        throw std::runtime_error("binary_codec: truncated varint");
    }

    /// True while at least one unread byte is left.
    bool has_remaining() const { return off_ < data_.size(); }

    /// View of everything not yet consumed (does not advance the reader).
    std::string_view remaining() const { return data_.substr(off_); }

    /// Number of bytes consumed so far.
    std::size_t offset() const { return off_; }

   private:
    /// Consume exactly `n` bytes, or throw without advancing.
    std::string_view take(std::size_t n) {
        // off_ never exceeds data_.size(), so the subtraction cannot wrap,
        // whereas `off_ + n` could on platforms with a narrow size_t.
        if (n > data_.size() - off_) {
            throw std::runtime_error("binary_codec: truncated data");
        }
        const std::string_view s = data_.substr(off_, n);
        off_ += n;
        return s;
    }

    /// Decode `width` (1..8) bytes as a big-endian unsigned integer.
    std::uint64_t read_be(std::size_t width) {
        const std::string_view bytes = take(width);
        std::uint64_t v = 0;
        for (const char c : bytes) {
            // Widen through unsigned char so bytes >= 0x80 never sign-extend.
            v = (v << 8) | static_cast<unsigned char>(c);
        }
        return v;
    }

    std::string_view data_;
    std::size_t off_ = 0;
};
dftracer::utils::utilities::common::serialization + +#endif // DFTRACER_UTILS_UTILITIES_COMMON_SERIALIZATION_BINARY_CODEC_H diff --git a/include/dftracer/utils/utilities/common/statistics/log2_histogram.h b/include/dftracer/utils/utilities/common/statistics/log2_histogram.h index 9cb9da9e..262ce62c 100644 --- a/include/dftracer/utils/utilities/common/statistics/log2_histogram.h +++ b/include/dftracer/utils/utilities/common/statistics/log2_histogram.h @@ -5,10 +5,6 @@ #include #include -// Forward declaration for direct JSON serialization -struct yyjson_mut_doc; -struct yyjson_mut_val; - namespace dftracer::utils::utilities::common::statistics { /** @@ -36,9 +32,6 @@ class Log2Histogram { std::string to_json() const; static Log2Histogram from_json(const std::string& json); - /// Serialize directly to yyjson mutable array (avoids string roundtrip) - yyjson_mut_val* to_yyjson(yyjson_mut_doc* doc) const; - std::uint64_t total_count() const { return total_count_; } const std::array& bins() const { return bins_; } diff --git a/include/dftracer/utils/utilities/common/statistics/timestamp_histogram.h b/include/dftracer/utils/utilities/common/statistics/timestamp_histogram.h new file mode 100644 index 00000000..16cea6f0 --- /dev/null +++ b/include/dftracer/utils/utilities/common/statistics/timestamp_histogram.h @@ -0,0 +1,59 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_TIMESTAMP_HISTOGRAM_H +#define DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_TIMESTAMP_HISTOGRAM_H + +#include +#include +#include + +namespace dftracer::utils::utilities::common::statistics { + +class TimestampHistogram { + public: + static constexpr std::uint64_t BIN_WIDTH_US = 100'000; // 100ms + + TimestampHistogram() = default; + + void add(std::uint64_t timestamp_us); + void merge(const TimestampHistogram& other); + + std::uint64_t count_in_range(std::uint64_t ts_start_us, + std::uint64_t ts_end_us) const; + double selectivity(std::uint64_t ts_start_us, + std::uint64_t ts_end_us) const; + 
std::vector expansion_weights(std::uint64_t bucket_start_us, + std::uint64_t bucket_end_us, + std::size_t num_sub_buckets) const; + + std::vector serialize() const; + static TimestampHistogram deserialize(const std::uint8_t* data, + std::size_t len); + + std::uint64_t total_count() const { return total_count_; } + bool empty() const { return bins_.empty(); } + std::size_t num_bins() const { return bins_.size(); } + + const std::vector>& bins() const { + return bins_; + } + + static std::uint64_t bin_index(std::uint64_t timestamp_us) { + return timestamp_us / BIN_WIDTH_US; + } + + static std::uint64_t bin_start_us(std::uint64_t bin_idx) { + return bin_idx * BIN_WIDTH_US; + } + + static std::uint64_t bin_end_us(std::uint64_t bin_idx) { + return (bin_idx + 1) * BIN_WIDTH_US; + } + + private: + // Sorted by bin_index. Sparse: only non-zero bins stored. + std::vector> bins_; + std::uint64_t total_count_ = 0; +}; + +} // namespace dftracer::utils::utilities::common::statistics + +#endif // DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_TIMESTAMP_HISTOGRAM_H diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.h new file mode 100644 index 00000000..38f3e0f5 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.h @@ -0,0 +1,33 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_AUGMENTATION_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_AUGMENTATION_H + +#include + +#include +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +struct AugmentationConfig { + std::uint64_t source_interval_us; // interval stored in index + std::uint64_t target_interval_us; // interval requested by user +}; + +// Augment a batch to match target interval. 
+// - If source > target: expand (split buckets, approximate with CI) +// - If source < target: shrink (merge buckets, lossless) +// - If source == target: pass through +AggregationBatch augment_batch(const AggregationBatch& input, + const AugmentationConfig& config); + +// Compute Poisson 95% confidence interval for a count +inline CountConfidenceInterval compute_poisson_ci(double count, + double confidence = 1.96) { + double sqrt_count = std::sqrt(count); + return {std::max(0.0, count - confidence * sqrt_count), + count + confidence * sqrt_count}; +} + +} // namespace dftracer::utils::utilities::composites::dft::aggregators + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_AUGMENTATION_H diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h index 8b3d8d7a..142af76e 100644 --- a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h @@ -1,6 +1,9 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_CONFIG_H #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_CONFIG_H +#include +#include + #include #include #include @@ -17,10 +20,13 @@ struct AggregationConfig { std::uint64_t time_interval_us = 1000000; bool use_relative_time = false; std::uint64_t reference_timestamp = 0; + bool normalize_time = false; // Normalize time_bucket to 0-based time_range std::vector extra_group_keys; std::vector custom_metric_fields; + bool track_default_args = true; + bool compute_statistics = true; bool compute_percentiles = false; @@ -50,6 +56,46 @@ struct AggregationConfig { #endif return s; } + + // Compute hash of config fields that affect aggregation output. + // Used to detect if cached aggregation data matches current config. 
+ std::uint32_t compute_hash() const { + hash::Fnv1aHashBuilder h; + + h.update_value(time_interval_us); + h.update_value(use_relative_time); + if (use_relative_time) { + h.update_value(reference_timestamp); + } + + for (const auto& k : extra_group_keys) { + h.update(k); + } + for (const auto& m : custom_metric_fields) { + h.update(m); + } + + h.update_value(track_default_args); + h.update_value(compute_statistics); + h.update_value(compute_percentiles); + + if (compute_percentiles) { + h.update_value(sketch_accuracy); + for (double p : percentiles) { + h.update_value(p); + } + } + + for (const auto& be : boundary_events) { + h.update(be.event_name); + h.update(be.value_field); + h.update(be.output_name); + } + + h.update_value(track_process_parents); + + return h.finish32(); + } }; } // namespace dftracer::utils::utilities::composites::dft::aggregators diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.h new file mode 100644 index 00000000..f8663000 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.h @@ -0,0 +1,28 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_LOGIC_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_LOGIC_H + +#include +#include +#include +#include +#include + +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +std::uint64_t compute_time_bucket(std::uint64_t timestamp, + std::uint64_t duration, + const AggregationConfig& config); + +AggregationKey build_aggregation_key(const DFTracerEvent& ev, + const AggregationConfig& config); + +void update_aggregation_entry(const DFTracerEvent& ev, + const AggregationConfig& config, + AggregationMap& aggregations, + const AggregationKey& key); + +} // namespace dftracer::utils::utilities::composites::dft::aggregators + +#endif // 
DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_LOGIC_H diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.h new file mode 100644 index 00000000..e3bd4722 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.h @@ -0,0 +1,26 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_MERGE_OPERATOR_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_MERGE_OPERATOR_H + +#include + +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +class AggregationMergeOperator : public ::rocksdb::MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; + + bool PartialMerge(const ::rocksdb::Slice& key, + const ::rocksdb::Slice& left_operand, + const ::rocksdb::Slice& right_operand, + std::string* new_value, + ::rocksdb::Logger* logger) const override; + + const char* Name() const override { return "AggregationMergeOperator"; } +}; + +} // namespace dftracer::utils::utilities::composites::dft::aggregators + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_MERGE_OPERATOR_H diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.h index a2147de6..e3fb9576 100644 --- a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.h +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.h @@ -16,6 +16,7 @@ namespace dftracer::utils::utilities::composites::dft::aggregators { using common::statistics::DDSketch; struct MetricStats { + std::uint64_t count = 0; std::uint64_t total = 0; std::uint64_t min = std::numeric_limits::max(); std::uint64_t 
max = 0; @@ -30,7 +31,8 @@ struct MetricStats { : sketch_accuracy_(relative_accuracy) {} MetricStats(const MetricStats& other) - : total(other.total), + : count(other.count), + total(other.total), min(other.min), max(other.max), mean(other.mean), @@ -43,6 +45,7 @@ struct MetricStats { MetricStats& operator=(const MetricStats& other) { if (this != &other) { + count = other.count; total = other.total; min = other.min; max = other.max; @@ -60,13 +63,11 @@ struct MetricStats { MetricStats(MetricStats&&) = default; MetricStats& operator=(MetricStats&&) = default; - void update(std::uint64_t value, std::uint64_t count, - bool compute_percentiles = false); - void merge_from(const MetricStats& other, std::uint64_t n1, - std::uint64_t n2, std::uint64_t n); - double get_stddev(std::uint64_t count) const; - double get_skewness(std::uint64_t count) const; - double get_kurtosis(std::uint64_t count) const; + void update(std::uint64_t value, bool compute_percentiles = false); + void merge_from(const MetricStats& other); + double get_stddev() const; + double get_skewness() const; + double get_kurtosis() const; }; using CustomMetricsMap = @@ -146,13 +147,9 @@ struct AggregationMetrics { void update_timestamp_clamped(std::uint64_t event_ts, std::uint64_t dur, std::uint64_t bucket_start, std::uint64_t bucket_size); - void update_custom_metric(const std::string& name, std::uint64_t value, + void update_custom_metric(std::string_view name, std::uint64_t value, bool compute_percentiles = false); - double get_stddev_duration() const; - double get_stddev_size() const; - double get_custom_stddev(const std::string& name) const; - void merge_from(const AggregationMetrics& other); }; diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_output.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_output.h index 06cf4bbd..f103c32e 100644 --- a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_output.h +++ 
b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_output.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -12,6 +13,12 @@ namespace dftracer::utils::utilities::composites::dft::aggregators { +enum class AggMapType : std::uint8_t { + EVENT = 0, + PROFILE = 1, + SYSTEM = 2, +}; + class AssociationTracker; struct BoundaryTimeRange { @@ -33,9 +40,11 @@ struct ChunkAggregationOutput { std::string file_path; bool success = false; std::shared_ptr local_tracker; + std::uint64_t min_time_bucket = std::numeric_limits::max(); + std::uint64_t max_time_bucket = 0; }; -struct EventAggregatorUtilityOutput { +struct EventAggregatorOutput { AggregationMap aggregations; AggregationMap profile_aggregations; AggregationMap system_aggregations; diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h new file mode 100644 index 00000000..08d596a3 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h @@ -0,0 +1,356 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_SERIALIZATION_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_SERIALIZATION_H + +#include +#include + +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +static constexpr std::uint16_t AGG_KEY_NUM_SHARDS = 4096; + +static constexpr std::uint8_t METRIC_FMT_COMPACT = 0; +static constexpr std::uint8_t METRIC_FMT_FULL = 1; +static constexpr std::uint8_t METRIC_FMT_FULL_WITH_SKETCH = 2; + +// Intern dictionary: 0xFFFD + varint(id) -> string value +static constexpr char AGG_INTERN_DICT_PREFIX[] = "\xFF\xFD"; +static constexpr std::size_t AGG_INTERN_DICT_PREFIX_LEN = 2; + +// Global config: 0xFFFE -> time_interval_us (8) + config_hash (4) +static constexpr char 
AGG_GLOBAL_CONFIG_KEY[] = "\xFF\xFE"; +static constexpr std::size_t AGG_GLOBAL_CONFIG_LEN = 12; + +struct AggGlobalConfig { + std::uint64_t time_interval_us = 0; + std::uint32_t config_hash = 0; +}; + +inline std::string serialize_agg_global_config(const AggGlobalConfig& cfg) { + std::string val(AGG_GLOBAL_CONFIG_LEN, '\0'); + val[0] = static_cast((cfg.time_interval_us >> 56) & 0xFF); + val[1] = static_cast((cfg.time_interval_us >> 48) & 0xFF); + val[2] = static_cast((cfg.time_interval_us >> 40) & 0xFF); + val[3] = static_cast((cfg.time_interval_us >> 32) & 0xFF); + val[4] = static_cast((cfg.time_interval_us >> 24) & 0xFF); + val[5] = static_cast((cfg.time_interval_us >> 16) & 0xFF); + val[6] = static_cast((cfg.time_interval_us >> 8) & 0xFF); + val[7] = static_cast(cfg.time_interval_us & 0xFF); + val[8] = static_cast((cfg.config_hash >> 24) & 0xFF); + val[9] = static_cast((cfg.config_hash >> 16) & 0xFF); + val[10] = static_cast((cfg.config_hash >> 8) & 0xFF); + val[11] = static_cast(cfg.config_hash & 0xFF); + return val; +} + +inline AggGlobalConfig deserialize_agg_global_config(std::string_view data) { + AggGlobalConfig cfg; + if (data.size() >= AGG_GLOBAL_CONFIG_LEN) { + cfg.time_interval_us = + (static_cast(static_cast(data[0])) + << 56) | + (static_cast(static_cast(data[1])) + << 48) | + (static_cast(static_cast(data[2])) + << 40) | + (static_cast(static_cast(data[3])) + << 32) | + (static_cast(static_cast(data[4])) + << 24) | + (static_cast(static_cast(data[5])) + << 16) | + (static_cast(static_cast(data[6])) + << 8) | + static_cast(static_cast(data[7])); + cfg.config_hash = + (static_cast(static_cast(data[8])) + << 24) | + (static_cast(static_cast(data[9])) + << 16) | + (static_cast(static_cast(data[10])) + << 8) | + static_cast(static_cast(data[11])); + } + return cfg; +} + +// Per-file: 0xFFFF + file_id (4) -> empty value (presence = aggregated) +static constexpr char AGG_FILE_KEY_PREFIX[] = "\xFF\xFF"; +static constexpr std::size_t AGG_FILE_KEY_PREFIX_LEN 
= 2; +static constexpr std::size_t AGG_FILE_KEY_LEN = + AGG_FILE_KEY_PREFIX_LEN + sizeof(std::int32_t); + +inline std::string make_agg_file_key(std::int32_t file_id) { + std::string key(AGG_FILE_KEY_LEN, '\0'); + key[0] = AGG_FILE_KEY_PREFIX[0]; + key[1] = AGG_FILE_KEY_PREFIX[1]; + key[2] = static_cast((file_id >> 24) & 0xFF); + key[3] = static_cast((file_id >> 16) & 0xFF); + key[4] = static_cast((file_id >> 8) & 0xFF); + key[5] = static_cast(file_id & 0xFF); + return key; +} + +void serialize_agg_key_into(std::string& out, std::uint32_t config_hash, + AggMapType map_type, const AggregationKey& key); + +void serialize_agg_key_into( + std::string& out, std::uint32_t config_hash, AggMapType map_type, + std::string_view cat, std::string_view name, std::uint64_t pid, + std::uint64_t tid, std::string_view hhash, std::string_view fhash, + std::uint64_t time_bucket, + const std::vector>* + extra_keys = nullptr); +std::string serialize_agg_key(std::uint32_t config_hash, AggMapType map_type, + const AggregationKey& key); + +struct DeserializedAggKey { + std::uint32_t config_hash; + AggMapType map_type; + AggregationKey key; +}; +DeserializedAggKey deserialize_agg_key(std::string_view data); + +/// Key view with resolved strings from the intern table. +/// Lifetime: valid as long as aggregation_intern() exists (process lifetime). +struct AggKeyView { + AggMapType map_type; + std::string_view cat; + std::string_view name; + std::uint64_t pid; + std::uint64_t tid; + std::string_view hhash; + std::string_view fhash; + std::uint64_t time_bucket; +}; + +/// Parse aggregation key: reads varint intern IDs and resolves to strings. +/// Returns false if parsing fails. 
+inline bool parse_agg_key_view(std::string_view data, AggKeyView& out) { + if (data.size() < 6) return false; + + const auto* p = reinterpret_cast(data.data()); + const auto* end = p + data.size(); + + p += 2; // shard + + out.map_type = static_cast(*p++); + + auto read_varint = [&]() -> std::uint64_t { + std::uint64_t v = 0; + unsigned shift = 0; + while (p < end) { + auto b = *p++; + v |= static_cast(b & 0x7F) << shift; + if ((b & 0x80) == 0) return v; + shift += 7; + } + return v; + }; + + auto& intern = aggregation_intern(); + auto cat_id = static_cast(read_varint()); + auto name_id = static_cast(read_varint()); + out.pid = read_varint(); + out.tid = read_varint(); + auto hhash_id = static_cast(read_varint()); + auto fhash_id = static_cast(read_varint()); + out.time_bucket = read_varint(); + + out.cat = intern.resolve(cat_id); + out.name = intern.resolve(name_id); + out.hhash = hhash_id ? intern.resolve(hhash_id) : std::string_view{}; + out.fhash = fhash_id ? intern.resolve(fhash_id) : std::string_view{}; + + return true; +} + +void serialize_agg_value_into(std::string& out, + const AggregationMetrics& metrics); +std::string serialize_agg_value(const AggregationMetrics& metrics); +AggregationMetrics deserialize_agg_value(std::string_view data); + +/// Lightweight metrics view for Arrow export - only the fields needed. +struct AggMetricsView { + std::uint64_t count; + std::uint64_t dur_total; + std::uint64_t dur_min; + std::uint64_t dur_max; + std::uint64_t size_total; + std::uint64_t size_min; + std::uint64_t size_max; + std::uint64_t ts; + std::uint64_t te; +}; + +/// Full metrics view including mean/m2 for stddev computation. +/// Use for iter_aggregation which needs mean and stddev columns. 
+struct AggMetricsFullView { + std::uint64_t count; + std::uint64_t dur_total; + std::uint64_t dur_min; + std::uint64_t dur_max; + double dur_mean; + double dur_m2; // For Welford's stddev: stddev = sqrt(m2 / count) + std::uint64_t size_total; + std::uint64_t size_min; + std::uint64_t size_max; + double size_mean; + double size_m2; + std::uint64_t ts; + std::uint64_t te; + + double dur_stddev() const { + return count > 1 ? std::sqrt(dur_m2 / static_cast(count)) : 0.0; + } + double size_stddev() const { + return count > 1 ? std::sqrt(size_m2 / static_cast(count)) + : 0.0; + } +}; + +/// Fast value parser for Arrow export - skips mean/m2/m3/m4/sketch. +inline bool parse_agg_value_view(std::string_view data, AggMetricsView& out) { + const auto* p = reinterpret_cast(data.data()); + const auto* end = p + data.size(); + + auto read_varint = [&]() -> std::uint64_t { + std::uint64_t v = 0; + int shift = 0; + while (p < end) { + std::uint8_t b = *p++; + v |= static_cast(b & 0x7F) << shift; + if ((b & 0x80) == 0) break; + shift += 7; + } + return v; + }; + + auto skip_f64 = [&]() { p += 8; }; + + auto read_metric_stats_partial = + [&](std::uint64_t& total, std::uint64_t& min, std::uint64_t& max) { + auto fmt = read_varint(); + if (fmt == METRIC_FMT_COMPACT) { + auto val = read_varint(); + total = min = max = val; + return; + } + read_varint(); // skip count + total = read_varint(); + min = read_varint(); + max = read_varint(); + skip_f64(); // mean + skip_f64(); // m2 + if (fmt == METRIC_FMT_FULL_WITH_SKETCH) { + auto len = read_varint(); + p += len; + } + }; + + if (p >= end) return false; + + out.count = read_varint(); + read_metric_stats_partial(out.dur_total, out.dur_min, out.dur_max); + read_metric_stats_partial(out.size_total, out.size_min, out.size_max); + out.ts = read_varint(); + out.te = read_varint(); + + return true; +} + +/// Full value parser for iter_aggregation - includes mean/m2 for stddev. 
+inline bool parse_agg_value_full_view(std::string_view data, + AggMetricsFullView& out) { + const auto* p = reinterpret_cast(data.data()); + const auto* end = p + data.size(); + + auto read_varint = [&]() -> std::uint64_t { + std::uint64_t v = 0; + int shift = 0; + while (p < end) { + std::uint8_t b = *p++; + v |= static_cast(b & 0x7F) << shift; + if ((b & 0x80) == 0) break; + shift += 7; + } + return v; + }; + + auto read_f64 = [&]() -> double { + if (p + 8 > end) return 0.0; + std::uint64_t bits = 0; + for (int i = 0; i < 8; ++i) { + bits |= static_cast(*p++) << (i * 8); + } + double result; + std::memcpy(&result, &bits, sizeof(result)); + return result; + }; + + auto read_metric_stats_full = [&](std::uint64_t& total, std::uint64_t& min, + std::uint64_t& max, double& mean, + double& m2) { + auto fmt = read_varint(); + if (fmt == METRIC_FMT_COMPACT) { + auto val = read_varint(); + total = min = max = val; + mean = static_cast(val); + m2 = 0.0; + return; + } + read_varint(); // skip count (use outer count) + total = read_varint(); + min = read_varint(); + max = read_varint(); + mean = read_f64(); + m2 = read_f64(); + if (fmt == METRIC_FMT_FULL_WITH_SKETCH) { + auto len = read_varint(); + p += len; + } + }; + + if (p >= end) return false; + + out.count = read_varint(); + read_metric_stats_full(out.dur_total, out.dur_min, out.dur_max, + out.dur_mean, out.dur_m2); + read_metric_stats_full(out.size_total, out.size_min, out.size_max, + out.size_mean, out.size_m2); + out.ts = read_varint(); + out.te = read_varint(); + + return true; +} + +/// Load intern dictionary from RocksDB into aggregation_intern(). +void load_intern_dictionary(dftracer::utils::rocksdb::RocksDatabase& db); + +/// Flush any new intern entries to RocksDB as 0xFFFD keys. 
+void flush_intern_dictionary( + dftracer::utils::rocksdb::RocksDatabase& db, + dftracer::utils::rocksdb::RocksDatabase::Batch& batch); + +} // namespace dftracer::utils::utilities::composites::dft::aggregators + +namespace dftracer::utils::utilities::indexer { +class IndexBatchSink; +} + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +/// Sink-backed overload: flushes new intern entries via +/// `IndexBatchSink::insert_aggregation_put`. Used by the distributed SST +/// pipeline where the visitor writes to an SST instead of a live DB. +void flush_intern_dictionary( + dftracer::utils::utilities::indexer::IndexBatchSink& sink); + +} // namespace dftracer::utils::utilities::composites::dft::aggregators + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_SERIALIZATION_H diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h new file mode 100644 index 00000000..bdc9b8e2 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h @@ -0,0 +1,130 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_VISITOR_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_VISITOR_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +class AggregationVisitor : public DftEventVisitor { + public: + /// Legacy mode: flush directly to a live RocksDatabase via Merge/Put + /// during parse. FLUSH_THRESHOLD commits the visitor's own batch to + /// bound memory. Used by `aggregator_utility.cpp`, + /// `dftracer_aggregator.cpp`, `dftracer_organize.cpp`. 
+ AggregationVisitor(std::shared_ptr db, + std::uint32_t config_hash, AggregationConfig config, + std::string file_path); + + /// Distributed mode: flush to a per-visitor SstWriterContext rooted at + /// `staging_dir`. FLUSH_THRESHOLD emits partial SSTs (mixed Put+Merge) + /// so the in-memory map never exceeds the threshold. At + /// `on_file_complete`, the writer context is committed and its + /// Artifacts are embedded in the ChunkAggregationOutput so the worker + /// / coordinator can forward them to the main `SstArtifactRegistry`. + /// + /// `staging_dir` is typically the same node-local dir the rest of the + /// SST pipeline uses. `batch_id_prefix` is joined with a per-file + /// suffix to form a unique SstWriterContext root (so concurrent + /// per-file visitors never collide). + AggregationVisitor(std::string staging_dir, std::string batch_id_prefix, + std::uint32_t config_hash, AggregationConfig config, + std::string file_path); + + void begin(std::size_t num_checkpoints) override; + void on_checkpoint(std::size_t checkpoint_idx) override; + void on_event(const EventRecord& record) override; + coro::CoroTask on_file_complete() override; + bool needs_args_map() const override { return true; } + + ChunkAggregationOutput take_output(); + void flush_to_batch(rocksdb::RocksDatabase::Batch& batch); + + const std::unordered_set& observed_extra_keys() const { + return observed_extra_keys_; + } + const std::unordered_set& observed_custom_metrics() const { + return observed_custom_metrics_; + } + + /// Distributed mode only: one or more per-flush SST artifact sets + /// produced by this visitor after `on_file_complete`. Each flush + /// emits its own SST(s) because `SstFileWriter` requires strictly + /// ascending keys and merge operands for the same key across flushes + /// would violate that invariant. Empty in legacy mode. 
+ std::vector& + aggregation_artifacts() noexcept { + return sst_artifacts_; + } + + private: + void seal_local_buffer(); + void handle_system_event(const EventRecord& record); + + // Legacy (RocksDatabase-backed) mode. + std::shared_ptr db_; + std::vector pending_batches_; + + // Distributed (SST-backed) mode. The visitor rotates sst_sink_ per + // flush to keep each SST's key space strictly ascending (merge + // operands for the same key across flushes must live in separate + // SSTs). `sst_staging_dir_` and `sst_batch_prefix_` persist across + // rotations so the next SstWriterContext can be constructed at + // flush time. + std::unique_ptr sst_sink_; + std::string sst_staging_dir_; + std::string sst_batch_prefix_; + std::size_t sst_flush_counter_ = 0; + std::vector + sst_artifacts_; + + std::uint32_t config_hash_; + AggregationConfig config_; + std::string file_path_; + + std::shared_ptr tracker_; + std::size_t events_processed_ = 0; + + static constexpr std::size_t FLUSH_THRESHOLD = 65536; + std::unordered_map + local_buffer_; + std::string key_buf_; + std::string val_buf_; + + AggregationMetrics* last_entry_ = nullptr; + std::string_view last_key_; + + // System metrics buffer (keyed by hhash + time_bucket) + std::unordered_map + system_buffer_; + std::string system_key_buf_; + std::string system_val_buf_; + + std::unordered_set observed_extra_keys_; + std::unordered_set observed_custom_metrics_; + std::unordered_set observed_system_metrics_; + + std::uint64_t min_time_bucket_ = std::numeric_limits::max(); + std::uint64_t max_time_bucket_ = 0; +}; + +} // namespace dftracer::utils::utilities::composites::dft::aggregators + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_VISITOR_H diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.h index ad4b7e2b..bdd9fc5f 100644 --- 
a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.h @@ -6,7 +6,7 @@ namespace dftracer::utils::utilities::composites::dft::aggregators { -using AggregatorSummaryInput = EventAggregatorUtilityOutput; +using AggregatorSummaryInput = EventAggregatorOutput; using AggregatorSummaryOutput = void; class AggregatorSummaryUtility diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_types.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_types.h new file mode 100644 index 00000000..9ef45743 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_types.h @@ -0,0 +1,164 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATOR_TYPES_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATOR_TYPES_H + +#include +#include +#include +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW +#include +#endif + +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +/// Context for converting aggregation data to dfanalyzer-compatible Arrow +/// format. +struct DfanalyzerContext { + /// Hash tables for resolving fhash/hhash to file_name/host_name. + const std::unordered_map* file_hashes = nullptr; + const std::unordered_map* host_hashes = nullptr; + + const common::query::Query* query_filter = nullptr; + + /// Time origin (minimum time_bucket) for normalization. + std::uint64_t time_origin = 0; + + /// Time resolution (microseconds per output unit, default 1e6 = seconds). + double time_resolution = 1e6; + + /// Time granularity in seconds (bucket width for time_range computation). 
+ double time_granularity = 1.0; +}; + +enum class AggregationBatchType { EVENT, PROFILE, SYSTEM }; + +struct CountConfidenceInterval { + double lower = 0.0; + double upper = 0.0; +}; + +struct AggregationEntry { + AggregationKey key; + AggregationMetrics metrics; + bool is_approximated = false; + CountConfidenceInterval count_ci; + + AggregationEntry() = default; + AggregationEntry(AggregationKey k, AggregationMetrics m) + : key(std::move(k)), metrics(std::move(m)) {} + + /// Create a ValueMap from the key and metrics for query evaluation. + /// Includes cat, name, pid, tid, hhash, fhash, time_bucket, extra_keys, + /// and aggregation metrics (count, dur_total, dur_min, dur_max, etc.). + common::query::ValueMap to_value_map() const { + common::query::ValueMap fields; + // Key fields + fields["cat"] = std::string(key.cat()); + fields["name"] = std::string(key.name()); + fields["pid"] = static_cast(key.pid); + fields["tid"] = static_cast(key.tid); + if (!key.hhash().empty()) { + fields["hhash"] = std::string(key.hhash()); + } + if (!key.fhash().empty()) { + fields["fhash"] = std::string(key.fhash()); + } + fields["time_bucket"] = key.time_bucket; + // Include extra_keys (args fields used for grouping) + if (key.extra_keys) { + for (const auto& [key_id, value_id] : *key.extra_keys) { + auto key_str = + std::string(aggregation_intern().resolve(key_id)); + auto value_str = + std::string(aggregation_intern().resolve(value_id)); + fields[key_str] = value_str; + } + } + // Aggregation metrics + fields["count"] = metrics.count; + fields["dur_total"] = metrics.duration.total; + fields["dur_min"] = metrics.duration.min; + fields["dur_max"] = metrics.duration.max; + fields["dur_mean"] = metrics.duration.mean; + fields["size_total"] = metrics.size.total; + fields["size_min"] = metrics.size.min; + fields["size_max"] = metrics.size.max; + fields["size_mean"] = metrics.size.mean; + fields["ts"] = metrics.ts; + fields["te"] = metrics.te; + // Custom metrics (arbitrary args fields 
aggregated as numeric stats) + if (metrics.custom_metrics) { + for (const auto& [name, stats] : *metrics.custom_metrics) { + fields[name + "_total"] = stats.total; + fields[name + "_min"] = stats.min; + fields[name + "_max"] = stats.max; + fields[name + "_mean"] = stats.mean; + } + } + return fields; + } + + /// Check if this entry matches a query. + bool matches(const common::query::Query& query) const { + return query.evaluate(to_value_map()); + } +}; + +struct AggregationBatch { + std::vector entries; + AggregationBatchType batch_type = AggregationBatchType::EVENT; + std::size_t total_events_processed = 0; + std::size_t total_files_processed = 0; + std::size_t total_bytes_processed = 0; + bool has_approximated_entries = false; + + // When set, to_arrow() uses these instead of discovering from entries. + // All batches in an IPC file must use the same columns for a consistent + // schema. + const std::vector* global_extra_key_ids = nullptr; + const std::vector* global_custom_metric_names = nullptr; + + /// Filter entries by query, returning a new batch with matching entries. + AggregationBatch filter(const common::query::Query& query) const { + AggregationBatch filtered; + filtered.batch_type = batch_type; + filtered.total_events_processed = total_events_processed; + filtered.total_files_processed = total_files_processed; + filtered.total_bytes_processed = total_bytes_processed; + filtered.has_approximated_entries = has_approximated_entries; + filtered.global_extra_key_ids = global_extra_key_ids; + filtered.global_custom_metric_names = global_custom_metric_names; + + for (const auto& entry : entries) { + if (entry.matches(query)) { + filtered.entries.push_back(entry); + } + } + return filtered; + } + +#ifdef DFTRACER_UTILS_ENABLE_ARROW + common::arrow::ArrowExportResult to_arrow() const; + + /// Convert to dfanalyzer-compatible Arrow format. 
+ /// Outputs columns matching dfanalyzer schema: + /// - Events/Profiles: cat, func_name, pid, tid, file_hash, host_hash, + /// file_name, host_name, proc_name, io_cat, acc_pat, count, time, size, + /// time_min, time_max, size_min, size_max, time_range, time_start, + /// time_end + /// - System: host_hash, time_range, sys_cpu_*, sys_mem_* + common::arrow::ArrowExportResult to_dfanalyzer_arrow( + const DfanalyzerContext& ctx) const; +#endif +}; + +} // namespace dftracer::utils::utilities::composites::dft::aggregators + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATOR_TYPES_H diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.h index 6c9068cf..f68b8ed2 100644 --- a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.h @@ -2,21 +2,14 @@ #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATOR_UTILITY_H #include +#include #include #include -#include -#include - -#ifdef DFTRACER_UTILS_ENABLE_ARROW -#include -#endif +#include #include -#include #include #include -#include -#include namespace dftracer::utils::utilities::composites::dft::aggregators { @@ -27,8 +20,7 @@ struct AggregatorInput { std::size_t checkpoint_size = 32 * 1024 * 1024; std::string index_dir; bool force_rebuild = false; - std::size_t chunk_size_mb = 64; - std::size_t batch_size_mb = 4; + std::size_t parallelism = 0; // 0 = use all available threads std::size_t event_batch_size = 10000; AggregatorInput& with_directory(const std::string& dir); @@ -36,27 +28,13 @@ struct AggregatorInput { AggregatorInput& with_checkpoint_size(std::size_t sz); AggregatorInput& with_index_dir(const std::string& dir); AggregatorInput& with_force_rebuild(bool force); - AggregatorInput& with_chunk_size_mb(std::size_t mb); - AggregatorInput& 
with_batch_size_mb(std::size_t mb); + AggregatorInput& with_parallelism(std::size_t n); AggregatorInput& with_event_batch_size(std::size_t sz); }; -enum class AggregationBatchType { EVENT, PROFILE, SYSTEM }; - -struct AggregationBatch { - std::vector> entries; - AggregationBatchType batch_type = AggregationBatchType::EVENT; - std::size_t total_events_processed = 0; - std::size_t total_files_processed = 0; - std::size_t total_bytes_processed = 0; - -#ifdef DFTRACER_UTILS_ENABLE_ARROW - common::arrow::ArrowExportResult to_arrow() const; -#endif -}; - class AggregatorUtility - : public StreamingUtility { + : public StreamingUtility { public: coro::AsyncGenerator process( const AggregatorInput& input) override; diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregators.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregators.h index 9e4a43c7..cbf14485 100644 --- a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregators.h +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregators.h @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATORS_H diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.h b/include/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.h index 40b69386..7b957024 100644 --- a/include/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.h @@ -15,13 +15,13 @@ namespace dftracer::utils::utilities::composites::dft::aggregators { struct AssociationResolverInput { - EventAggregatorUtilityOutput aggregations; + EventAggregatorOutput aggregations; std::vector> trackers; AggregationConfig config; }; struct AssociationResolverOutput { - EventAggregatorUtilityOutput aggregations; + 
EventAggregatorOutput aggregations; std::unordered_set root_pids; std::uint64_t trace_duration = 0; BoundaryTimeRangesMap boundary_ranges; @@ -36,10 +36,9 @@ class AssociationResolverUtility const AssociationResolverInput& input) override; private: - void compute_trace_metadata( - const AssociationTracker& tracker, - const EventAggregatorUtilityOutput& aggregations, - AssociationResolverOutput& output); + void compute_trace_metadata(const AssociationTracker& tracker, + const EventAggregatorOutput& aggregations, + AssociationResolverOutput& output); }; } // namespace dftracer::utils::utilities::composites::dft::aggregators diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h b/include/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h index ddee89ae..00b4444f 100644 --- a/include/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h @@ -1,20 +1,18 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_ASSOCIATION_TRACKER_H #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_ASSOCIATION_TRACKER_H -#include #include +#include #include #include +#include #include #include #include namespace dftracer::utils::utilities::composites::dft::aggregators { -// Import JsonValue from common json namespace -using dftracer::utils::utilities::common::json::JsonValue; - struct BoundaryInterval { std::string name; std::string value; @@ -34,7 +32,9 @@ class AssociationTracker { public: AssociationTracker() = default; - void extract_from_event(const JsonValue& json, const JsonValue& args, + void extract_from_event(std::string_view name, std::uint64_t pid, + std::uint64_t ts, std::uint64_t dur, + const ArgsMap& args, const AggregationConfig& config); void finalize(); @@ -51,6 +51,9 @@ class AssociationTracker { std::unordered_set get_root_pids() const; void merge(const AssociationTracker& other); + + 
std::string serialize() const; + static AssociationTracker deserialize(std::string_view data); }; } // namespace dftracer::utils::utilities::composites::dft::aggregators diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.h b/include/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.h index 0691c45e..b6eb202e 100644 --- a/include/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.h @@ -30,14 +30,14 @@ using dftracer::utils::utilities::composites::dft::DFTracerEvent; struct ChunkAggregatorInput { std::string file_path; std::string index_path; - std::size_t start_byte; - std::size_t end_byte; - std::size_t start_line; - std::size_t end_line; + std::size_t start_byte = 0; + std::size_t end_byte = 0; + std::size_t start_line = 0; + std::size_t end_line = 0; AggregationConfig config; std::optional query; - std::size_t checkpoint_size; - int chunk_index; + std::size_t checkpoint_size = 0; + int chunk_index = 0; std::size_t batch_size = 4 * 1024 * 1024; @@ -87,17 +87,6 @@ struct ChunkAggregatorInput { class ChunkAggregatorUtility : public utilities::Utility { - private: - std::uint64_t compute_time_bucket(std::uint64_t timestamp, - std::uint64_t duration, - const AggregationConfig& config) const; - - AggregationKey build_key(const DFTracerEvent& ev, - const AggregationConfig& config) const; - - void update_entry(const DFTracerEvent& ev, const AggregationConfig& config, - AggregationMap& aggregations, const AggregationKey& key); - public: ChunkAggregatorUtility() = default; diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h b/include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h new file mode 100644 index 00000000..077de51a --- /dev/null +++ 
b/include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h @@ -0,0 +1,132 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_EVENT_AGGREGATOR_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_EVENT_AGGREGATOR_H + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::rocksdb { +class RocksDatabase; +} + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +class EventAggregator { + public: + EventAggregator(); + + EventAggregator(std::shared_ptr db, + std::uint32_t config_hash); + + void merge_chunk(ChunkAggregationOutput&& chunk_output); + + EventAggregatorOutput finalize(); + + using ScanCallback = std::function; + std::size_t scan(ScanCallback callback) const; + std::size_t scan_shard_range(std::uint16_t shard_begin, + std::uint16_t shard_end, + ScanCallback callback) const; + + /// Type-erased raw scan. Use the templated overload below for zero- + /// allocation calls. + using RawScanCallbackFn = bool (*)(void* ctx, std::string_view key_bytes, + std::string_view value_bytes); + std::size_t scan_shard_range_raw_fn(std::uint16_t shard_begin, + std::uint16_t shard_end, + RawScanCallbackFn fn, void* ctx) const; + + /// Template wrapper: forwards any callable `(sv, sv) -> bool` into the + /// raw scan with zero heap allocations. The adapter lambda is a captureless + /// `+[]` so it decays to a plain function pointer. + template + std::size_t scan_shard_range_raw(std::uint16_t shard_begin, + std::uint16_t shard_end, + F&& callback) const { + auto adapter = + +[](void* ctx, std::string_view k, std::string_view v) -> bool { + return (*static_cast*>(ctx))(k, v); + }; + return scan_shard_range_raw_fn(shard_begin, shard_end, adapter, + static_cast(&callback)); + } + + /// Move trackers out without materializing the full aggregation map. 
+ std::vector> take_trackers(); + + /// Merge fresh trackers with any persisted tracker from the DB, + /// persist the result, and return the merged tracker. + std::unique_ptr build_global_tracker(); + + struct ObservedColumns { + std::vector extra_key_ids; + std::vector custom_metric_names; + }; + ObservedColumns observed_columns(); + void add_observed_extra_key(const std::string& key); + void add_observed_custom_metric(const std::string& name); + + std::size_t total_events() const { return total_events_.load(); } + std::size_t total_bytes() const { return total_bytes_.load(); } + std::size_t total_files() const { return unique_files_.size(); } + + void update_time_bounds(std::uint64_t time_bucket); + std::uint64_t min_time_bucket() const; + std::uint64_t max_time_bucket() const; + + struct TimeBoundsResult { + std::uint64_t min_time_bucket; + std::uint64_t max_time_bucket; + bool valid; + }; + TimeBoundsResult query_time_bounds() const; + + bool is_rocksdb_mode() const { return rocksdb_mode_; } + std::shared_ptr db() const { return db_; } + std::uint32_t config_hash() const { return config_hash_; } + + static std::shared_ptr open_with_merge_operator( + const std::string& index_path); + + /// Read-only variant for multi-process concurrent scan (e.g. MPI ranks + /// covering disjoint shard-prefix ranges of a shared unified index). + /// Multi-process RocksDB writes are forbidden; read-only opens do not + /// hold the exclusive LOCK, so N ranks can open the same DB at once. 
+ static std::shared_ptr + open_read_only_with_merge_operator(const std::string& index_path); + + private: + void merge_chunk_memory(ChunkAggregationOutput&& chunk_output); + void merge_chunk_rocksdb(ChunkAggregationOutput&& chunk_output); + + bool rocksdb_mode_ = false; + + // In-memory state + EventAggregatorOutput state_; + std::unordered_set unique_files_; + + // RocksDB state + std::shared_ptr db_; + std::uint32_t config_hash_ = 0; + std::atomic total_events_{0}; + std::atomic total_bytes_{0}; + std::vector> trackers_; + + std::set observed_extra_key_ids_; + std::set observed_custom_metric_names_; + + std::atomic min_time_bucket_{UINT64_MAX}; + std::atomic max_time_bucket_{0}; +}; + +} // namespace dftracer::utils::utilities::composites::dft::aggregators + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_EVENT_AGGREGATOR_H diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.h b/include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.h deleted file mode 100644 index ab14e8d9..00000000 --- a/include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_EVENT_AGGREGATOR_UTILITY_H -#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_EVENT_AGGREGATOR_UTILITY_H - -#include -#include -#include - -#include -#include -#include -#include - -namespace dftracer::utils::utilities::composites::dft::aggregators { - -struct EventAggregatorUtilityInput { - std::vector chunk_outputs; -}; - -class EventAggregatorUtility - : public utilities::Utility { - public: - coro::CoroTask process( - const EventAggregatorUtilityInput& input) override; - - void merge_chunk(ChunkAggregationOutput&& chunk_output); - EventAggregatorUtilityOutput finalize(); - - private: - EventAggregatorUtilityOutput state_; - std::unordered_set unique_files_; -}; - -} // namespace 
dftracer::utils::utilities::composites::dft::aggregators - -#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_EVENT_AGGREGATOR_UTILITY_H diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.h b/include/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.h index d56ce0e5..a95d9bc3 100644 --- a/include/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.h @@ -1,50 +1,68 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_PERFETTO_TRACE_WRITER_UTILITY_H #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_PERFETTO_TRACE_WRITER_UTILITY_H +#include +#include #include #include #include #include +#include #include +#include #include -#include +#include #include namespace dftracer::utils::utilities::composites::dft::aggregators { +class EventAggregator; + enum class PerfettoEventFormat { COUNTER, ASYNC, REGULAR }; struct PerfettoTraceWriterInput { std::string output_path; - AssociationResolverOutput resolver_output; + const EventAggregator* aggregator = nullptr; + const AssociationTracker* tracker = nullptr; + const AggregationConfig* agg_config = nullptr; + std::unique_ptr owned_tracker; + std::unordered_set root_pids; + std::uint64_t trace_duration = 0; + BoundaryTimeRangesMap boundary_ranges; bool compute_statistics = true; bool compute_percentiles = false; std::vector percentiles; bool compress = false; int compression_level = 6; PerfettoEventFormat format = PerfettoEventFormat::COUNTER; + /// Workers add their per-shard key count here if non-null. + std::atomic* keys_written = nullptr; + /// Concatenate shards into `output_path` and unlink them on SHARDED + /// layouts (typically NFS). Callers that read shards directly leave false. 
+ bool merge_on_sharded = false; + /// Total shard-prefix range (half-open) this invocation is responsible + /// for. Defaults cover the whole key space. MPI drivers set a disjoint + /// range per rank so N ranks collectively cover `[0, AGG_KEY_NUM_SHARDS)` + /// without overlap. Local coroutine workers within a single process + /// further subdivide this range. + std::uint16_t shard_begin = 0; + std::uint16_t shard_end = 0; // 0 means "use AGG_KEY_NUM_SHARDS" + /// Emit the JSON array prologue (`[\n` + trace_metadata + root_process + /// markers) to this invocation's output. MPI drivers set this on rank 0 + /// only so concatenated rank outputs produce exactly one array open. + bool emit_header = true; + /// Emit the JSON array epilogue (`]`). MPI drivers set this on the last + /// rank only. + bool emit_footer = true; }; using PerfettoTraceWriterOutput = bool; class PerfettoTraceWriterUtility - : public utilities::Utility { - private: - std::uint64_t generate_synthetic_tid(const AggregationKey& key) const; - void append_json_string(std::string& buffer, std::string_view str) const; - void append_double(std::string& buffer, double value) const; - void append_metric_stats(std::string& buffer, const MetricStats& stats, - std::uint64_t count, bool compute_statistics, - bool compute_percentiles, - const std::vector& percentiles) const; - void append_event_args(std::string& buffer, const AggregationKey& key, - const AggregationMetrics& metrics, - bool compute_statistics, bool compute_percentiles, - const std::vector& percentiles, - std::uint64_t real_tid = 0) const; - + : public utilities::Utility< + PerfettoTraceWriterInput, PerfettoTraceWriterOutput, + utilities::tags::Parallelizable, utilities::tags::NeedsContext> { public: coro::CoroTask process( const PerfettoTraceWriterInput& input) override; diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics.h b/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics.h new 
file mode 100644 index 00000000..32d406b1 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics.h @@ -0,0 +1,206 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_H + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +using common::statistics::DDSketch; + +struct FloatMetricStats { + std::uint64_t count = 0; + double total = 0.0; + double min = std::numeric_limits::max(); + double max = std::numeric_limits::lowest(); + double mean = 0.0; + double m2 = 0.0; + std::unique_ptr sketch; + double sketch_accuracy_ = 0.01; + + explicit FloatMetricStats(double relative_accuracy = 0.01) + : sketch_accuracy_(relative_accuracy) {} + + FloatMetricStats(const FloatMetricStats& other) + : count(other.count), + total(other.total), + min(other.min), + max(other.max), + mean(other.mean), + m2(other.m2), + sketch(other.sketch ? std::make_unique(*other.sketch) + : nullptr), + sketch_accuracy_(other.sketch_accuracy_) {} + + FloatMetricStats& operator=(const FloatMetricStats& other) { + if (this != &other) { + count = other.count; + total = other.total; + min = other.min; + max = other.max; + mean = other.mean; + m2 = other.m2; + sketch = other.sketch ? 
std::make_unique(*other.sketch) + : nullptr; + sketch_accuracy_ = other.sketch_accuracy_; + } + return *this; + } + + FloatMetricStats(FloatMetricStats&&) = default; + FloatMetricStats& operator=(FloatMetricStats&&) = default; + + void update(double value, bool compute_percentiles = false) { + count++; + total += value; + if (value < min) min = value; + if (value > max) max = value; + + // Welford's online mean/variance + double delta = value - mean; + mean += delta / static_cast(count); + double delta2 = value - mean; + m2 += delta * delta2; + + if (compute_percentiles) { + if (!sketch) { + sketch = std::make_unique(sketch_accuracy_); + } + sketch->add(value); + } + } + + void merge_from(const FloatMetricStats& other) { + if (other.count == 0) return; + if (count == 0) { + *this = other; + return; + } + + std::uint64_t new_count = count + other.count; + double delta = other.mean - mean; + double new_mean = mean + delta * static_cast(other.count) / + static_cast(new_count); + double new_m2 = m2 + other.m2 + + delta * delta * static_cast(count) * + static_cast(other.count) / + static_cast(new_count); + + count = new_count; + total += other.total; + if (other.min < min) min = other.min; + if (other.max > max) max = other.max; + mean = new_mean; + m2 = new_m2; + + if (other.sketch) { + if (!sketch) { + sketch = std::make_unique(*other.sketch); + } else { + sketch->merge(*other.sketch); + } + } + } + + double get_stddev() const { + if (count < 2) return 0.0; + return std::sqrt(m2 / static_cast(count - 1)); + } +}; + +using FloatMetricsMap = + std::unordered_map; + +struct SystemAggregationMetrics { + std::uint64_t count = 0; + + // Timestamp bounds for this bucket + std::uint64_t ts = std::numeric_limits::max(); + std::uint64_t te = 0; + + // Named system metrics (aggregated as mean per bucket) + std::unique_ptr metrics; + + double sketch_accuracy = 0.01; + + explicit SystemAggregationMetrics(double relative_accuracy = 0.01) + : sketch_accuracy(relative_accuracy) {} + 
+ SystemAggregationMetrics(const SystemAggregationMetrics& other) + : count(other.count), + ts(other.ts), + te(other.te), + metrics(other.metrics + ? std::make_unique(*other.metrics) + : nullptr), + sketch_accuracy(other.sketch_accuracy) {} + + SystemAggregationMetrics& operator=(const SystemAggregationMetrics& other) { + if (this != &other) { + count = other.count; + ts = other.ts; + te = other.te; + metrics = other.metrics + ? std::make_unique(*other.metrics) + : nullptr; + sketch_accuracy = other.sketch_accuracy; + } + return *this; + } + + SystemAggregationMetrics(SystemAggregationMetrics&&) = default; + SystemAggregationMetrics& operator=(SystemAggregationMetrics&&) = default; + + void update_metric(std::string_view name, double value, + bool compute_percentiles = false) { + if (!metrics) { + metrics = std::make_unique(); + } + auto it = metrics->find(name); + if (it == metrics->end()) { + it = metrics + ->emplace(std::string(name), + FloatMetricStats(sketch_accuracy)) + .first; + } + it->second.update(value, compute_percentiles); + } + + void update_timestamp(std::uint64_t event_ts) { + if (event_ts < ts) ts = event_ts; + if (event_ts > te) te = event_ts; + } + + void merge_from(const SystemAggregationMetrics& other) { + count += other.count; + if (other.ts < ts) ts = other.ts; + if (other.te > te) te = other.te; + + if (other.metrics) { + if (!metrics) { + metrics = std::make_unique(); + } + for (const auto& [name, stats] : *other.metrics) { + auto it = metrics->find(name); + if (it == metrics->end()) { + it = metrics + ->emplace(name, FloatMetricStats(sketch_accuracy)) + .first; + } + it->second.merge_from(stats); + } + } + } +}; + +} // namespace dftracer::utils::utilities::composites::dft::aggregators + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_H diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.h 
b/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.h new file mode 100644 index 00000000..1fee3c47 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.h @@ -0,0 +1,26 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_MERGE_OPERATOR_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_MERGE_OPERATOR_H + +#include + +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +class SystemMetricsMergeOperator : public ::rocksdb::MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; + + bool PartialMerge(const ::rocksdb::Slice& key, + const ::rocksdb::Slice& left_operand, + const ::rocksdb::Slice& right_operand, + std::string* new_value, + ::rocksdb::Logger* logger) const override; + + const char* Name() const override { return "SystemMetricsMergeOperator"; } +}; + +} // namespace dftracer::utils::utilities::composites::dft::aggregators + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_MERGE_OPERATOR_H diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.h b/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.h new file mode 100644 index 00000000..db5a16f8 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.h @@ -0,0 +1,37 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_SERIALIZATION_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_SERIALIZATION_H + +#include + +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +// System metrics key: [hhash:var][time_bucket:varint] +// Simpler key than regular aggregation since system metrics are 
host-level + +struct SystemMetricKey { + std::string hhash; + std::uint64_t time_bucket = 0; +}; + +void serialize_system_key_into(std::string& out, std::string_view hhash, + std::uint64_t time_bucket); +std::string serialize_system_key(std::string_view hhash, + std::uint64_t time_bucket); + +struct DeserializedSystemKey { + SystemMetricKey key; +}; +DeserializedSystemKey deserialize_system_key(std::string_view data); + +void serialize_system_value_into(std::string& out, + const SystemAggregationMetrics& metrics); +std::string serialize_system_value(const SystemAggregationMetrics& metrics); +SystemAggregationMetrics deserialize_system_value(std::string_view data); + +} // namespace dftracer::utils::utilities::composites::dft::aggregators + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_SERIALIZATION_H diff --git a/include/dftracer/utils/utilities/composites/dft/args_map.h b/include/dftracer/utils/utilities/composites/dft/args_map.h new file mode 100644 index 00000000..7bd92dfc --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/args_map.h @@ -0,0 +1,216 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_ARGS_MAP_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_ARGS_MAP_H + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft { + +using ArgsValue = std::variant; + +class ArgsValueProxy { + const ArgsValue* val_; + + public: + explicit ArgsValueProxy(const ArgsValue* v = nullptr) : val_(v) {} + + bool exists() const { + return val_ != nullptr && + !std::holds_alternative(*val_); + } + explicit operator bool() const { return exists(); } + bool is_null() const { return !exists(); } + bool is_string() const { + return val_ && std::holds_alternative(*val_); + } + bool is_uint() const { + return val_ && std::holds_alternative(*val_); + } + bool is_int() const { + return val_ && std::holds_alternative(*val_); + } + bool 
is_number() const { + return val_ && (std::holds_alternative(*val_) || + std::holds_alternative(*val_) || + std::holds_alternative(*val_)); + } + bool is_bool() const { return val_ && std::holds_alternative(*val_); } + bool is_object() const { return false; } + bool is_array() const { return false; } + + template + T get(const T& default_val = T{}) const { + if (!val_ || std::holds_alternative(*val_)) + return default_val; + + if constexpr (std::is_same_v) { + if (auto* p = std::get_if(val_)) return *p; + return default_val; + } else if constexpr (std::is_same_v) { + if (auto* p = std::get_if(val_)) return *p; + return default_val; + } else if constexpr (std::is_same_v) { + if (auto* p = std::get_if(val_)) + return std::string_view(*p); + return default_val; + } else if constexpr (std::is_same_v) { + if (auto* p = std::get_if(val_)) return p->c_str(); + return default_val; + } else if constexpr (std::is_same_v) { + if (auto* p = std::get_if(val_)) return *p; + if (auto* p = std::get_if(val_)) { + if (*p >= 0) return static_cast(*p); + } + if (auto* p = std::get_if(val_)) { + if (*p >= 0 && *p <= static_cast( + std::numeric_limits::max())) + return static_cast(*p); + } + return default_val; + } else if constexpr (std::is_same_v) { + if (auto* p = std::get_if(val_)) return *p; + if (auto* p = std::get_if(val_)) { + if (*p <= + static_cast(std::numeric_limits::max())) + return static_cast(*p); + } + if (auto* p = std::get_if(val_)) { + return static_cast(*p); + } + return default_val; + } else if constexpr (std::is_same_v) { + if (auto* p = std::get_if(val_)) return *p; + if (auto* p = std::get_if(val_)) + return static_cast(*p); + if (auto* p = std::get_if(val_)) + return static_cast(*p); + return default_val; + } else if constexpr (std::is_same_v) { + return static_cast( + get(static_cast(default_val))); + } else if constexpr (std::is_integral_v && std::is_unsigned_v) { + return static_cast( + get(static_cast(default_val))); + } else if constexpr (std::is_integral_v 
&& std::is_signed_v) { + return static_cast( + get(static_cast(default_val))); + } else { + static_assert(!sizeof(T), + "Unsupported type for ArgsValueProxy::get()"); + } + } + + template + std::optional get_optional() const { + if (!val_ || std::holds_alternative(*val_)) + return std::nullopt; + + if constexpr (std::is_same_v) { + if (auto* p = std::get_if(val_)) return *p; + return std::nullopt; + } else if constexpr (std::is_same_v) { + if (auto* p = std::get_if(val_)) + return std::string_view(*p); + return std::nullopt; + } else if constexpr (std::is_same_v) { + if (auto* p = std::get_if(val_)) return *p; + if (auto* p = std::get_if(val_)) { + if (*p >= 0) return static_cast(*p); + } + return std::nullopt; + } else if constexpr (std::is_same_v) { + if (auto* p = std::get_if(val_)) return *p; + if (auto* p = std::get_if(val_)) { + if (*p <= + static_cast(std::numeric_limits::max())) + return static_cast(*p); + } + return std::nullopt; + } else if constexpr (std::is_same_v) { + if (auto* p = std::get_if(val_)) return *p; + if (auto* p = std::get_if(val_)) + return static_cast(*p); + if (auto* p = std::get_if(val_)) + return static_cast(*p); + return std::nullopt; + } else if constexpr (std::is_same_v) { + if (auto* p = std::get_if(val_)) return *p; + return std::nullopt; + } else { + static_assert( + !sizeof(T), + "Unsupported type for ArgsValueProxy::get_optional()"); + } + } +}; + +class ArgsMap { + using Map = dftracer::utils::StringViewMap; + Map data_; + bool valid_ = false; + + static dftracer::utils::StringIntern& key_intern() { + static dftracer::utils::StringIntern instance; + return instance; + } + + public: + ArgsMap() = default; + + bool exists() const { return valid_; } + explicit operator bool() const { return valid_; } + + void set_valid(bool v) { valid_ = v; } + + void insert(std::string_view key, ArgsValue value) { + auto interned = std::string(key_intern().intern(key)); + data_.emplace(std::move(interned), std::move(value)); + } + + void 
clear() { + data_.clear(); + valid_ = false; + } + + ArgsValueProxy operator[](std::string_view key) const { + if (!valid_) return ArgsValueProxy{}; + auto it = data_.find(key); + return it != data_.end() ? ArgsValueProxy{&it->second} + : ArgsValueProxy{}; + } + + ArgsValueProxy operator[](const char* key) const { + return (*this)[std::string_view(key)]; + } + + ArgsValueProxy operator[](const std::string& key) const { + return (*this)[std::string_view(key)]; + } + + ArgsValueProxy at(const char* key) const { return (*this)[key]; } + ArgsValueProxy at(const std::string& key) const { return (*this)[key]; } + ArgsValueProxy at(std::string_view key) const { return (*this)[key]; } + + template + void for_each_member(Fn&& fn) const { + if (!valid_) return; + for (const auto& [k, v] : data_) { + fn(std::string_view(k), ArgsValueProxy{&v}); + } + } + + const Map& raw() const { return data_; } +}; + +} // namespace dftracer::utils::utilities::composites::dft + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_ARGS_MAP_H diff --git a/include/dftracer/utils/utilities/composites/dft/comparator/comparison_config.h b/include/dftracer/utils/utilities/composites/dft/comparator/comparison_config.h index 87acb4fa..be5feb3c 100644 --- a/include/dftracer/utils/utilities/composites/dft/comparator/comparison_config.h +++ b/include/dftracer/utils/utilities/composites/dft/comparator/comparison_config.h @@ -1,6 +1,8 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_COMPARATOR_COMPARISON_CONFIG_H #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_COMPARATOR_COMPARISON_CONFIG_H +#include + #include #include #include @@ -87,8 +89,10 @@ struct ComparisonConfig { std::size_t executor_threads = 0; /// Checkpoint size for index building (0 = default). std::size_t checkpoint_size = 0; - /// Directory for `.dftindex` stores. - std::string index_dir; + /// Directory for baseline `.dftindex` store (empty = co-located). 
+ std::string baseline_index_dir; + /// Directory for variant `.dftindex` store (empty = co-located). + std::string variant_index_dir; /// Force rebuild of existing indexes. bool force_rebuild = false; @@ -109,7 +113,7 @@ struct ComparisonConfig { void resolve(); private: - static bool parse_node(void* yyjson_val_ptr, ComparisonNode& node, + static bool parse_node(simdjson::dom::element val, ComparisonNode& node, std::string& error); void resolve_node(ComparisonNode& node, const std::string& parent_query, const std::vector& parent_metrics, diff --git a/include/dftracer/utils/utilities/composites/dft/comparator/comparison_result.h b/include/dftracer/utils/utilities/composites/dft/comparator/comparison_result.h index 446d077d..a9d3c480 100644 --- a/include/dftracer/utils/utilities/composites/dft/comparator/comparison_result.h +++ b/include/dftracer/utils/utilities/composites/dft/comparator/comparison_result.h @@ -1,10 +1,10 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_COMPARATOR_COMPARISON_RESULT_H #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_COMPARATOR_COMPARISON_RESULT_H +#include #include #include #include - #ifdef DFTRACER_UTILS_ENABLE_ARROW #include #endif diff --git a/include/dftracer/utils/utilities/composites/dft/comparator/comparison_utility.h b/include/dftracer/utils/utilities/composites/dft/comparator/comparison_utility.h index 61c55f83..dbef2dc1 100644 --- a/include/dftracer/utils/utilities/composites/dft/comparator/comparison_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/comparator/comparison_utility.h @@ -10,15 +10,15 @@ namespace dftracer::utils::utilities::composites::dft::comparator { -using aggregators::EventAggregatorUtilityOutput; +using aggregators::EventAggregatorOutput; /// Paired baseline/variant aggregation outputs for a single comparison /// node. struct ComparisonVisitorPair { /// Aggregation output for the baseline run. 
- EventAggregatorUtilityOutput baseline; + EventAggregatorOutput baseline; /// Aggregation output for the variant run. - EventAggregatorUtilityOutput variant; + EventAggregatorOutput variant; /// Resolved config node for this visitor. ComparisonNode node; }; diff --git a/include/dftracer/utils/utilities/composites/dft/dft_event_dispatcher.h b/include/dftracer/utils/utilities/composites/dft/dft_event_dispatcher.h new file mode 100644 index 00000000..88cf08c0 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/dft_event_dispatcher.h @@ -0,0 +1,326 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_DFT_EVENT_DISPATCHER_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_DFT_EVENT_DISPATCHER_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft { + +class DftEventDispatcher : public indexer::IndexVisitor { + public: + using VisitorList = std::vector>; + + static constexpr std::size_t FLUSH_THRESHOLD = 4 * 1024 * 1024; // 4MB + + explicit DftEventDispatcher(VisitorList visitors, bool force_serial = false) + : visitors_(std::move(visitors)), force_serial_(force_serial) { + for (auto& v : visitors_) { + if (v.get().needs_args_map()) { + needs_args_map_ = true; + break; + } + } + } + + void begin(std::size_t num_checkpoints) override { + for (auto& v : visitors_) { + v.get().begin(num_checkpoints); + } + } + + coro::CoroTask on_checkpoint(std::size_t checkpoint_idx) override { + co_await flush_batch(pending_checkpoint_idx_); + line_number_ = 0; + for (auto& v : visitors_) { + v.get().on_checkpoint(checkpoint_idx); + } + } + + coro::CoroTask on_chunk(const char* data, std::size_t len, + std::size_t checkpoint_idx) override { + if (len == 0) co_return; + ensure_accum(); + accum_->append(data, len); + pending_checkpoint_idx_ = checkpoint_idx; + if (accum_->size() >= FLUSH_THRESHOLD) { + co_await 
flush_batch(checkpoint_idx); + } + } + + coro::CoroTask flush() override { + co_await flush_batch(pending_checkpoint_idx_); + } + + bool wants_drain() const noexcept override { + for (const auto& v : visitors_) { + if (v.get().wants_drain()) return true; + } + return false; + } + + coro::CoroTask drain_pending() override { + for (auto& v : visitors_) { + if (v.get().wants_drain()) { + co_await v.get().drain_pending(); + } + } + } + + void on_line(std::string_view line, indexer::SharedLineBuffer buffer, + std::size_t checkpoint_idx) override { + std::size_t ln = line_number_++; + + if (line.empty()) return; + + auto result = parser_.parse(line.data(), line.size()); + if (result.error()) return; + + auto root = result.value_unsafe(); + if (!root.is_object()) return; + + common::json::JsonValue json(root); + DFTracerEvent ev; + simdjson::dom::element args_dom{}; + bool has_args = false; + bool ok = false; + if (needs_args_map_) { + ok = DFTracerEvent::parse(json, ev); + if (ok) { + auto args_r = root["args"]; + if (!args_r.error() && args_r.value_unsafe().is_object()) { + args_dom = args_r.value_unsafe(); + has_args = true; + } + } + } else { + ok = DFTracerEvent::parse_scalars(root, ev, args_dom, has_args); + } + if (ok) { + EventRecord record{ev, json, line, buffer, checkpoint_idx, + ln, args_dom, has_args}; + for (auto& v : visitors_) { + v.get().on_event(record); + } + } + } + + void finalize(indexer::IndexDatabaseWriterContext& writer, + int file_id) override { + (void)writer; + (void)file_id; + } + + private: + void ensure_accum() { + if (!accum_) { + accum_ = std::make_shared(); + accum_->reserve(FLUSH_THRESHOLD + FLUSH_THRESHOLD / 4); + if (!partial_doc_.empty()) { + accum_->append(partial_doc_.data(), partial_doc_.size()); + partial_doc_.clear(); + } + } + } + + coro::CoroTask flush_batch(std::size_t checkpoint_idx) { + if (!accum_ || accum_->empty()) co_return; + + std::size_t total = accum_->size(); + strip_array_delimiters(accum_->data(), total); + 
accum_->resize(total + simdjson::SIMDJSON_PADDING, '\0'); + auto chunk_buffer = std::move(accum_); + accum_ = nullptr; + + std::size_t partial = 0; + Executor* exec = Executor::current(); + std::size_t num_slices = preferred_slice_count(exec, total); + + if (!force_serial_ && num_slices >= 2 && + all_visitors_parallelizable()) { + partial = co_await parallel_flush(*exec, chunk_buffer, total, + checkpoint_idx, num_slices); + } else { + partial = serial_flush(chunk_buffer, total, checkpoint_idx); + } + + if (partial > 0 && partial <= total) { + partial_doc_.assign(chunk_buffer->data() + total - partial, + chunk_buffer->data() + total); + } + } + + bool all_visitors_parallelizable() { + if (visitor_parallel_clones_cached_) return parallelizable_cached_; + visitor_parallel_clones_cached_ = true; + for (auto& v : visitors_) { + if (!v.get().create_parallel_slice()) { + parallelizable_cached_ = false; + return false; + } + } + parallelizable_cached_ = true; + return true; + } + + std::size_t preferred_slice_count(Executor* exec, std::size_t total) const { + if (!exec) return 1; + const std::size_t MIN_SLICE_BYTES = 256 * 1024; + std::size_t cap = exec->get_num_threads(); + if (cap < 2) return 1; + if (cap > 8) cap = 8; + std::size_t by_size = total / MIN_SLICE_BYTES; + if (by_size < 2) return 1; + return std::min(by_size, cap); + } + + std::size_t serial_flush(std::shared_ptr chunk_buffer, + std::size_t total, std::size_t checkpoint_idx) { + return parse_buffer(parser_, chunk_buffer, total, checkpoint_idx, + line_number_, needs_args_map_, + [this](const EventRecord& record) { + for (auto& v : visitors_) + v.get().on_event(record); + }); + } + + coro::CoroTask parallel_flush( + Executor& /*exec*/, std::shared_ptr chunk_buffer, + std::size_t total, std::size_t checkpoint_idx, std::size_t num_slices) { + std::vector> ranges; + ranges.reserve(num_slices); + const char* data = chunk_buffer->data(); + std::size_t cursor = 0; + std::size_t partial_tail = 0; + for (std::size_t i 
= 0; i < num_slices; ++i) { + std::size_t target = + (i + 1 == num_slices) ? total : (i + 1) * (total / num_slices); + std::size_t end = target; + if (i + 1 == num_slices) { + end = total; + } else { + while (end < total && data[end] != '\n') ++end; + if (end < total) ++end; + } + if (end <= cursor) continue; + ranges.emplace_back(cursor, end); + cursor = end; + } + if (cursor < total) { + partial_tail = total - cursor; + } + + std::vector>> slice_vis( + ranges.size()); + for (std::size_t s = 0; s < ranges.size(); ++s) { + slice_vis[s].reserve(visitors_.size()); + for (auto& v : visitors_) { + slice_vis[s].push_back(v.get().create_parallel_slice()); + } + } + + std::vector> slice_tasks; + slice_tasks.reserve(ranges.size()); + for (std::size_t s = 0; s < ranges.size(); ++s) { + slice_tasks.push_back( + make_slice_task(chunk_buffer, ranges[s].first, ranges[s].second, + checkpoint_idx, slice_vis[s])); + } + auto slice_truncs = co_await coro::when_all(std::move(slice_tasks)); + + if (!slice_truncs.empty()) { + std::size_t last_trunc = slice_truncs.back(); + std::size_t last_len = ranges.back().second - ranges.back().first; + if (last_trunc > 0 && last_trunc <= last_len) { + partial_tail = last_trunc; + } + } + + std::vector running_offsets(visitors_.size(), 0); + for (std::size_t s = 0; s < ranges.size(); ++s) { + for (std::size_t i = 0; i < visitors_.size(); ++i) { + if (!slice_vis[s][i]) continue; + slice_vis[s][i]->set_line_offset(running_offsets[i]); + running_offsets[i] += slice_vis[s][i]->parallel_event_count(); + visitors_[i].get().merge_parallel_slice(*slice_vis[s][i]); + } + } + + co_return partial_tail; + } + + static coro::CoroTask make_slice_task( + std::shared_ptr chunk_buffer, std::size_t start, + std::size_t end, std::size_t checkpoint_idx, + std::vector>& slice_vis) { + std::size_t truncated = 0; + try { + simdjson::dom::parser local_parser; + simdjson::dom::document_stream stream; + auto err = local_parser + .parse_many(chunk_buffer->data() + start, + 
end - start, end - start) + .get(stream); + if (!err) { + std::size_t slice_ln = 0; + for (auto it = stream.begin(); it != stream.end(); ++it) { + if ((*it).error()) continue; + auto root = (*it).value_unsafe(); + if (!root.is_object()) continue; + common::json::JsonValue json(root); + DFTracerEvent ev; + simdjson::dom::element args_dom{}; + bool has_args = false; + if (DFTracerEvent::parse_scalars(root, ev, args_dom, + has_args)) { + std::string_view src = it.source(); + EventRecord record{ + ev, json, src, + chunk_buffer, checkpoint_idx, slice_ln, + args_dom, has_args}; + ++slice_ln; + for (auto& v : slice_vis) { + if (v) v->on_event(record); + } + } + } + truncated = stream.truncated_bytes(); + } + } catch (...) { + } + co_return truncated; + } + + static void strip_array_delimiters(char* buf, std::size_t len) { + ::dftracer::utils::utilities::composites::dft::strip_array_delimiters( + buf, len); + } + + VisitorList visitors_; + bool needs_args_map_ = false; + bool force_serial_ = false; + simdjson::dom::parser parser_; + std::size_t line_number_ = 0; + std::shared_ptr accum_; + std::vector partial_doc_; + std::size_t pending_checkpoint_idx_ = 0; + bool visitor_parallel_clones_cached_ = false; + bool parallelizable_cached_ = false; +}; + +} // namespace dftracer::utils::utilities::composites::dft + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_DFT_EVENT_DISPATCHER_H diff --git a/include/dftracer/utils/utilities/composites/dft/dft_event_visitor.h b/include/dftracer/utils/utilities/composites/dft/dft_event_visitor.h new file mode 100644 index 00000000..ddc2d2f9 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/dft_event_visitor.h @@ -0,0 +1,70 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_DFT_EVENT_VISITOR_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_DFT_EVENT_VISITOR_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft { + +struct 
EventRecord { + const DFTracerEvent& ev; + const common::json::JsonValue& json; + std::string_view line; + indexer::SharedLineBuffer line_buffer; // keeps line data alive + std::size_t checkpoint_idx; + std::size_t line_number; + simdjson::dom::element args_dom{}; + bool has_args{false}; +}; + +class DftEventVisitor { + public: + virtual ~DftEventVisitor() = default; + + virtual void begin(std::size_t num_checkpoints) = 0; + + virtual void on_checkpoint(std::size_t checkpoint_idx) = 0; + + virtual void on_event(const EventRecord& record) = 0; + + // Hint that the visitor has accumulated work that should be drained. + // Cheap (no allocation/co_await): the dispatcher polls this after every + // on_event and only co_awaits drain_pending() when true. + virtual bool wants_drain() const noexcept { return false; } + + // Drain any accumulated work via async operations (e.g. channel send). + // Suspends the calling coroutine when downstream is full, providing + // real backpressure without blocking an executor thread. + virtual coro::CoroTask drain_pending() { co_return; } + + virtual coro::CoroTask on_file_complete() { co_return; } + + virtual std::unique_ptr create_parallel_slice() const { + return nullptr; + } + virtual void merge_parallel_slice(DftEventVisitor& /*slice*/) {} + + /// In parallel-flush mode, slices receive events with slice-local line + /// numbers (0..N-1). The dispatcher calls this on the slice before + /// merge_parallel_slice with the cumulative successful-event count of + /// prior slices, so the slice can renumber its stored line indices. + virtual void set_line_offset(std::size_t /*offset*/) {} + + /// Successful events processed by this slice. Used by the dispatcher to + /// propagate line offsets across slices in byte order. 
+ virtual std::size_t parallel_event_count() const { return 0; } + + virtual bool needs_args_map() const { return false; } +}; + +} // namespace dftracer::utils::utilities::composites::dft + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_DFT_EVENT_VISITOR_H diff --git a/include/dftracer/utils/utilities/composites/dft/event.h b/include/dftracer/utils/utilities/composites/dft/event.h index 86ec78b6..b9754c45 100644 --- a/include/dftracer/utils/utilities/composites/dft/event.h +++ b/include/dftracer/utils/utilities/composites/dft/event.h @@ -1,52 +1,30 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_EVENT_H #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_EVENT_H -/** - * @file event.h - * @brief Common DFTracer event representation and parser. - * - * Provides DFTracerEvent, a lightweight struct capturing the core fields - * of a Chrome Tracing / DFTracer event. All string fields are string_view - * into the yyjson document (valid only while the doc lives). - */ - #include +#include +#include #include +#include #include namespace dftracer::utils::utilities::composites::dft { using common::json::JsonValue; -/** - * Parsed DFTracer trace event. - * - * All string_view fields point into the yyjson document memory and are - * only valid while the document is alive. - * - * Typical usage: - * DFTracerEvent ev; - * if (DFTracerEvent::parse(json, ev)) { - * if (ev.is_complete()) { ... } - * } - */ struct DFTracerEvent { - // Core Chrome Tracing fields std::uint64_t id = 0; std::string_view name; std::string_view cat; - std::string_view ph; // "X" (complete), "M" (metadata), "B"/"E", etc. + std::string_view ph; std::uint64_t pid = 0; std::uint64_t tid = 0; std::uint64_t ts = 0; std::uint64_t dur = 0; - // Args subtree, lazy, just the yyjson_val* pointer. - // Access via args["field"] or pass to evaluator/bloom. 
- JsonValue args; + ArgsMap args; - // Convenience predicates bool is_metadata() const { return ph == "M"; } bool is_counter() const { return ph == "C"; } bool is_profile() const { return ph == "C" && cat != "sys"; } @@ -55,11 +33,6 @@ struct DFTracerEvent { bool is_complete() const { return ph == "X"; } bool has_id() const { return id != 0; } - /** - * Parse from a JsonValue (wrapping a yyjson_val* object root). - * Returns true if the JSON is a valid object with at least "ph". - * Fields that are absent get their default (0 / empty). - */ static bool parse(const JsonValue& json, DFTracerEvent& out) { auto ph_val = json["ph"]; if (!ph_val.exists()) return false; @@ -88,18 +61,172 @@ struct DFTracerEvent { if (dur_val.exists()) out.dur = dur_val.get(); auto args_val = json["args"]; - if (args_val.exists()) out.args = args_val; + if (args_val.exists() && args_val.is_object()) { + out.args.set_valid(true); + args_val.for_each_member([&](std::string_view k, JsonValue v) { + if (v.is_string()) { + out.args.insert(k, std::string(v.get())); + } else if (v.is_uint()) { + out.args.insert(k, v.get()); + } else if (v.is_int()) { + out.args.insert(k, v.get()); + } else if (v.is_number()) { + out.args.insert(k, v.get()); + } else if (v.is_bool()) { + out.args.insert(k, v.get()); + } + }); + } return true; } - /** - * Parse from a raw yyjson_val* root (for call sites that don't use - * JsonValue). Same semantics as the JsonValue overload. 
- */ - static bool parse(yyjson_val* root, DFTracerEvent& out) { - if (!root || !yyjson_is_obj(root)) return false; - return parse(JsonValue(root), out); + static bool parse_scalars(simdjson::dom::element root, DFTracerEvent& out, + simdjson::dom::element& out_args, + bool& out_has_args) { + out_has_args = false; + if (!root.is_object()) return false; + + bool has_ph = false; + for (auto field : root.get_object()) { + std::string_view key = field.key; + simdjson::dom::element val = field.value; + switch (key.size()) { + case 2: + if (key == "ph") { + if (val.is_string()) { + out.ph = val.get_string().value_unsafe(); + has_ph = true; + } + } else if (key == "id") { + if (val.is_uint64()) + out.id = val.get_uint64().value_unsafe(); + } else if (key == "ts") { + if (val.is_uint64()) + out.ts = val.get_uint64().value_unsafe(); + } + break; + case 3: + if (key == "pid") { + if (val.is_uint64()) + out.pid = val.get_uint64().value_unsafe(); + } else if (key == "tid") { + if (val.is_uint64()) + out.tid = val.get_uint64().value_unsafe(); + } else if (key == "cat") { + if (val.is_string()) + out.cat = val.get_string().value_unsafe(); + } else if (key == "dur") { + if (val.is_uint64()) + out.dur = val.get_uint64().value_unsafe(); + } + break; + case 4: + if (key == "name") { + if (val.is_string()) + out.name = val.get_string().value_unsafe(); + } else if (key == "args") { + if (val.is_object()) { + out_args = val; + out_has_args = true; + } + } + break; + default: + break; + } + } + return has_ph; + } + + static bool parse_ondemand(common::json::JsonParser& parser, + DFTracerEvent& out) { + bool has_ph = false; + parser.for_each_field([&](std::string_view key, + simdjson::ondemand::value val) { + if (key == "ph") { + auto r = val.get_string(); + if (!r.error()) { + out.ph = r.value_unsafe(); + has_ph = true; + } + } else if (key == "id") { + auto r = val.get_uint64(); + if (!r.error()) out.id = r.value_unsafe(); + } else if (key == "name") { + auto r = val.get_string(); + if 
(!r.error()) out.name = r.value_unsafe(); + } else if (key == "cat") { + auto r = val.get_string(); + if (!r.error()) out.cat = r.value_unsafe(); + } else if (key == "pid") { + auto r = val.get_uint64(); + if (!r.error()) out.pid = r.value_unsafe(); + } else if (key == "tid") { + auto r = val.get_uint64(); + if (!r.error()) out.tid = r.value_unsafe(); + } else if (key == "ts") { + auto r = val.get_uint64(); + if (!r.error()) out.ts = r.value_unsafe(); + } else if (key == "dur") { + auto r = val.get_uint64(); + if (!r.error()) out.dur = r.value_unsafe(); + } else if (key == "args") { + auto obj = val.get_object(); + if (!obj.error()) { + out.args.set_valid(true); + for (auto field : obj.value_unsafe()) { + if (field.error()) continue; + auto fkey = field.unescaped_key(); + if (fkey.error()) continue; + auto fval = field.value(); + if (fval.error()) continue; + + auto type = fval.type(); + if (type.error()) continue; + + switch (type.value_unsafe()) { + case simdjson::ondemand::json_type::string: { + auto r = fval.get_string(); + if (!r.error()) + out.args.insert( + fkey.value_unsafe(), + std::string(r.value_unsafe())); + break; + } + case simdjson::ondemand::json_type::number: { + auto ri = fval.get_int64(); + if (!ri.error()) { + auto v = ri.value_unsafe(); + if (v >= 0) + out.args.insert( + fkey.value_unsafe(), + static_cast(v)); + else + out.args.insert(fkey.value_unsafe(), v); + } else { + auto rd = fval.get_double(); + if (!rd.error()) + out.args.insert(fkey.value_unsafe(), + rd.value_unsafe()); + } + break; + } + case simdjson::ondemand::json_type::boolean: { + auto r = fval.get_bool(); + if (!r.error()) + out.args.insert(fkey.value_unsafe(), + r.value_unsafe()); + break; + } + default: + break; + } + } + } + } + }); + return has_ph; } }; diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h b/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h index f6a75b7c..9ffd1c9a 100644 --- 
a/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h +++ b/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h @@ -3,6 +3,7 @@ #include +#include #include #include #include @@ -11,11 +12,25 @@ namespace dftracer::utils::utilities::composites::dft::indexing { /** - * @brief Bloom filter for approximate set membership testing. + * @brief Split block Bloom filter for approximate set membership testing. * - * Uses Kirsch-Mitzenmacher optimization: k hash functions derived from - * 2 base hash values (std::hash with different seeds). Supports - * serialization to/from binary blobs for RocksDB storage. + * Implements the split block Bloom filter from the Apache Parquet spec: + * 256-bit blocks of 8 x uint32 words; each insert/query touches exactly + * one block (one cache line) and sets/tests one bit in each of the 8 + * words via a fixed SALT array. Block selection uses Lemire's reduction + * on h1; in-block masks use h2 multiplied by SALT. + * + * References: + * - Apple, J. "Split block Bloom filters." arXiv:2101.01719 (2021). + * - Putze, F., Sanders, P., Singler, J. "Cache-, hash-, and space- + * efficient bloom filters." ACM JEA 14, Article 4 (2009). + * - Apache Parquet Bloom filter spec: + * https://github.com/apache/parquet-format/blob/master/BloomFilter.md + * + * Differs from canonical Parquet: + * - Underlying hash is FNV1a + SplitMix64 finisher (not xxhash64). + * - Custom 12-byte LE header (num_hashes, num_entries, num_bits) instead + * of Thrift; num_hashes is unused at insert/test (vestigial). 
* * Serialization format (self-describing): * [4 bytes: num_hashes (uint32_t LE)] @@ -58,6 +73,11 @@ class BloomFilter { std::size_t num_hashes_; std::size_t num_entries_; mutable hash::Fnv1aHasherUtility hasher_; + + static constexpr std::size_t LAST_VALUE_CAP = 64; + std::array last_value_buf_{}; + std::size_t last_value_size_ = 0; + bool last_value_valid_ = false; }; } // namespace dftracer::utils::utilities::composites::dft::indexing diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h index 7807ed77..103816bc 100644 --- a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h +++ b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h @@ -6,7 +6,6 @@ #include #include #include -#include #include namespace dftracer::utils::utilities::composites::dft::indexing { @@ -21,16 +20,64 @@ struct ChunkDimensionStats { std::string value_type = "string"; ///< "string", "uint", "int", or "double". - /// Value -> count map. Nullopt when compressed size exceeds cap. - /// Uses transparent hash to allow string_view lookups without allocation. - std::optional> - value_counts; + std::optional> value_counts; + + // Skips the hash lookup when the same value is observed back-to-back. + // Not copied/moved: a copy would point into the original's nodes. 
+ const std::string* last_key_ = nullptr; + std::uint64_t* last_counter_ = nullptr; + + ChunkDimensionStats() = default; + ChunkDimensionStats(const ChunkDimensionStats& other) + : dimension(other.dimension), + distinct_count(other.distinct_count), + min_value(other.min_value), + max_value(other.max_value), + value_type(other.value_type), + value_counts(other.value_counts) {} + ChunkDimensionStats(ChunkDimensionStats&& other) noexcept + : dimension(std::move(other.dimension)), + distinct_count(other.distinct_count), + min_value(std::move(other.min_value)), + max_value(std::move(other.max_value)), + value_type(std::move(other.value_type)), + value_counts(std::move(other.value_counts)) { + other.last_key_ = nullptr; + other.last_counter_ = nullptr; + } + ChunkDimensionStats& operator=(const ChunkDimensionStats& other) { + if (this != &other) { + dimension = other.dimension; + distinct_count = other.distinct_count; + min_value = other.min_value; + max_value = other.max_value; + value_type = other.value_type; + value_counts = other.value_counts; + last_key_ = nullptr; + last_counter_ = nullptr; + } + return *this; + } + ChunkDimensionStats& operator=(ChunkDimensionStats&& other) noexcept { + if (this != &other) { + dimension = std::move(other.dimension); + distinct_count = other.distinct_count; + min_value = std::move(other.min_value); + max_value = std::move(other.max_value); + value_type = std::move(other.value_type); + value_counts = std::move(other.value_counts); + last_key_ = nullptr; + last_counter_ = nullptr; + other.last_key_ = nullptr; + other.last_counter_ = nullptr; + } + return *this; + } /// Record a value observation. Updates min/max, distinct_count, /// value_counts. 
void observe(std::string_view value); + void observe_range_only(std::uint64_t value); /// Serialize value_counts to binary format: /// [u32 LE num_entries] [u16 LE key_len, key bytes, u64 LE count]* @@ -41,11 +88,11 @@ struct ChunkDimensionStats { std::optional> compress_value_counts( std::size_t cap_bytes = 4096) const; - static std::unordered_map + static dftracer::utils::StringViewMap deserialize_value_counts(const std::uint8_t* data, std::size_t len); /// Decompress zlib-compressed value_counts, then deserialize. - static std::unordered_map + static dftracer::utils::StringViewMap decompress_value_counts(const std::uint8_t* data, std::size_t len); }; @@ -57,8 +104,23 @@ struct ChunkDimensionStatsResult { std::string min_value; std::string max_value; std::string value_type; - // NULL in DB → nullopt here - std::optional> value_counts; + mutable std::optional> + value_counts; + // Raw compressed value_counts. Populated when value_counts is left + // un-decoded so callers can lazily decode on first access. 
+ mutable std::vector compressed_value_counts; + + bool has_value_counts_payload() const { + return value_counts.has_value() || !compressed_value_counts.empty(); + } + + void ensure_value_counts_decoded() const { + if (value_counts || compressed_value_counts.empty()) return; + value_counts = ChunkDimensionStats::decompress_value_counts( + compressed_value_counts.data(), compressed_value_counts.size()); + compressed_value_counts.clear(); + compressed_value_counts.shrink_to_fit(); + } }; } // namespace dftracer::utils::utilities::composites::dft::indexing diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h index a49ebe64..c38b64ba 100644 --- a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h @@ -58,13 +58,10 @@ struct ChunkIndexerConfig { }; // Hash resolution maps (collected once per file from metadata events) -using HashResolveMap = - std::shared_ptr>; +using HashResolveMap = std::shared_ptr>; // Hash resolution entry: dimension -> {hash -> resolved_value} -using HashResolutions = - std::unordered_map>; +using HashResolutions = StringViewMap>; // Tracks which dimensions have been indexed per chunk for incremental updates struct IndexedDimensions { diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h index cb0f0378..07acd8cc 100644 --- a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -15,11 +16,18 @@ namespace dftracer::utils::utilities::composites::dft::indexing { using common::query::Query; /// Input for chunk 
pruning: index path, file path, query, optional cache. +/// +/// If `external_db` is non-null the utility reuses that handle instead of +/// opening the RocksDB at `index_path` itself. This lets callers that +/// prune many files against the same directory-level index amortize the +/// (expensive) RocksDB open cost to once per batch rather than once per +/// file. struct ChunkPrunerInput { std::string index_path; ///< Path to the `.dftindex` store. std::string file_path; ///< Path to trace file. Query query; ///< Query to evaluate for pruning. BloomFilterCache* cache = nullptr; ///< Optional bloom filter cache. + indexer::IndexDatabase* external_db = nullptr; ///< Reused DB handle. }; /// Result of chunk pruning. @@ -31,6 +39,26 @@ struct ChunkPrunerOutput { bool success = false; ///< True if pruning completed without error. }; +/// Input for batched pruning across many files that share the same +/// `.dftindex` store. Allows a single RocksDB scan per column family to +/// populate per-file pruner contexts instead of one scan per file. +struct ChunkPrunerBatchItem { + std::string file_path; + Query query; +}; + +struct ChunkPrunerBatchInput { + std::string index_path; + std::vector items; + BloomFilterCache* cache = nullptr; + indexer::IndexDatabase* external_db = nullptr; +}; + +struct ChunkPrunerBatchOutput { + std::vector outputs; ///< Parallel to items[]. + bool success = false; +}; + /// Three-tier chunk pruner: dictionary → min/max range → bloom filter. /// Walks the Query AST recursively (AND=intersect, OR=union, NOT=complement). class ChunkPrunerUtility @@ -41,6 +69,10 @@ class ChunkPrunerUtility coro::CoroTask process( const ChunkPrunerInput& input) override; + + /// Batch-prune many files against the same index with shared RocksDB + /// range scans for dim_stats / chunk_statistics. 
+ ChunkPrunerBatchOutput process_batch(const ChunkPrunerBatchInput& input); }; } // namespace dftracer::utils::utilities::composites::dft::indexing diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h index 37abe99c..203d1e6b 100644 --- a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h +++ b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h @@ -1,8 +1,10 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_CHUNK_STATISTICS_H #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_CHUNK_STATISTICS_H +#include #include #include +#include #include #include @@ -18,14 +20,14 @@ namespace dftracer::utils::utilities::composites::dft::indexing { * * Tracks event counts by category/name/pid:tid, timestamp ranges, * and duration statistics using Welford's online algorithm for variance. - * Map fields serialize to JSON text via yyjson for storage in the + * Map fields serialize to JSON text for storage in the * shared `.dftindex` database. 
*/ struct ChunkStatistics { std::uint64_t total_events = 0; - std::unordered_map category_counts; - std::unordered_map name_counts; - std::unordered_map pid_tid_counts; + StringViewMap category_counts; + StringViewMap name_counts; + StringViewMap pid_tid_counts; std::uint64_t min_timestamp_us = std::numeric_limits::max(); std::uint64_t max_timestamp_us = 0; @@ -37,13 +39,12 @@ struct ChunkStatistics { common::statistics::DDSketch duration_sketch{0.01}; common::statistics::Log2Histogram duration_histogram; - std::unordered_map - name_duration_sketches; - std::unordered_map - name_duration_histograms; - std::unordered_map name_duration_sums; - std::unordered_map name_duration_sum_sqs; - std::unordered_map name_category; + common::statistics::TimestampHistogram timestamp_histogram; + StringViewMap name_duration_sketches; + StringViewMap name_duration_histograms; + StringViewMap name_duration_sums; + StringViewMap name_duration_sum_sqs; + StringViewMap name_category; void update_from_event(std::string_view name, std::string_view cat, std::uint64_t pid, std::uint64_t tid, @@ -62,13 +63,12 @@ struct ChunkStatistics { /// Serialize per-name DDSketches to a single binary blob. 
std::vector serialize_name_duration_sketches() const; - static std::unordered_map parse_string_map_json( + static StringViewMap parse_string_map_json( const std::string& json); - static std::unordered_map parse_double_map_json( - const std::string& json); - static std::unordered_map + static StringViewMap parse_double_map_json(const std::string& json); + static StringViewMap parse_histogram_map_json(const std::string& json); - static std::unordered_map + static StringViewMap deserialize_name_duration_sketches(const std::uint8_t* data, std::size_t len); }; diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h b/include/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h new file mode 100644 index 00000000..14806cce --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h @@ -0,0 +1,86 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_INDEX_RESOLVER_UTILITY_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_INDEX_RESOLVER_UTILITY_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::indexing { + +struct ResolvedFile { + std::size_t file_index = 0; + std::string file_path; + std::int32_t file_id = -1; + indexer::IndexFileEntryCapability capabilities = + indexer::IndexFileEntryCapability::NONE; +}; + +struct FileWorkItem { + std::size_t file_index = 0; + std::string file_path; + std::int32_t file_id = -1; +}; + +struct ResolverInput { + std::string directory; + std::string index_dir; + std::vector files; + + bool require_checkpoints = true; + bool require_bloom = false; + bool require_manifest = false; + bool require_aggregation = false; + + // Full config for computing hash with stored time_interval + std::optional aggregation_config; +}; + +struct ResolverResult { + std::vector all_files; + std::vector 
all_file_sizes; + std::string index_path; + + std::vector needs_checkpoint; + std::vector needs_bloom; + std::vector needs_manifest; + std::vector needs_aggregation; + + std::vector cached; + + // Aggregation augmentation info (when cached aggregation exists with + // different time_interval) + bool needs_augmentation = false; + std::uint64_t stored_time_interval_us = 0; // Time interval in cached data + + std::size_t total_needs_work() const { + return needs_checkpoint.size() + needs_bloom.size() + + needs_manifest.size() + needs_aggregation.size(); + } + + std::size_t total_cached() const { return cached.size(); } +}; + +class IndexResolverUtility + : public utilities::Utility { + private: + filesystem::PatternDirectoryScannerUtility scanner_; + + public: + coro::CoroTask process(const ResolverInput& input) override; +}; + +} // namespace dftracer::utils::utilities::composites::dft::indexing + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_INDEX_RESOLVER_UTILITY_H diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.h b/include/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.h new file mode 100644 index 00000000..1317e98e --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.h @@ -0,0 +1,43 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_RESOLVE_AND_BUILD_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_RESOLVE_AND_BUILD_H + +#include +#include +#include + +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::indexing { + +struct ResolveAndBuildInput { + std::string directory; + std::vector files; + std::string index_dir; + + std::size_t checkpoint_size = 32 * 1024 * 1024; // 32MB default + std::size_t parallelism = 0; + bool force_rebuild = false; + + bool require_checkpoints = true; + bool require_bloom = false; + bool require_manifest = false; + bool require_aggregation = false; + + 
std::optional aggregation_config; +}; + +// Consolidates the common resolve -> build -> re-resolve pattern. +// Returns ResolverResult with: +// - all_files: discovered files +// - index_path: path to shared index +// - cached: fully resolved files ready for use +// - needs_checkpoint: files that failed to index (for direct scan fallback) +coro::CoroTask resolve_and_build_index( + CoroScope* scope, ResolveAndBuildInput input); + +} // namespace dftracer::utils::utilities::composites::dft::indexing + +#endif diff --git a/include/dftracer/utils/utilities/composites/dft/internal/utils.h b/include/dftracer/utils/utilities/composites/dft/internal/utils.h index d2c6db5d..fe287abd 100644 --- a/include/dftracer/utils/utilities/composites/dft/internal/utils.h +++ b/include/dftracer/utils/utilities/composites/dft/internal/utils.h @@ -11,17 +11,19 @@ namespace dftracer::utils::utilities::composites::dft::internal { bool is_data_transfer_op(std::string_view cat, std::string_view name); /** - * @brief Determine the root-local RocksDB index path for a given data file. + * @brief Determine the root-local RocksDB index path for a given input path. * * When a custom index directory is provided, the index root is * `/.dftindex`. Otherwise, the index root is placed alongside the - * data file as `/.dftindex`. + * input path: + * - file path: `/.dftindex` + * - directory path: `/.dftindex` * - * @param file_path Path to the data file (e.g., "data/trace.pfw.gz") + * @param path Path to a data file or directory * @param index_dir Optional custom directory for the index root. * @return Path to the owning `.dftindex` directory. 
*/ -std::string determine_index_path(const std::string& file_path, +std::string determine_index_path(const std::string& path, const std::string& index_dir = ""); /** diff --git a/include/dftracer/utils/utilities/composites/dft/parse_inflated.h b/include/dftracer/utils/utilities/composites/dft/parse_inflated.h new file mode 100644 index 00000000..c3057863 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/parse_inflated.h @@ -0,0 +1,108 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_PARSE_INFLATED_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_PARSE_INFLATED_H + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft { + +// Replace "[\n" / "]\n" delimiter lines with spaces in-place. Mirrors the +// behaviour used by DftEventDispatcher so parse_buffer can consume the +// stripped buffer with parse_many. Public so the direct-organize path can +// reuse it on inflated buffers it accumulates itself. +inline void strip_array_delimiters(char* buf, std::size_t len) { + for (std::size_t i = 0; i < len;) { + std::size_t line_start = i; + std::size_t line_end = i; + while (line_end < len && buf[line_end] != '\n') ++line_end; + + bool has_bracket = false; + for (std::size_t j = line_start; j < line_end; ++j) { + char c = buf[j]; + if (c == ' ' || c == '\t' || c == '\r') continue; + if ((c == '[' || c == ']') && !has_bracket) { + has_bracket = true; + } else { + has_bracket = false; + break; + } + } + + if (has_bracket) { + for (std::size_t j = line_start; j < line_end; ++j) buf[j] = ' '; + } + + i = (line_end < len) ? line_end + 1 : len; + } +} + +// Iterate parsed dftracer events from a single inflated buffer. The buffer +// is assumed to hold concatenated NDJSON-ish events plus the simdjson +// padding required by parse_many; callers responsible for stripping any +// "[" / "]" delimiter lines (see strip_array_delimiters above). 
+// +// `chunk_buffer` is held shared so the EventRecord's string_view can outlive +// the loop body if a visitor stashes it. `len` excludes the simdjson +// padding tail. `line_number` is incremented for each successfully parsed +// event before being copied into the EventRecord. `needs_args_map` follows +// the existing dispatcher contract (when any consumer requires the args +// JSON object materialized, take the slower DFTracerEvent::parse path). +// +// Returns the number of trailing bytes parse_many reported as truncated, so +// the caller can carry them over to the next buffer. +template +std::size_t parse_buffer(simdjson::dom::parser& parser, + std::shared_ptr chunk_buffer, + std::size_t len, std::size_t checkpoint_idx, + std::size_t& line_number, bool needs_args_map, + Cb&& cb) { + if (!chunk_buffer || len == 0) return 0; + + simdjson::dom::document_stream stream; + auto err = parser.parse_many(chunk_buffer->data(), len, len).get(stream); + if (err) return 0; + + for (auto it = stream.begin(); it != stream.end(); ++it) { + if ((*it).error()) continue; + auto root = (*it).value_unsafe(); + if (!root.is_object()) continue; + common::json::JsonValue json(root); + DFTracerEvent ev; + simdjson::dom::element args_dom{}; + bool has_args = false; + bool ok = false; + if (needs_args_map) { + ok = DFTracerEvent::parse(json, ev); + if (ok) { + auto args_r = root["args"]; + if (!args_r.error() && args_r.value_unsafe().is_object()) { + args_dom = args_r.value_unsafe(); + has_args = true; + } + } + } else { + ok = DFTracerEvent::parse_scalars(root, ev, args_dom, has_args); + } + if (!ok) continue; + + std::size_t ln = line_number++; + std::string_view src = it.source(); + EventRecord record{ev, json, src, chunk_buffer, checkpoint_idx, + ln, args_dom, has_args}; + cb(record); + } + + std::size_t truncated = stream.truncated_bytes(); + return (truncated > 0 && truncated <= len) ? 
truncated : 0; +} + +} // namespace dftracer::utils::utilities::composites::dft + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_PARSE_INFLATED_H diff --git a/include/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.h b/include/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.h new file mode 100644 index 00000000..c7172231 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.h @@ -0,0 +1,69 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_GROUP_WRITER_TASK_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_GROUP_WRITER_TASK_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::reorganize { + +struct GroupWriterConfig { + std::string group_name; + std::string group_query; + std::string output_dir; + std::size_t chunk_size_bytes = 256 * 1024 * 1024; + bool compress = true; + int compression_level = -1; // Z_DEFAULT_COMPRESSION (level 6) + std::shared_ptr>> input_channel; + const std::vector* source_files = nullptr; + bool build_output_index = true; + + std::string index_dir; + bool with_aggregation = false; + double agg_time_interval_us = 5'000'000.0; + std::vector bloom_dimensions; + indexing::ChunkIndexerConfig bloom_config; + std::string staging_root; + std::shared_ptr> + artifacts_queue; + std::shared_ptr> batch_counter; +}; + +struct ChunkMemberLayout { + std::string path; + std::vector members; +}; + +struct GroupWriterResult { + std::string group_name; + std::size_t events_written = 0; + std::size_t bytes_written = 0; + std::size_t chunks_created = 0; + std::vector output_files; + /// Per-chunk-file gzip-member layout captured directly from the writer. + /// Lets downstream indexing skip the post-write gzip header re-scan. 
+ std::vector chunk_layouts; + bool indexed_inline = false; + bool success = false; + std::string error_message; +}; + +coro::CoroTask run_group_writer(CoroScope* scope, + GroupWriterConfig config); + +} // namespace dftracer::utils::utilities::composites::dft::reorganize + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_GROUP_WRITER_TASK_H diff --git a/include/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.h b/include/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.h new file mode 100644 index 00000000..7912cb91 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.h @@ -0,0 +1,38 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_MANIFEST_EXTRACTOR_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_MANIFEST_EXTRACTOR_H + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::reorganize { + +struct ManifestExtractorConfig { + std::string file_path; + std::string index_path; + std::size_t source_file_idx = 0; + std::vector groups; + std::vector>>> + group_channels; + std::size_t batch_size = 1024; +}; + +struct ManifestExtractorResult { + std::size_t events_extracted = 0; + std::size_t events_unmatched = 0; + bool success = false; + std::string error_message; +}; + +coro::CoroTask extract_from_manifest( + ManifestExtractorConfig config); + +} // namespace dftracer::utils::utilities::composites::dft::reorganize + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_MANIFEST_EXTRACTOR_H diff --git a/include/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.h b/include/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.h new file mode 100644 index 00000000..ab6f73a2 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.h @@ -0,0 +1,106 @@ +#ifndef 
DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_ORGANIZE_VISITOR_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_ORGANIZE_VISITOR_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::reorganize { + +struct LineRecord { + std::uint32_t offset; + std::uint32_t length; + std::size_t source_file_idx; + std::size_t checkpoint_idx; + std::size_t source_line_number; +}; + +struct LineBatch { + std::string bytes; + std::vector lines; + + void reserve(std::size_t n) { + lines.reserve(n); + bytes.reserve(n * 256); + } + std::size_t size() const { return lines.size(); } + bool empty() const { return lines.empty(); } + void clear() { + lines.clear(); + bytes.clear(); + } + + std::string_view line_view(std::size_t i) const { + const auto& r = lines[i]; + return std::string_view(bytes.data() + r.offset, r.length); + } + + void append_line(std::string_view line, std::size_t source_file_idx, + std::size_t checkpoint_idx, + std::size_t source_line_number) { + auto offset = static_cast(bytes.size()); + bytes.append(line.data(), line.size()); + lines.push_back(LineRecord{ + .offset = offset, + .length = static_cast(line.size()), + .source_file_idx = source_file_idx, + .checkpoint_idx = checkpoint_idx, + .source_line_number = source_line_number, + }); + } +}; + +struct OrganizeVisitorConfig { + std::vector groups; + std::vector>>> + group_channels; + std::size_t source_file_idx = 0; + std::size_t batch_size = 1024; +}; + +class OrganizeVisitor : public DftEventVisitor { + public: + explicit OrganizeVisitor(OrganizeVisitorConfig config); + + void begin(std::size_t num_checkpoints) override; + void on_checkpoint(std::size_t checkpoint_idx) override; + void on_event(const EventRecord& record) override; + bool wants_drain() const noexcept override; + coro::CoroTask drain_pending() override; + coro::CoroTask on_file_complete() override; + + 
std::unique_ptr create_parallel_slice() const override; + void merge_parallel_slice(DftEventVisitor& slice) override; + + std::size_t events_routed() const { return events_routed_; } + std::size_t events_unmatched() const { return events_unmatched_; } + + private: + std::size_t evaluate_event(const DFTracerEvent& ev, + const common::json::JsonValue& json); + + OrganizeVisitorConfig config_; + std::vector> parsed_queries_; + std::vector pending_batches_; + /// Full LineBatches queued by `merge_parallel_slice` (move-only, no + /// byte copy). Drained alongside `pending_batches_` on the next + /// `drain_pending` / `on_file_complete` call. + std::vector>> drain_queue_; + std::size_t current_checkpoint_ = 0; + std::size_t events_routed_ = 0; + std::size_t events_unmatched_ = 0; +}; + +} // namespace dftracer::utils::utilities::composites::dft::reorganize + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_ORGANIZE_VISITOR_H diff --git a/include/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.h b/include/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.h new file mode 100644 index 00000000..bd6fc2e6 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.h @@ -0,0 +1,54 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_RECONSTRUCTOR_UTILITY_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_RECONSTRUCTOR_UTILITY_H + +#include +#include +#include + +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::reorganize { + +struct ReconstructorInput { + std::string input_dir; + std::string output_dir; + std::size_t checkpoint_size = constants::indexer::DEFAULT_CHECKPOINT_SIZE; + std::size_t parallelism = 0; + bool compress = true; + + ReconstructorInput& with_input_dir(std::string dir); + ReconstructorInput& with_output_dir(std::string dir); + ReconstructorInput& with_checkpoint_size(std::size_t sz); + 
ReconstructorInput& with_parallelism(std::size_t n); + ReconstructorInput& with_compress(bool c); +}; + +struct ReconstructedFileInfo { + std::string original_path; + std::string output_path; + std::size_t events_written = 0; + std::size_t bytes_written = 0; +}; + +struct ReconstructorResult { + std::vector files; + std::size_t total_events = 0; + std::size_t total_bytes = 0; + std::size_t total_segments = 0; + bool success = false; + std::string error_message; +}; + +class ReconstructorUtility + : public utilities::Utility { + public: + coro::CoroTask process( + const ReconstructorInput& input) override; +}; + +} // namespace dftracer::utils::utilities::composites::dft::reorganize + +#endif diff --git a/include/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h b/include/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h index 92ea2f20..9eb920a0 100644 --- a/include/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h +++ b/include/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h @@ -1,6 +1,7 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_REORGANIZATION_PLANNER_H #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_REORGANIZATION_PLANNER_H +#include #include #include @@ -47,7 +48,8 @@ struct ReorganizationPlannerInput { }; class ReorganizationPlannerUtility - : public utilities::Utility { + : public utilities::Utility { public: ReorganizationPlannerUtility() = default; diff --git a/include/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.h b/include/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.h index 6163de8d..f981aea0 100644 --- a/include/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.h +++ b/include/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.h @@ -1,6 +1,7 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_STATISTICS_DETAILED_STATISTICS_H 
#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_STATISTICS_DETAILED_STATISTICS_H +#include #include #include @@ -51,14 +52,14 @@ struct DetailedStatistics { DistributionStats duration; // Per-group-key duration statistics - std::unordered_map grouped_duration; + StringViewMap grouped_duration; // Per-group-key I/O metrics (only for groups with I/O events) - std::unordered_map grouped_io; + StringViewMap grouped_io; // Maps group key -> category string (e.g. "POSIX", "dlio_benchmark") // Used by the display layer to split events by category. - std::unordered_map group_key_category; + StringViewMap group_key_category; // Scan progress std::uint64_t events_scanned = 0; diff --git a/include/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.h b/include/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.h new file mode 100644 index 00000000..5e258dcb --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.h @@ -0,0 +1,156 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_STATISTICS_SHARED_INDEX_STATISTICS_READER_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_STATISTICS_SHARED_INDEX_STATISTICS_READER_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::statistics { + +struct EntrySnapshot { + std::size_t file_index; + int file_id; + std::string file_path; +}; + +struct SharedIndexBatchRows { + std::unordered_map num_chunks; + std::unordered_map + fallback_merged_stats; + std::unordered_map merged_stats; + std::vector entries_snapshot; +}; + +inline SharedIndexBatchRows query_shared_index_batch( + std::string index_path, std::vector entries, + StatisticsQueryType query_type) { + SharedIndexBatchRows rows; + std::vector file_ids; + file_ids.reserve(entries.size()); + rows.entries_snapshot.reserve(entries.size()); + for (auto& entry : entries) { 
+ file_ids.push_back(entry.file_id); + rows.entries_snapshot.push_back(EntrySnapshot{ + entry.file_index, entry.file_id, std::move(entry.file_path)}); + } + + utilities::indexer::IndexDatabase idx_db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + auto scalar_rows = idx_db.query_file_scalar_stats_batch(file_ids); + std::unordered_map merge_targets; + merge_targets.reserve(file_ids.size()); + + std::vector missing_ids; + missing_ids.reserve(file_ids.size()); + rows.num_chunks.reserve(scalar_rows.size()); + rows.merged_stats.reserve(scalar_rows.size()); + for (auto& [file_id, merged] : scalar_rows) { + rows.num_chunks.emplace(file_id, merged.num_chunks); + auto merged_entry = + rows.merged_stats.emplace(file_id, std::move(merged.stats)); + merge_targets.emplace(file_id, &merged_entry.first->second); + } + for (const auto file_id : file_ids) { + if (rows.num_chunks.find(file_id) == rows.num_chunks.end()) { + missing_ids.push_back(file_id); + } + } + const bool needs_categories = + query_type == StatisticsQueryType::SUMMARY || + query_type == StatisticsQueryType::CATEGORIES || + query_type == StatisticsQueryType::TOP_N_CATEGORIES; + const bool needs_names = query_type == StatisticsQueryType::NAMES || + query_type == StatisticsQueryType::TOP_N_NAMES; + const bool needs_pid_tids = query_type == StatisticsQueryType::SUMMARY || + query_type == StatisticsQueryType::PID_TIDS; + + if (needs_categories) { + idx_db.merge_file_category_counts_batch_into(file_ids, merge_targets); + } + if (needs_names) { + idx_db.merge_file_name_counts_batch_into(file_ids, merge_targets); + } + if (needs_pid_tids) { + idx_db.merge_file_pid_tid_counts_batch_into(file_ids, merge_targets); + } + if (!missing_ids.empty()) { + rows.fallback_merged_stats = + idx_db.query_merged_statistics_batch(missing_ids); + } + return rows; +} + +class SharedIndexStatisticsReader { + public: + SharedIndexStatisticsReader() = default; + + coro::CoroTask query( + std::string 
index_path, std::vector entries, + StatisticsQueryType query_type) const { + co_return query_shared_index_batch(std::move(index_path), + std::move(entries), query_type); + } + + template + static void process_batch_results(SharedIndexBatchRows& batch_rows, + Callback& callback) { + for (const auto& [file_index, file_id, file_path] : + batch_rows.entries_snapshot) { + const auto chunks_it = batch_rows.num_chunks.find(file_id); + if (chunks_it != batch_rows.num_chunks.end()) { + TraceStatistics stats; + stats.file_path = file_path; + stats.num_chunks = chunks_it->second; + stats.merged = std::move(batch_rows.merged_stats[file_id]); + stats.success = stats.num_chunks > 0; + if (!stats.success) { + stats.error_message = + "No chunk statistics in index for " + file_path; + } + callback(file_index, std::move(stats)); + continue; + } + + auto merged_it = batch_rows.fallback_merged_stats.find(file_id); + callback(file_index, + build_trace_statistics_from_index( + file_path, + merged_it == batch_rows.fallback_merged_stats.end() + ? 
nullptr + : &merged_it->second)); + } + } + + private: + static TraceStatistics build_trace_statistics_from_index( + const std::string& file_path, + utilities::indexer::MergedStatisticsResult* merged) { + TraceStatistics result; + result.file_path = file_path; + + if (merged == nullptr || merged->num_chunks == 0) { + result.success = false; + result.error_message = + "No chunk statistics in index for " + file_path; + return result; + } + + result.num_chunks = merged->num_chunks; + result.merged = std::move(merged->stats); + result.success = true; + return result; + } +}; + +} // namespace dftracer::utils::utilities::composites::dft::statistics + +#endif diff --git a/include/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h b/include/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h index 7141ba23..c844c49f 100644 --- a/include/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h @@ -6,6 +6,7 @@ #include #include +#include namespace dftracer::utils::utilities::composites::dft::statistics { @@ -15,6 +16,11 @@ struct StatisticsAggregatorInput { std::string index_dir; }; +struct StatisticsAggregatorBatchInput { + std::vector file_paths; + std::string index_path; +}; + class StatisticsAggregatorUtility : public utilities::Utility { @@ -23,6 +29,9 @@ class StatisticsAggregatorUtility coro::CoroTask process( const StatisticsAggregatorInput& input) override; + + coro::CoroTask> process_batch( + const StatisticsAggregatorBatchInput& input); }; } // namespace dftracer::utils::utilities::composites::dft::statistics diff --git a/include/dftracer/utils/utilities/composites/dft/views/view_reader_utility.h b/include/dftracer/utils/utilities/composites/dft/views/view_reader_utility.h index 7709f9e8..46c44ac1 100644 --- 
a/include/dftracer/utils/utilities/composites/dft/views/view_reader_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/views/view_reader_utility.h @@ -1,14 +1,15 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VIEWS_VIEW_READER_UTILITY_H #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VIEWS_VIEW_READER_UTILITY_H +#include #include #include #include #include #include - #ifdef DFTRACER_UTILS_ENABLE_ARROW #include +#include #endif #include @@ -56,6 +57,8 @@ struct ViewReaderBatch { #ifdef DFTRACER_UTILS_ENABLE_ARROW common::arrow::ArrowExportResult to_arrow() const; + common::arrow::ArrowExportResult to_arrow( + common::arrow::RecordBatchBuilder& builder) const; #endif }; diff --git a/include/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h b/include/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h new file mode 100644 index 00000000..b5d96146 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h @@ -0,0 +1,148 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_BLOOM_VISITOR_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_BLOOM_VISITOR_H + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::indexer { +class IndexBatchSink; +class IndexDatabaseWriterContext; +} // namespace dftracer::utils::utilities::indexer + +namespace dftracer::utils::utilities::composites::dft::visitors { + +class BloomVisitor : public DftEventVisitor { + public: + using HashResolutions = indexing::HashResolutions; + using ChunkStatistics = indexing::ChunkStatistics; + using ChunkDimensionStats = indexing::ChunkDimensionStats; + using ChunkIndexerConfig = indexing::ChunkIndexerConfig; + + /// Fixed bloom filter slots. Indices match DEFAULT_BLOOM_DIMENSIONS + /// order: name, cat, pid, tid, hhash, fhash, shash. 
+ enum FixedBloom : std::uint8_t { + BF_NAME = 0, + BF_CAT, + BF_PID, + BF_TID, + BF_HHASH, + BF_FHASH, + BF_SHASH, + BF_COUNT + }; + + /// Fixed dimension_stats slots. Superset of bloom dims plus pid_tid, + /// ts, dur (which are observed for range stats but not hashed). + enum FixedDim : std::uint8_t { + FD_NAME = 0, + FD_CAT, + FD_PID, + FD_TID, + FD_PID_TID, + FD_HHASH, + FD_FHASH, + FD_SHASH, + FD_TS, + FD_DUR, + FD_COUNT + }; + + struct ChunkState { + std::array fixed_blooms; + std::array fixed_dim_stats; + std::vector extra_blooms; + std::vector extra_dim_stats; + ChunkStatistics statistics; + HashResolutions hash_resolutions; + std::size_t events_processed = 0; + + ChunkState(); + }; + + BloomVisitor(ChunkIndexerConfig config, + std::vector dimensions); + BloomVisitor(const BloomVisitor&) = delete; + BloomVisitor& operator=(const BloomVisitor&) = delete; + BloomVisitor(BloomVisitor&&) noexcept = default; + BloomVisitor& operator=(BloomVisitor&&) noexcept = default; + + void begin(std::size_t num_checkpoints) override; + void on_checkpoint(std::size_t checkpoint_idx) override; + void on_event(const EventRecord& record) override; + + std::unique_ptr create_parallel_slice() const override; + void merge_parallel_slice(DftEventVisitor& slice) override; + + void finalize(indexer::IndexDatabaseWriterContext& writer, int file_id); + /// Emit bloom / stats / dimension records plus name dictionary/postings + /// to a sink backend. Skips ROOT_* summaries (rebuilt separately by + /// `IndexDatabase::rebuild_root_summaries()`). Works for both the + /// RocksDB-backed writer and the SST writer. + void finalize_sink_only(indexer::IndexBatchSink& sink, int file_id); + + /// Emit per-checkpoint chunk records (bloom, stats, dim_stats, + /// name_chunk_postings) using the current `chunks_` buffer, merge their + /// state into the persistent file-level accumulator, then clear + /// `chunks_` and advance the base index. 
Used for mid-chunk slice + /// rotation when `chunks_` would otherwise grow unbounded. + void flush_per_checkpoint_to_sink(indexer::IndexBatchSink& sink, + int file_id); + + /// Emit file-level records (file_bloom, scalar_stats, counts, + /// dimensions, name_dictionary, name_file_postings) from the persistent + /// accumulator. Call once at end-of-file. + void finalize_file_to_sink(indexer::IndexBatchSink& sink, int file_id); + + std::size_t num_chunks() const { return chunks_base_idx_ + chunks_.size(); } + + /// Total event count across already-flushed chunks plus the currently + /// buffered ones. Reflects all events ingested via on_event() so far. + std::uint64_t total_events() const { + std::uint64_t total = file_acc_.statistics.total_events; + for (const auto& chunk : chunks_) { + total += chunk.statistics.total_events; + } + return total; + } + + private: + void ensure_chunk(std::size_t checkpoint_idx); + + ChunkIndexerConfig config_; + std::vector extra_dim_names_; + std::vector chunks_; + /// Number of checkpoints already flushed and dropped from `chunks_`. + /// `chunks_[i]` represents checkpoint `chunks_base_idx_ + i`. 
+ std::size_t chunks_base_idx_ = 0; + + struct FileAccumulator { + std::array fixed_blooms; + std::vector extra_blooms; + ChunkStatistics statistics; + std::size_t num_chunks_emitted = 0; + bool initialized = false; + }; + FileAccumulator file_acc_; + + std::uint64_t last_pid_ = UINT64_MAX; + std::uint64_t last_tid_ = UINT64_MAX; + char last_pid_buf_[24] = {}; + char last_tid_buf_[24] = {}; + std::uint8_t last_pid_len_ = 0; + std::uint8_t last_tid_len_ = 0; +}; + +} // namespace dftracer::utils::utilities::composites::dft::visitors + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_BLOOM_VISITOR_H diff --git a/include/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.h b/include/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.h new file mode 100644 index 00000000..5281a06a --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.h @@ -0,0 +1,57 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_HASH_TABLE_VISITOR_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_HASH_TABLE_VISITOR_H + +#include + +#include +#include +#include +#include + +namespace dftracer::utils::utilities::indexer { +class IndexBatchSink; +} + +namespace dftracer::utils::utilities::composites::dft::visitors { + +/// Captures FH/HH/SH/PR metadata events during indexing and stores them +/// in HASH_TABLES column family with bidirectional lookups: +/// - Forward (hash -> name): for resolving hashes in output +/// - Reverse (name -> hash): for query DSL like `file_name == "/path/..."` +class HashTableVisitor : public DftEventVisitor { + public: + /// Hash table types matching dfanalyzer naming conventions + enum class HashType : std::uint8_t { + FILE = 0, // fhash <-> file_name + HOST = 1, // hhash <-> host_name + STRING = 2, // shash <-> string value + PROC = 3 // phash <-> proc metadata + }; + + HashTableVisitor() = default; + HashTableVisitor(const HashTableVisitor&) = delete; + 
HashTableVisitor& operator=(const HashTableVisitor&) = delete; + HashTableVisitor(HashTableVisitor&&) noexcept = default; + HashTableVisitor& operator=(HashTableVisitor&&) noexcept = default; + + void begin(std::size_t num_checkpoints) override; + void on_checkpoint(std::size_t checkpoint_idx) override; + void on_event(const EventRecord& record) override; + + std::unique_ptr create_parallel_slice() const override; + void merge_parallel_slice(DftEventVisitor& slice) override; + + void finalize(indexer::IndexBatchSink& writer, int file_id); + + std::size_t num_entries() const; + + private: + std::unordered_map file_hashes_; + std::unordered_map host_hashes_; + std::unordered_map string_hashes_; + std::unordered_map proc_metadata_; +}; + +} // namespace dftracer::utils::utilities::composites::dft::visitors + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_HASH_TABLE_VISITOR_H diff --git a/include/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.h b/include/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.h new file mode 100644 index 00000000..752e0448 --- /dev/null +++ b/include/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.h @@ -0,0 +1,59 @@ +#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_MANIFEST_VISITOR_H +#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_MANIFEST_VISITOR_H + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::indexer { +class IndexBatchSink; +} + +namespace dftracer::utils::utilities::composites::dft::visitors { + +class ManifestVisitor : public DftEventVisitor { + public: + ManifestVisitor() = default; + + void begin(std::size_t num_checkpoints) override; + void on_checkpoint(std::size_t checkpoint_idx) override; + void on_event(const EventRecord& record) override; + + std::unique_ptr create_parallel_slice() const override; + void merge_parallel_slice(DftEventVisitor& slice) override; + void 
set_line_offset(std::size_t offset) override { line_offset_ = offset; } + std::size_t parallel_event_count() const override { return event_count_; } + + void finalize(indexer::IndexBatchSink& writer, int file_id); + + /// Emit per-checkpoint event/metadata line records and clear the + /// vectors. Used for mid-chunk slice rotation. + void flush_per_checkpoint_to_sink(indexer::IndexBatchSink& sink, + int file_id); + + /// Emit file-level records (observed pids). Call once at end-of-file. + void finalize_file_to_sink(indexer::IndexBatchSink& sink, int file_id); + + private: + void ensure_chunk(std::size_t checkpoint_idx); + + using EventKey = std::pair; + using LineVec = std::vector; + + std::vector> event_lines_; + std::vector> metadata_lines_; + std::unordered_set observed_pids_; + std::size_t event_count_ = 0; + std::size_t line_offset_ = 0; + std::size_t base_idx_ = 0; +}; + +} // namespace dftracer::utils::utilities::composites::dft::visitors + +#endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_MANIFEST_VISITOR_H diff --git a/include/dftracer/utils/utilities/fileio/chunk_writer.h b/include/dftracer/utils/utilities/fileio/chunk_writer.h index d61f8472..7c4432e4 100644 --- a/include/dftracer/utils/utilities/fileio/chunk_writer.h +++ b/include/dftracer/utils/utilities/fileio/chunk_writer.h @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -20,6 +21,11 @@ struct ChunkWriterConfig { int compression_level = Z_DEFAULT_COMPRESSION; bool json_array_wrapper = true; + using ChunkRotationCallback = std::function; + ChunkRotationCallback on_chunk_complete; + ChunkWriterConfig& with_output_dir(std::string dir) { output_dir = std::move(dir); return *this; @@ -44,6 +50,10 @@ struct ChunkWriterConfig { json_array_wrapper = enabled; return *this; } + ChunkWriterConfig& with_on_chunk_complete(ChunkRotationCallback callback) { + on_chunk_complete = std::move(callback); + return *this; + } }; struct ChunkInfo { diff --git 
a/include/dftracer/utils/utilities/fileio/parallel/layout.h b/include/dftracer/utils/utilities/fileio/parallel/layout.h new file mode 100644 index 00000000..4a16e6e4 --- /dev/null +++ b/include/dftracer/utils/utilities/fileio/parallel/layout.h @@ -0,0 +1,59 @@ +#ifndef DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_LAYOUT_H +#define DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_LAYOUT_H + +#include +#include + +namespace dftracer::utils::utilities::fileio::parallel { + +enum class FileLayout { + SHARDED, // N files, glob by name; used on NFS + STRIPED, // single file, atomic-offset pwrite; used on local and PFS +}; + +enum class FilesystemKind { + UNKNOWN, + LOCAL, // ext4, xfs, btrfs, tmpfs, etc. + NFS, + LUSTRE, + GPFS, + BEEGFS, +}; + +struct LayoutInfo { + FileLayout layout = FileLayout::STRIPED; + FilesystemKind fs = FilesystemKind::UNKNOWN; + std::size_t stripe_size = 0; // 0 if unknown/not applicable + std::size_t stripe_count = 0; // 0 if unknown/not applicable +}; + +/// Detect layout for a path (the file need not exist yet; falls back to the +/// parent directory). NFS maps to SHARDED, everything else to STRIPED. +LayoutInfo detect_layout(const std::string& path) noexcept; + +struct WriterSizing { + std::size_t num_workers = 0; + std::size_t flush_threshold = 0; + std::size_t buffer_capacity = 0; +}; + +/// Minimum stripe_size for which the padded-striped layout is worth picking. +/// Below this, compressed payloads may not reliably fit one stripe, so we +/// fall back to the atomic-offset striped writer. +inline constexpr std::size_t MIN_PADDED_STRIPE_BYTES = 1 * 1024 * 1024; + +/// Pure sizing policy. Worker count is capped by stripe_count on PFS. +/// For the atomic-offset striped writer, flush_threshold = max(default, +/// stripe_size) to keep pwrites large. For the padded striped writer, +/// flush_threshold = stripe_size so the compressed result fits in one stripe. +/// `baseline_workers` should already be capped at any caller-specific limit +/// (e.g. number of aggregation shards).
+WriterSizing compute_writer_sizing(const LayoutInfo& info, + std::size_t baseline_workers, + std::size_t default_flush_bytes, + std::size_t buffer_headroom_bytes, + bool padded_layout = false) noexcept; + +} // namespace dftracer::utils::utilities::fileio::parallel + +#endif // DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_LAYOUT_H diff --git a/include/dftracer/utils/utilities/fileio/parallel/merge.h b/include/dftracer/utils/utilities/fileio/parallel/merge.h new file mode 100644 index 00000000..b5e4cc4d --- /dev/null +++ b/include/dftracer/utils/utilities/fileio/parallel/merge.h @@ -0,0 +1,20 @@ +#ifndef DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_MERGE_H +#define DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_MERGE_H + +#include + +#include +#include + +namespace dftracer::utils::utilities::fileio::parallel { + +/// Concatenate `shards` into `target` (truncating) and unlink the shards on +/// success. Valid for any format whose bytes concatenate cleanly (plain +/// JSON/NDJSON, gzip members). Shards are left in place on failure. +/// Returns 0 on success, -1 on any I/O failure. +coro::CoroTask merge_shards(const std::string& target, + const std::vector& shards); + +} // namespace dftracer::utils::utilities::fileio::parallel + +#endif // DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_MERGE_H diff --git a/include/dftracer/utils/utilities/fileio/parallel/parallel_writer.h b/include/dftracer/utils/utilities/fileio/parallel/parallel_writer.h new file mode 100644 index 00000000..3f1aeec0 --- /dev/null +++ b/include/dftracer/utils/utilities/fileio/parallel/parallel_writer.h @@ -0,0 +1,92 @@ +#ifndef DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_PARALLEL_WRITER_H +#define DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_PARALLEL_WRITER_H + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils { +class CoroScope; +} + +namespace dftracer::utils::utilities::fileio::parallel { + +/// Parallel file writer interface. 
Concrete impls (striped, sharded) hide the +/// on-disk layout; for gzip output the caller must feed standalone gzip +/// members so chunks stay valid at any offset. +class ParallelWriter { + public: + virtual ~ParallelWriter() = default; + + /// Create/truncate backing storage. `scope` may be null for layouts that + /// don't spawn internal coroutines; padded-striped requires a non-null + /// scope that outlives close(). + virtual coro::CoroTask open(std::string path, std::size_t num_workers, + bool gzip_extension, CoroScope* scope) = 0; + + /// Prologue, written before any worker chunk. + virtual coro::CoroTask write_header(ByteView data) = 0; + + /// Striped: placed at an atomic offset. Sharded: appended to shard N. + virtual coro::CoroTask write_chunk(std::size_t worker_idx, + ByteView data) = 0; + + /// Epilogue, written after all workers drain. + virtual coro::CoroTask write_footer(ByteView data) = 0; + + virtual coro::CoroTask close() = 0; + + /// One entry for striped; N entries (read order) for sharded. + virtual std::vector output_paths() const = 0; + + /// Per-write_chunk layout entry: byte offset + length of one independently + /// decompressible gzip member (or raw chunk for non-gzip layouts). + struct MemberSpan { + std::uint64_t offset; + std::uint64_t length; + }; + + /// Member offsets recorded by `write_chunk`, sorted by ascending offset. + /// Returned span is owned by the writer; valid until destruction. + /// Must be called after `close()` (no concurrent writes). + /// Empty for layouts that don't expose member boundaries. + virtual std::span member_layout() const { return {}; } + + /// Span of the most recent `write_chunk(worker_idx, ...)` call on this + /// worker. Caller must invoke immediately after `co_await write_chunk()` + /// returns; subsequent calls overwrite. For sharded layouts the offset + /// is shard-local; remap with `shard_base_offsets()` after close.
+ virtual std::optional last_member( + std::size_t /*worker_idx*/) const { + return std::nullopt; + } + + /// Per-worker base offset to add to a shard-local `MemberSpan.offset` to + /// get the merged-file offset. Empty by default (no remap needed for + /// single-stream layouts). Call after `close()`. + virtual std::vector shard_base_offsets() const { return {}; } +}; + +struct WriterConfig { + FileLayout layout = FileLayout::STRIPED; + std::size_t stripe_size = 0; // PFS stripe; 0 disables padded layout + bool gzip = false; +}; + +std::unique_ptr make_writer(const WriterConfig& cfg); +std::unique_ptr make_striped_writer(); +std::unique_ptr make_sharded_writer(); +std::unique_ptr make_padded_striped_writer( + std::size_t stripe_size); + +} // namespace dftracer::utils::utilities::fileio::parallel + +#endif // DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_PARALLEL_WRITER_H diff --git a/include/dftracer/utils/utilities/filesystem/directory_scanner_utility.h b/include/dftracer/utils/utilities/filesystem/directory_scanner_utility.h index 561c7486..5bec6dc7 100644 --- a/include/dftracer/utils/utilities/filesystem/directory_scanner_utility.h +++ b/include/dftracer/utils/utilities/filesystem/directory_scanner_utility.h @@ -3,6 +3,9 @@ #include #include +#include +#include +#include #include #include #include @@ -20,13 +23,16 @@ namespace dftracer::utils::utilities::filesystem { struct DirectoryScannerUtilityInput { fs::path path; bool recursive = false; // Whether to scan subdirectories + bool populate_size = true; - explicit DirectoryScannerUtilityInput(fs::path p, bool rec = false) - : path(std::move(p)), recursive(rec) {} + explicit DirectoryScannerUtilityInput(fs::path p, bool rec = false, + bool with_size = true) + : path(std::move(p)), recursive(rec), populate_size(with_size) {} // Equality operator for caching/hashing bool operator==(const DirectoryScannerUtilityInput& other) const { - return path == other.path && recursive == other.recursive; + return path == 
other.path && recursive == other.recursive && + populate_size == other.populate_size; } bool operator!=(const DirectoryScannerUtilityInput& other) const { @@ -56,9 +62,9 @@ struct DirectoryScannerUtilityInput { * @endcode */ class DirectoryScannerUtility - : public utilities::Utility, - utilities::tags::Parallelizable> { + : public utilities::Utility< + DirectoryScannerUtilityInput, std::vector, + utilities::tags::Parallelizable, utilities::tags::NeedsContext> { public: DirectoryScannerUtility() = default; ~DirectoryScannerUtility() = default; @@ -73,7 +79,7 @@ class DirectoryScannerUtility */ coro::CoroTask> process( const DirectoryScannerUtilityInput& input) override { - std::vector entries; + std::vector raw_entries; if (!fs::exists(input.path)) { throw fs::filesystem_error( @@ -91,15 +97,39 @@ class DirectoryScannerUtility // Recursive directory iteration for (const auto& entry : fs::recursive_directory_iterator(input.path)) { - entries.emplace_back(entry.path()); + raw_entries.push_back(entry); } } else { // Non-recursive directory iteration for (const auto& entry : fs::directory_iterator(input.path)) { - entries.emplace_back(entry.path()); + raw_entries.push_back(entry); } } + if (!this->has_context()) { + std::vector entries; + entries.reserve(raw_entries.size()); + for (const auto& entry : raw_entries) { + entries.emplace_back(entry, input.populate_size); + } + co_return entries; + } + + CoroScope& ctx = this->context(); + std::vector> tasks; + tasks.reserve(raw_entries.size()); + for (auto& entry : raw_entries) { + auto entry_copy = std::move(entry); + tasks.push_back( + ctx.spawn([entry_copy = std::move(entry_copy), + populate_size = input.populate_size]( + CoroScope&) mutable -> coro::CoroTask { + co_return FileEntry(entry_copy, populate_size); + })); + } + std::vector entries = + co_await coro::when_all(std::move(tasks)); + co_return entries; } }; diff --git a/include/dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h 
b/include/dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h index d934856f..e98a2685 100644 --- a/include/dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h +++ b/include/dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h @@ -2,6 +2,7 @@ #define DFTRACER_UTILS_UTILITIES_FILESYSTEM_PATTERN_DIRECTORY_SCANNER_H #include +#include #include #include #include @@ -18,14 +19,18 @@ namespace dftracer::utils::utilities::filesystem { struct PatternDirectoryScannerUtilityInput { std::string path; bool recursive = false; + bool populate_size = true; std::vector patterns; // e.g., {".pfw", ".pfw.gz", "*.txt"} PatternDirectoryScannerUtilityInput() = default; PatternDirectoryScannerUtilityInput(std::string p, std::vector pats, - bool rec = false) - : path(std::move(p)), recursive(rec), patterns(std::move(pats)) {} + bool rec = false, bool with_size = true) + : path(std::move(p)), + recursive(rec), + populate_size(with_size), + patterns(std::move(pats)) {} static PatternDirectoryScannerUtilityInput from_path(std::string p) { PatternDirectoryScannerUtilityInput input; @@ -43,6 +48,11 @@ struct PatternDirectoryScannerUtilityInput { recursive = rec; return *this; } + + PatternDirectoryScannerUtilityInput& with_populate_size(bool with_size) { + populate_size = with_size; + return *this; + } }; /** @@ -63,9 +73,9 @@ struct PatternDirectoryScannerUtilityInput { * @endcode */ class PatternDirectoryScannerUtility - : public utilities::Utility, - utilities::tags::Parallelizable> { + : public utilities::Utility< + PatternDirectoryScannerUtilityInput, std::vector, + utilities::tags::Parallelizable, utilities::tags::NeedsContext> { private: DirectoryScannerUtility base_scanner_; @@ -81,9 +91,15 @@ class PatternDirectoryScannerUtility coro::CoroTask> process( const PatternDirectoryScannerUtilityInput& input) override { // Step 1: Use base DirectoryScanner - DirectoryScannerUtilityInput dir_input{input.path, input.recursive}; - 
std::vector all_entries = - co_await base_scanner_.process(dir_input); + DirectoryScannerUtilityInput dir_input{input.path, input.recursive, + input.populate_size}; + std::vector all_entries; + if (this->has_context()) { + all_entries = + co_await this->context().spawn(base_scanner_, dir_input); + } else { + all_entries = co_await base_scanner_.process(dir_input); + } // Step 2: Filter by patterns std::vector matched_entries; diff --git a/include/dftracer/utils/utilities/filesystem/types.h b/include/dftracer/utils/utilities/filesystem/types.h index bff24a6d..f41ca47d 100644 --- a/include/dftracer/utils/utilities/filesystem/types.h +++ b/include/dftracer/utils/utilities/filesystem/types.h @@ -16,16 +16,29 @@ struct FileEntry { FileEntry() = default; - explicit FileEntry(const fs::path& p) + explicit FileEntry(const fs::path& p, bool populate_size = true) : path(p), size(0), is_directory(false), is_regular_file(false) { if (fs::exists(p)) { is_directory = fs::is_directory(p); is_regular_file = fs::is_regular_file(p); - if (is_regular_file) { + if (populate_size && is_regular_file) { size = fs::file_size(p); } } } + + explicit FileEntry(const fs::directory_entry& entry, + bool populate_size = true) + : path(entry.path()), + size(0), + is_directory(false), + is_regular_file(false) { + is_directory = entry.is_directory(); + is_regular_file = entry.is_regular_file(); + if (populate_size && is_regular_file) { + size = static_cast(entry.file_size()); + } + } }; } // namespace dftracer::utils::utilities::filesystem diff --git a/include/dftracer/utils/utilities/hash/fnv1a_hasher_utility.h b/include/dftracer/utils/utilities/hash/fnv1a_hasher_utility.h index 1ba7159b..6b8b6e88 100644 --- a/include/dftracer/utils/utilities/hash/fnv1a_hasher_utility.h +++ b/include/dftracer/utils/utilities/hash/fnv1a_hasher_utility.h @@ -9,6 +9,51 @@ namespace dftracer::utils::utilities::hash { +// FNV-1a constants +inline constexpr std::uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325ULL; 
+inline constexpr std::uint64_t FNV1A_PRIME = 0x00000100000001B3ULL; + +// One-shot FNV-1a 64-bit hash of `len` bytes; len == 0 yields the basis +inline std::uint64_t fnv1a_hash(const void* data, std::size_t len) { + std::uint64_t hash = FNV1A_OFFSET_BASIS; + const auto* bytes = static_cast<const unsigned char*>(data); + for (std::size_t i = 0; i < len; ++i) { + hash ^= bytes[i]; + hash *= FNV1A_PRIME; + } + return hash; +} + +inline std::uint64_t fnv1a_hash(std::string_view data) { + return fnv1a_hash(data.data(), data.size()); +} + +// Incremental FNV-1a builder; update_value() hashes sizeof(T) raw bytes +struct Fnv1aHashBuilder { + std::uint64_t state = FNV1A_OFFSET_BASIS; + + void update(const void* data, std::size_t len) { + const auto* bytes = static_cast<const unsigned char*>(data); + for (std::size_t i = 0; i < len; ++i) { + state ^= bytes[i]; + state *= FNV1A_PRIME; + } + } + + void update(std::string_view data) { update(data.data(), data.size()); } + + template <typename T> + void update_value(const T& val) { + update(&val, sizeof(val)); + } + + std::uint64_t finish() const { return state; } + std::uint32_t finish32() const { + // XOR-fold 64-bit to 32-bit + return static_cast<std::uint32_t>(state ^ (state >> 32)); + } +}; + /** * @brief FNV-1a 64-bit streaming hasher utility.
* @@ -18,10 +63,7 @@ namespace dftracer::utils::utilities::hash { */ class Fnv1aHasherUtility : public internal::BaseHasherUtility { private: - static constexpr std::uint64_t FNV_OFFSET_BASIS = 0xcbf29ce484222325ULL; - static constexpr std::uint64_t FNV_PRIME = 0x00000100000001B3ULL; - - std::uint64_t state_ = FNV_OFFSET_BASIS; + std::uint64_t state_ = FNV1A_OFFSET_BASIS; public: Fnv1aHasherUtility() { reset(); } @@ -29,14 +71,14 @@ class Fnv1aHasherUtility : public internal::BaseHasherUtility { ~Fnv1aHasherUtility() override = default; void reset() override { - state_ = FNV_OFFSET_BASIS; + state_ = FNV1A_OFFSET_BASIS; current_hash_ = Hash{0}; } void update(std::string_view data) override { for (unsigned char c : data) { state_ ^= c; - state_ *= FNV_PRIME; + state_ *= FNV1A_PRIME; } current_hash_ = Hash{static_cast(state_)}; } diff --git a/include/dftracer/utils/utilities/indexer/file_partition.h b/include/dftracer/utils/utilities/indexer/file_partition.h new file mode 100644 index 00000000..6d0d0dd4 --- /dev/null +++ b/include/dftracer/utils/utilities/indexer/file_partition.h @@ -0,0 +1,53 @@ +#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_FILE_PARTITION_H +#define DFTRACER_UTILS_UTILITIES_INDEXER_FILE_PARTITION_H + +#include + +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::indexer { + +/// Greedy Longest-Processing-Time-first (LPT) bin-packing of files into +/// `num_workers` partitions, heuristically balancing per-worker total size. +/// +/// Used to reduce straggler tails in the distributed indexer: workers +/// see file lists whose total bytes are approximately balanced. +/// +/// Complexity: O(N log N) for the initial sort + O(N log K) for the +/// min-heap over K = num_workers. `files` is consumed.
+inline std::vector> plan_lpt_partition( + std::vector files, std::size_t num_workers) { + if (num_workers == 0) num_workers = 1; + + std::vector> buckets(num_workers); + if (files.empty()) return buckets; + + std::sort(files.begin(), files.end(), + [](const auto& a, const auto& b) { return a.size > b.size; }); + + // Min-heap of (total_size, bucket_idx): next file goes to the currently + // lightest bucket. + using HeapEntry = std::pair; + std::priority_queue, std::greater<>> heap; + for (std::size_t i = 0; i < num_workers; ++i) { + heap.emplace(0, i); + } + + for (auto& entry : files) { + auto [total, idx] = heap.top(); + heap.pop(); + total += entry.size; + buckets[idx].push_back(std::move(entry)); + heap.emplace(total, idx); + } + + return buckets; +} + +} // namespace dftracer::utils::utilities::indexer + +#endif // DFTRACER_UTILS_UTILITIES_INDEXER_FILE_PARTITION_H diff --git a/include/dftracer/utils/utilities/indexer/index_batch_sink.h b/include/dftracer/utils/utilities/indexer/index_batch_sink.h new file mode 100644 index 00000000..28f68bc2 --- /dev/null +++ b/include/dftracer/utils/utilities/indexer/index_batch_sink.h @@ -0,0 +1,155 @@ +#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_BATCH_SINK_H +#define DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_BATCH_SINK_H + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::indexer { + +/// Abstract sink that accepts index records for a batch of files. +/// +/// Two backends implement this: +/// - `IndexDatabaseWriterContext`: writes directly to a live RocksDB. +/// - `IndexDatabaseSstWriterContext`: writes to SST files for later bulk +/// ingest (process-safe fan-out for distributed indexing). +/// +/// The interface covers file metadata, checkpoints, manifest event ranges +/// and metadata lines, bloom/stats/dimension records, name postings, +/// hash-table entries, and aggregation / system-metrics records.
+class IndexBatchSink { + public: + using IndexerCheckpoint = internal::IndexerCheckpoint; + using ChunkStatistics = composites::dft::indexing::ChunkStatistics; + using ChunkDimensionStats = composites::dft::indexing::ChunkDimensionStats; + + virtual ~IndexBatchSink() = default; + + virtual void insert_file_metadata(int file_id, + std::uint64_t checkpoint_size, + std::uint64_t total_lines, + std::uint64_t total_uc_size) = 0; + + virtual void insert_checkpoint(int file_id, + const IndexerCheckpoint& checkpoint) = 0; + + virtual void insert_event_range( + int file_id, std::uint64_t checkpoint_idx, std::string_view cat, + std::string_view name, std::span line_numbers) = 0; + + virtual void insert_metadata_lines( + int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type, + std::span line_numbers) = 0; + + virtual void insert_file_pids( + int file_id, const std::unordered_set& pids) = 0; + + // Bloom / stats / dimension CFs -------------------------------------- + + virtual void insert_chunk_bloom_filter( + int file_id, std::uint64_t checkpoint_idx, std::string_view dimension, + std::span blob_data, + std::uint64_t num_entries) = 0; + + virtual void insert_file_bloom_filter( + int file_id, std::string_view dimension, + std::span blob_data, + std::uint64_t num_entries) = 0; + + virtual void insert_chunk_statistics(int file_id, + std::uint64_t checkpoint_idx, + const ChunkStatistics& stats) = 0; + + virtual void insert_file_scalar_stats(int file_id, + const ChunkStatistics& stats, + std::uint64_t num_chunks) = 0; + + virtual void insert_file_category_counts( + int file_id, const StringViewMap& counts) = 0; + + virtual void insert_file_pid_tid_counts( + int file_id, const StringViewMap& counts) = 0; + + virtual void insert_file_name_counts( + int file_id, const StringViewMap& counts) = 0; + + virtual void insert_index_dimension(int file_id, + std::string_view dimension) = 0; + + virtual void insert_chunk_dimension_stats( + int file_id, std::uint64_t 
checkpoint_idx, + const ChunkDimensionStats& stats, + std::size_t value_counts_cap = 4096) = 0; + + // Name dictionary + postings. `name_id` is a 64-bit FNV1a hash of `name` + // (deterministic, stateless), so multiple workers can emit the same + // (name_id, name) pair without coordination. Dictionary duplicates are + // dropped via `ingest_behind=true` at bulk-ingest time. Posting keys + // include the file_id, which is worker-disjoint, so they need no + // coordination either. + virtual void insert_name_dictionary_entry(std::uint64_t name_id, + std::string_view name) = 0; + + virtual void insert_name_file_posting(std::uint64_t name_id, + int file_id) = 0; + + virtual void insert_name_chunk_posting(std::uint64_t name_id, int file_id, + std::uint64_t checkpoint_idx) = 0; + + // Content-addressed hash table (FH/HH/SH/PR). Writes both the forward + // (hash -> name) and reverse (name -> hash) entries. Deterministic keys + // mean different workers emit identical (key, value) pairs for shared + // hashes; cross-worker duplicates are resolved at read time via the LSM + // sequence number. + virtual void insert_hash_table_entry(std::uint8_t type, + std::string_view hash, + std::string_view name) = 0; + + // Aggregation column family. The AGGREGATION CF holds a mix of + // Merge-operand records (per-`(pid, time_bucket, ...)` aggregated + // stats) and Put records (intern-dictionary entries using the + // AGG_INTERN_DICT_PREFIX prefix, global-config key, per-file + // completion markers, and EventAggregator finalization metadata). + // A rocksdb merge_operator collapses Merge operands at read/compaction + // time; the concrete writer routes to `db_->merge` / `db_->put` via + // the shared WriteBatch, the SST writer buffers `(key, value, op_kind)` + // tuples and emits a mixed-op SST on commit. 
+ virtual void insert_aggregation_merge(std::string_view key, + std::string_view operand) = 0; + + virtual void insert_aggregation_put(std::string_view key, + std::string_view value) = 0; + + // System-metrics column family. Merge-operand only in practice (no + // intern dictionary sidecar). + virtual void insert_system_metrics_merge(std::string_view key, + std::string_view operand) = 0; + + // Convenience overloads forwarding to span variants; concrete classes + // need not override. + void insert_event_range(int file_id, std::uint64_t checkpoint_idx, + std::string_view cat, std::string_view name, + const std::vector& line_numbers) { + insert_event_range(file_id, checkpoint_idx, cat, name, + std::span(line_numbers)); + } + + void insert_metadata_lines(int file_id, std::uint64_t checkpoint_idx, + std::string_view meta_type, + const std::vector& line_numbers) { + insert_metadata_lines(file_id, checkpoint_idx, meta_type, + std::span(line_numbers)); + } +}; + +} // namespace dftracer::utils::utilities::indexer + +#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_BATCH_SINK_H diff --git a/include/dftracer/utils/utilities/indexer/index_builder_utility.h b/include/dftracer/utils/utilities/indexer/index_builder_utility.h index cf86a8af..bca20b15 100644 --- a/include/dftracer/utils/utilities/indexer/index_builder_utility.h +++ b/include/dftracer/utils/utilities/indexer/index_builder_utility.h @@ -5,36 +5,51 @@ #include #include #include +#include #include +#include +#include +#include #include +#include +#include +#include #include +#include #include +namespace dftracer::utils { +class CoroScope; +} // namespace dftracer::utils + namespace dftracer::utils::utilities::indexer { -inline std::vector default_bloom_dimensions() { - return {"name", "cat", "pid", "tid", "hhash", "fhash", "shash"}; -} +using dftracer::utils::CoroScope; + +inline constexpr std::array DEFAULT_BLOOM_DIMENSIONS = { + "name", "cat", "pid", "tid", "hhash", "fhash", "shash", +}; + +inline constexpr 
std::array DEFAULT_EXTRA_DIMENSIONS = { + "ret", "count", "offset", "epoch", "step", +}; struct IndexBuildConfig { std::string file_path; std::string index_dir; std::size_t checkpoint_size = 32 * 1024 * 1024; - std::size_t index_threshold = - constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD; bool force_rebuild = false; - bool build_bloom = false; bool build_manifest = false; composites::dft::indexing::ChunkIndexerConfig bloom_config; std::vector bloom_dimensions; + std::vector> + extra_dft_visitors; static IndexBuildConfig for_file(const std::string& path); IndexBuildConfig& with_index_dir(const std::string& dir); IndexBuildConfig& with_checkpoint_size(std::size_t size); - IndexBuildConfig& with_index_threshold(std::size_t threshold); IndexBuildConfig& with_force_rebuild(bool force); - IndexBuildConfig& with_bloom(bool enable = true); IndexBuildConfig& with_manifest(bool enable = true); IndexBuildConfig& with_bloom_config( const composites::dft::indexing::ChunkIndexerConfig& config); @@ -53,6 +68,103 @@ struct IndexBuildResult { std::string error_message; }; +struct IndexBuildBatchConfig { + std::vector file_paths; + std::string index_dir; + std::size_t checkpoint_size = 32 * 1024 * 1024; + std::size_t parallelism = 1; + bool force_rebuild = false; + bool build_manifest = false; + composites::dft::indexing::ChunkIndexerConfig bloom_config; + std::vector bloom_dimensions; + bool use_batch_write = true; + bool rebuild_root_summaries = true; + + /// If > 0, process files in sub-batches of this size, flushing parsed + /// artifacts to the write phase between sub-batches. Bounds peak memory + /// to ~flush_every_files worth of ParsedBloomJob state. 0 = no flush + /// (all files parsed before any write). + std::size_t flush_every_files = 0; + + /// Factory for creating per-file DftEventVisitors during the parse phase. + /// Called once per file with the file path. Caller owns the returned + /// visitors and can extract results after the batch completes. 
+ using DftVisitorFactory = std::function< + std::vector>( + const std::string& file_path)>; + DftVisitorFactory dft_visitor_factory; + + /// Optional drain callback invoked once per sub-batch with the extra + /// visitors for that sub-batch's files. Lets the caller consume and + /// release visitor state immediately, keeping memory bounded by + /// flush_every_files instead of accumulating across the whole pipeline. + using ExtraVisitorsDrainFn = std::function>>)>; + ExtraVisitorsDrainFn extra_visitors_drain; + + /// If non-empty, parallel to `file_paths`: use these file_ids instead + /// of allocating via `get_or_create_file_info`. Used by the distributed + /// indexer where the coordinator pre-registers all files. When set, + /// the write phase skips the DEFAULT-CF registry open/write step. + std::vector preassigned_file_ids; + + /// Optional per-file member slice (cross-rank file splitting). When + /// non-empty, must be parallel to `file_paths`. A null/empty entry + /// means "process the whole file"; a populated entry restricts the + /// build to `[member_begin, member_end)`. The `members` vector must + /// outlive the batch (typically stored in a shared member map). + struct FileSlice { + const std::vector* members = nullptr; + std::size_t member_begin = 0; + std::size_t member_end = 0; + std::uint64_t checkpoint_idx_base = 0; + /// When true, this file's file-scoped data (checkpoints, + /// bloom/manifest/hashtable, file_metadata) is NOT persisted by + /// the write phase. Aggregation/system-metrics SSTs produced by + /// extra visitors are still collected. Set by the MPI driver for + /// sliced ranks where `member_begin > 0` to avoid cross-rank key + /// collisions on file-scoped CFs. + bool skip_file_scoped_writes = false; + }; + std::vector file_slices; + + /// Optional batch-sink factory. If set, the write phase constructs a + /// fresh sink per batch via this factory instead of opening the + /// RocksDB-backed writer on `index_dir`. 
Used by the distributed (SST) + /// pipeline to route writes to per-worker SstWriterContext instances. + /// `sink_commit` must also be set and is responsible for finalising + /// each sink (RocksDB path: call .commit(); SST path: flush + route + /// Artifacts to a registry). + using SinkFactory = std::function()>; + using SinkCommitFn = std::function; + SinkFactory sink_factory; + SinkCommitFn sink_commit; +}; + +struct IndexBuildBatchMetrics { + std::uint64_t parse_ns = 0; + std::uint64_t write_ns = 0; + std::size_t files_enqueued = 0; + std::size_t files_parsed = 0; + std::size_t files_written = 0; +}; + +struct IndexBuildBatchResult { + std::vector results; + std::size_t indexed = 0; + std::size_t skipped = 0; + std::size_t failed = 0; + std::uint64_t total_events = 0; + IndexBuildBatchMetrics metrics; + + /// Per-file extra visitors created by dft_visitor_factory during parsing. + /// Index corresponds to the file index in the original file_paths vector. + /// Empty vectors for files that failed or had no factory. 
+ std::vector>> + extra_visitors; +}; + class IndexBuilderUtility : public Utility { public: @@ -60,6 +172,12 @@ class IndexBuilderUtility const IndexBuildConfig& config) override; }; +class IndexBatchBuilderUtility { + public: + static coro::CoroTask process( + CoroScope* scope, std::shared_ptr config); +}; + } // namespace dftracer::utils::utilities::indexer #endif // DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_BUILDER_UTILITY_H diff --git a/include/dftracer/utils/utilities/indexer/index_database.h b/include/dftracer/utils/utilities/indexer/index_database.h index 76846a7c..bf79d741 100644 --- a/include/dftracer/utils/utilities/indexer/index_database.h +++ b/include/dftracer/utils/utilities/indexer/index_database.h @@ -3,67 +3,25 @@ #include #include -#include -#include -#include -#include -#include +#include #include #include #include -#include +#include #include #include #include +#include #include namespace dftracer::utils::utilities::indexer { -/** - * @brief Unified `.dftindex` RocksDB store combining checkpoint, bloom - * filter, manifest, and archive metadata. - * - * Schema is additive: call init_base_schema() always, then - * init_bloom_schema() and/or init_manifest_schema() as needed. - * - * All query/insert/delete operations are exposed as methods so callers - * never need to use the queries:: namespace directly. 
- */ +class IndexDatabaseWriterContext; +class SstArtifactRegistry; + class IndexDatabase { public: - // Re-export result types so callers don't need query headers - using ChunkBloomResult = - composites::dft::indexing::queries::ChunkBloomResult; - using FileBloomResult = composites::dft::indexing::queries::FileBloomResult; - using ChunkStatisticsResult = - composites::dft::indexing::queries::ChunkStatisticsResult; - using TimeBounds = composites::dft::indexing::queries::TimeBounds; - using EventRangeResult = - composites::dft::indexing::queries::EventRangeResult; - using MetadataLinesResult = - composites::dft::indexing::queries::MetadataLinesResult; - using ChunkStatistics = composites::dft::indexing::ChunkStatistics; - using ChunkDimensionStats = composites::dft::indexing::ChunkDimensionStats; - using ChunkDimensionStatsResult = - composites::dft::indexing::ChunkDimensionStatsResult; - using IndexerCheckpoint = internal::IndexerCheckpoint; - struct TarArchiveMetadata { - std::string archive_name; - std::uint64_t checkpoint_size = 0; - std::uint64_t total_lines = 0; - std::uint64_t total_uc_size = 0; - std::uint64_t total_files = 0; - }; - struct TarFileRecord { - std::string file_name; - std::uint64_t file_size = 0; - std::uint64_t file_mtime = 0; - char typeflag = '\0'; - std::uint64_t data_offset = 0; - std::uint64_t uncompressed_offset = 0; - }; - explicit IndexDatabase( const std::string& index_path, dftracer::utils::rocksdb::RocksDatabase::OpenMode open_mode = @@ -77,84 +35,94 @@ class IndexDatabase { ~IndexDatabase() = default; - // Schema initialisation — idempotent (CREATE TABLE IF NOT EXISTS) - void init_base_schema(); - void init_bloom_schema(); - void init_manifest_schema(); + std::unique_ptr begin_write(); + + /// Ingest SST files produced by `IndexDatabaseSstWriterContext` instances. + /// File-id ranges across input SSTs must be disjoint for the METADATA, + /// CHECKPOINTS, and MANIFEST column families (step-1 scope). 
No-op on an + /// empty registry. Does NOT refresh root summaries; call + /// `rebuild_root_summaries()` afterward once all ingest phases are done. + /// + /// `skip_cfs` optionally holds CF names (e.g. `cf::AGGREGATION`) whose + /// SSTs must be left outside the unified DB. Distributed builds use this + /// to keep per-worker AGGREGATION / SYSTEM_METRICS SSTs addressable by + /// manifest for parallel reads at analyze time. + void bulk_ingest(const SstArtifactRegistry& registry, + const std::unordered_set& skip_cfs = {}); + + /// Recompute ROOT_SCALAR_STATS, ROOT_{CAT,NAME,PID_TID}_COUNTS from the + /// current per-file CFs. Call after `bulk_ingest` completes, or whenever + /// root-level summaries need to be regenerated from scratch. + void rebuild_root_summaries(); + + /// Write the aggregation global-config key (0xFFFE) into the + /// AGGREGATION CF. Required for `iter_arrow_dfanalyzer_all` to recognise + /// the index as aggregator-populated. Distributed builds call this after + /// `bulk_ingest(skip_cfs={aggregation, system_metrics})` so the unified + /// DB has a config marker even though the AGG SSTs live in the manifest. + /// `consolidate_index` invokes it too before the deferred AGG ingest. + void write_agg_global_config(std::uint64_t time_interval_us, + std::uint32_t config_hash = 0); + + /// Write per-file aggregation completion markers (0xFFFF + file_id BE) + /// into the AGGREGATION CF. The index resolver treats these as "this + /// file has aggregated data"; without them, `ensure_indexed()` concludes + /// the aggregation tier is incomplete and re-runs the build. Distributed + /// builds must call this after `bulk_ingest`, since per-worker SSTs + /// carry data but not markers (markers are written via direct db->put, + /// not via the SST sink). + void write_agg_file_markers(const std::vector& file_ids); + + /// Merge per-worker AssociationTracker blobs and write the result to + /// the AGGREGATION CF under the `__tracker__` key. 
+ void write_aggregation_tracker(const std::vector& blobs); + + /// Atomically reserve `count` contiguous file_ids, returning the first + /// id in the range `[first, first + count)`. Intended for the + /// distributed indexer: coordinator hands each worker its own disjoint + /// range up front so workers need no cross-worker coordination. + int reserve_file_id_range(std::size_t count); + + /// Register a list of trace files in the DEFAULT-CF file registry and + /// return the assigned file_ids (parallel to `file_paths`). Idempotent: + /// files already registered with a matching hash keep their existing + /// id. Used by the distributed indexer's coordinator to pre-register + /// every file before dispatching work to SST-backed workers, so workers + /// never need to touch the DEFAULT column family themselves. + std::vector register_files(const std::vector& file_paths, + bool build_manifest); + + std::shared_ptr db() const { + return db_; + } + + // Schema initialisation, idempotent + void init_schema(); + + // ----------------------------------------------------------------------- + // Read-only query API + // ----------------------------------------------------------------------- + + IndexFileEntryCapability get_file_capabilities(int file_id) const; - // Query helpers bool has_bloom_data(int file_id) const; bool has_manifest_data(int file_id) const; - int get_or_create_file_info(std::string_view path, std::uint64_t file_hash); int get_file_info_id(std::string_view path) const; std::optional get_file_hash(std::string_view path) const; + std::unordered_map query_all_file_info_ids() const; + std::unordered_map query_all_file_registry() + const; + std::unordered_set query_files_with_file_scalar_stats() const; + std::unordered_set query_files_with_bloom_data() const; - // Convenience: resolve file path to file_id (handles logical path) int find_file(std::string_view file_path) const; - // Metadata queries - void insert_file_metadata(int file_id, std::uint64_t 
checkpoint_size, - std::uint64_t total_lines, - std::uint64_t total_uc_size); std::uint64_t get_checkpoint_size(int file_id) const; std::uint64_t get_num_lines(int file_id) const; std::uint64_t get_max_bytes(int file_id) const; - - // Returns exact event count from chunk_statistics if bloom was built, - // otherwise falls back to num_lines (approximate). std::uint64_t get_total_events(int file_id) const; - void begin_transaction(); - void commit_transaction(); - void rollback_transaction() noexcept; - - // ----------------------------------------------------------------------- - // Bloom insert operations - // ----------------------------------------------------------------------- - - void insert_chunk_bloom_filter(int file_id, std::uint64_t checkpoint_idx, - std::string_view dimension, - std::span blob_data, - std::uint64_t num_entries); - - void insert_chunk_bloom_filter(int file_id, std::uint64_t checkpoint_idx, - std::string_view dimension, - const void* blob_data, int blob_size, - std::uint64_t num_entries); - - void insert_file_bloom_filter(int file_id, std::string_view dimension, - std::span blob_data, - std::uint64_t num_entries); - - void insert_file_bloom_filter(int file_id, std::string_view dimension, - const void* blob_data, int blob_size, - std::uint64_t num_entries); - - void insert_chunk_statistics(int file_id, std::uint64_t checkpoint_idx, - const ChunkStatistics& stats); - void insert_checkpoint(int file_id, const IndexerCheckpoint& checkpoint); - - void insert_index_dimension(int file_id, std::string_view dimension); - - void insert_hash_resolution(int file_id, std::string_view dimension, - std::string_view hash_value, - std::string_view resolved_value); - - void insert_chunk_dimension_stats(int file_id, std::uint64_t checkpoint_idx, - const ChunkDimensionStats& stats, - std::size_t value_counts_cap = 4096); - void insert_tar_archive_metadata(int file_id, std::string_view archive_name, - std::uint64_t checkpoint_size, - std::uint64_t total_lines, 
- std::uint64_t total_uc_size, - std::uint64_t total_files); - void insert_tar_file(int file_id, const TarFileRecord& record); - - // ----------------------------------------------------------------------- - // Bloom query operations - // ----------------------------------------------------------------------- - std::vector query_chunk_bloom_filters( int file_id, std::string_view dimension) const; @@ -175,6 +143,40 @@ class IndexDatabase { std::vector query_chunk_statistics( int file_id) const; + std::unordered_map> + query_chunk_statistics_batch(const std::vector& file_ids) const; + std::unordered_map + query_merged_statistics_batch(const std::vector& file_ids) const; + std::unordered_map + query_file_scalar_stats_batch(const std::vector& file_ids) const; + std::unordered_map query_file_metadata_batch( + const std::vector& file_ids) const; + std::unordered_map> + query_file_category_counts_batch(const std::vector& file_ids) const; + std::unordered_map> + query_file_pid_tid_counts_batch(const std::vector& file_ids) const; + std::unordered_map query_file_name_summaries_batch( + const std::vector& file_ids) const; + std::optional query_root_scalar_stats() const; + StringViewMap query_root_category_counts() const; + StringViewMap query_root_pid_tid_counts() const; + StringViewMap query_root_name_counts() const; + void merge_file_category_counts_batch_into( + const std::vector& file_ids, + std::unordered_map& targets) const; + void merge_file_pid_tid_counts_batch_into( + const std::vector& file_ids, + std::unordered_map& targets) const; + void merge_file_name_counts_batch_into( + const std::vector& file_ids, + std::unordered_map& targets) const; + void merge_root_category_counts_into(ChunkStatistics& target) const; + void merge_root_pid_tid_counts_into(ChunkStatistics& target) const; + void merge_root_name_counts_into(ChunkStatistics& target) const; + std::vector query_name_file_postings(std::string_view name) const; + std::vector 
query_name_chunk_postings(std::string_view name, + int file_id) const; + bool has_file_scalar_stats(int file_id) const; bool find_checkpoint(int file_id, std::size_t target_offset, IndexerCheckpoint& checkpoint) const; std::vector query_checkpoints(int file_id) const; @@ -193,76 +195,89 @@ class IndexDatabase { std::vector query_chunk_dimension_stats( int file_id) const; + std::unordered_map> + query_chunk_dimension_stats_batch(const std::vector& file_ids) const; std::vector query_chunk_dimension_stats_for_dimension(int file_id, std::string_view dimension) const; - // Global queries (search across all files) - std::optional query_resolved_by_hash( - std::string_view dimension, std::string_view hash_value) const; + std::optional query_name_id(std::string_view name) const; + std::optional query_name_by_id(std::uint64_t name_id) const; - std::vector query_hash_by_resolved( - std::string_view dimension, std::string_view resolved_value) const; + std::vector query_event_ranges(int file_id) const; - // ----------------------------------------------------------------------- - // Bloom delete operations - // ----------------------------------------------------------------------- + std::vector query_event_ranges_for_checkpoint( + int file_id, std::uint64_t checkpoint_idx) const; + + std::vector query_metadata_lines(int file_id) const; - void delete_chunk_bloom_filters(int file_id, std::string_view dimension); - void delete_file_bloom_filter(int file_id, std::string_view dimension); - void delete_chunk_statistics(int file_id); - void delete_chunk_dimension_stats(int file_id); - void delete_hash_resolutions(int file_id); + std::vector query_metadata_lines_for_checkpoint( + int file_id, std::uint64_t checkpoint_idx) const; // ----------------------------------------------------------------------- - // Manifest insert operations + // PID manifest query API (for distributed aggregation) // ----------------------------------------------------------------------- - void 
insert_event_range(int file_id, std::uint64_t checkpoint_idx, - std::string_view cat, std::string_view name, - std::span line_numbers); + /// Query the set of PIDs observed in a specific file. + std::unordered_set query_file_pids(int file_id) const; - void insert_event_range(int file_id, std::uint64_t checkpoint_idx, - std::string_view cat, std::string_view name, - const std::vector& line_numbers); - - void insert_metadata_lines(int file_id, std::uint64_t checkpoint_idx, - std::string_view meta_type, - std::span line_numbers); - - void insert_metadata_lines(int file_id, std::uint64_t checkpoint_idx, - std::string_view meta_type, - const std::vector& line_numbers); + /// Query the PIDs for all files at once. + /// Returns {file_id -> set of PIDs}. + std::unordered_map> + query_all_file_pids() const; // ----------------------------------------------------------------------- - // Manifest query operations + // Hash table query API (FH/HH/SH/PR mappings) // ----------------------------------------------------------------------- - std::vector query_event_ranges(int file_id) const; + enum class HashType : std::uint8_t { + FILE = 0, // FH: file hash -> file name + HOST = 1, // HH: host hash -> host name + STRING = 2, // SH: string hash -> string value + PROC = 3 // PR: proc hash -> proc metadata + }; - std::vector query_event_ranges_for_checkpoint( - int file_id, std::uint64_t checkpoint_idx) const; + /// Query all entries of a given hash type. + /// Returns map of {hash_value -> resolved_name}. + std::unordered_map query_hash_table( + HashType type) const; - std::vector query_metadata_lines(int file_id) const; + /// Resolve a single hash to its name. + /// Returns nullopt if hash is not found. + std::optional resolve_hash(HashType type, + std::string_view hash) const; - std::vector query_metadata_lines_for_checkpoint( - int file_id, std::uint64_t checkpoint_idx) const; + /// Query all hash tables at once. + /// Returns {type -> {hash -> name}}. 
+ std::unordered_map> + query_all_hash_tables() const; - // ----------------------------------------------------------------------- - // Manifest delete operations - // ----------------------------------------------------------------------- - - void delete_event_ranges(int file_id); - void delete_metadata_lines(int file_id); + /// Resolve a name to its hash (reverse lookup for query DSL). + /// Returns nullopt if name is not found. + std::optional resolve_name_to_hash( + HashType type, std::string_view name) const; private: - void delete_file_data(int file_id); + void ensure_hash_tables_cached() const; std::string db_path_; dftracer::utils::rocksdb::RocksDatabase::OpenMode open_mode_; std::shared_ptr db_; - std::unique_ptr txn_batch_; + + struct HashCache { + std::shared_mutex mutex; + bool loaded = false; + std::unordered_map file_hash; + std::unordered_map host_hash; + std::unordered_map string_hash; + std::unordered_map proc_hash; + std::unordered_map file_name; + std::unordered_map host_name; + std::unordered_map string_name; + std::unordered_map proc_name; + }; + mutable std::unique_ptr hash_cache_; }; } // namespace dftracer::utils::utilities::indexer diff --git a/include/dftracer/utils/utilities/indexer/index_database_sst_writer_context.h b/include/dftracer/utils/utilities/indexer/index_database_sst_writer_context.h new file mode 100644 index 00000000..7ec980a8 --- /dev/null +++ b/include/dftracer/utils/utilities/indexer/index_database_sst_writer_context.h @@ -0,0 +1,303 @@ +#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_DATABASE_SST_WRITER_CONTEXT_H +#define DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_DATABASE_SST_WRITER_CONTEXT_H + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::indexer { + +/// Per-batch SST emitter that implements `IndexBatchSink` by buffering +/// (key, value) pairs in memory per column family and flushing them to sorted +/// SST files on `commit()`. 
+/// +/// Usage is identical to `IndexDatabaseWriterContext`: construct one per +/// batch, call the `insert_*` methods, then `commit()`. The returned +/// `Artifacts` hold the paths of the SST files produced, which a coordinator +/// later ingests via `IndexDatabase::bulk_ingest()`. +/// +/// Process-safe: holds no RocksDB handle. Many contexts run concurrently +/// across threads or processes, provided each is given a disjoint file_id +/// range so SST key prefixes do not overlap. +class IndexDatabaseSstWriterContext : public IndexBatchSink { + public: + struct Artifacts { + std::optional metadata_sst; + std::optional checkpoints_sst; + std::optional manifest_sst; + std::optional chunk_bloom_sst; + std::optional file_bloom_sst; + std::optional chunk_stats_sst; + std::optional chunk_dim_stats_sst; + std::optional dimensions_sst; + std::optional file_scalar_stats_sst; + std::optional file_cat_counts_sst; + std::optional file_pid_tid_counts_sst; + std::optional file_name_counts_sst; + std::optional name_dictionary_sst; + std::optional name_file_postings_sst; + std::optional name_chunk_postings_sst; + std::optional hash_tables_sst; + std::optional aggregation_sst; + std::optional system_metrics_sst; + + bool empty() const noexcept { + return !metadata_sst.has_value() && !checkpoints_sst.has_value() && + !manifest_sst.has_value() && !chunk_bloom_sst.has_value() && + !file_bloom_sst.has_value() && + !chunk_stats_sst.has_value() && + !chunk_dim_stats_sst.has_value() && + !dimensions_sst.has_value() && + !file_scalar_stats_sst.has_value() && + !file_cat_counts_sst.has_value() && + !file_pid_tid_counts_sst.has_value() && + !file_name_counts_sst.has_value() && + !name_dictionary_sst.has_value() && + !name_file_postings_sst.has_value() && + !name_chunk_postings_sst.has_value() && + !hash_tables_sst.has_value() && + !aggregation_sst.has_value() && + !system_metrics_sst.has_value(); + } + + /// Move every populated SST file to `dest_dir` (created if missing) + /// and return 
a new Artifacts whose paths point at the new location. + /// Uses `fs::rename` when src and dst resolve to the same filesystem + /// (O(1), atomic) and falls back to copy + unlink across filesystems. + /// Intended for the node-local -> shared FS handoff in the + /// distributed indexer. Rvalue-qualified: the original Artifacts is + /// left empty. + Artifacts move_to(std::string_view dest_dir) &&; + }; + + /// Build SSTs into a unique subdirectory under `staging_dir`. `batch_id` + /// must be unique across concurrent writers pointing at the same staging + /// root so paths do not collide. + IndexDatabaseSstWriterContext(std::string staging_dir, + std::string batch_id); + + IndexDatabaseSstWriterContext(const IndexDatabaseSstWriterContext&) = + delete; + IndexDatabaseSstWriterContext& operator=( + const IndexDatabaseSstWriterContext&) = delete; + + IndexDatabaseSstWriterContext(IndexDatabaseSstWriterContext&&) noexcept; + IndexDatabaseSstWriterContext& operator=( + IndexDatabaseSstWriterContext&&) noexcept; + + ~IndexDatabaseSstWriterContext() override; + + using IndexBatchSink::insert_event_range; + using IndexBatchSink::insert_metadata_lines; + + void insert_file_metadata(int file_id, std::uint64_t checkpoint_size, + std::uint64_t total_lines, + std::uint64_t total_uc_size) override; + + void insert_checkpoint(int file_id, + const IndexerCheckpoint& checkpoint) override; + + void insert_event_range( + int file_id, std::uint64_t checkpoint_idx, std::string_view cat, + std::string_view name, + std::span line_numbers) override; + + void insert_metadata_lines( + int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type, + std::span line_numbers) override; + + void insert_file_pids( + int file_id, const std::unordered_set& pids) override; + + void insert_chunk_bloom_filter(int file_id, std::uint64_t checkpoint_idx, + std::string_view dimension, + std::span blob_data, + std::uint64_t num_entries) override; + + void insert_file_bloom_filter(int file_id, 
std::string_view dimension, + std::span blob_data, + std::uint64_t num_entries) override; + + void insert_chunk_statistics(int file_id, std::uint64_t checkpoint_idx, + const ChunkStatistics& stats) override; + + void insert_file_scalar_stats(int file_id, const ChunkStatistics& stats, + std::uint64_t num_chunks) override; + + void insert_file_category_counts( + int file_id, const StringViewMap& counts) override; + + void insert_file_pid_tid_counts( + int file_id, const StringViewMap& counts) override; + + void insert_file_name_counts( + int file_id, const StringViewMap& counts) override; + + void insert_index_dimension(int file_id, + std::string_view dimension) override; + + void insert_chunk_dimension_stats( + int file_id, std::uint64_t checkpoint_idx, + const ChunkDimensionStats& stats, + std::size_t value_counts_cap = 4096) override; + + void insert_name_dictionary_entry(std::uint64_t name_id, + std::string_view name) override; + + void insert_name_file_posting(std::uint64_t name_id, int file_id) override; + + void insert_name_chunk_posting(std::uint64_t name_id, int file_id, + std::uint64_t checkpoint_idx) override; + + void insert_hash_table_entry(std::uint8_t type, std::string_view hash, + std::string_view name) override; + + void insert_aggregation_merge(std::string_view key, + std::string_view operand) override; + + void insert_aggregation_put(std::string_view key, + std::string_view value) override; + + void insert_system_metrics_merge(std::string_view key, + std::string_view operand) override; + + /// Aggregation / system_metrics buffers hold mixed Put+Merge entries + /// in one CF. `is_merge` distinguishes them at emit time so the SST + /// records the right operation kind (rocksdb supports mixed-op SSTs). + struct MergeableKeyValue { + std::string key; + std::string value; + bool is_merge = true; + }; + + /// Sort buffers, emit one SST per non-empty column family, return the + /// resulting paths. Calling twice or after a move is a no-op. 
+ Artifacts commit(); + + private: + using KeyValue = std::pair; + + std::string staging_dir_; + std::string batch_id_; + bool committed_ = false; + + std::vector metadata_buf_; + std::vector checkpoints_buf_; + std::vector manifest_buf_; + std::vector chunk_bloom_buf_; + std::vector file_bloom_buf_; + std::vector chunk_stats_buf_; + std::vector chunk_dim_stats_buf_; + std::vector dimensions_buf_; + std::vector file_scalar_stats_buf_; + std::vector file_cat_counts_buf_; + std::vector file_pid_tid_counts_buf_; + std::vector file_name_counts_buf_; + std::vector name_dictionary_buf_; + std::vector name_file_postings_buf_; + std::vector name_chunk_postings_buf_; + std::vector hash_tables_buf_; + std::vector aggregation_buf_; + std::vector system_metrics_buf_; +}; + +/// Thread-safe collector for SST artifacts produced by many concurrent +/// `IndexDatabaseSstWriterContext` instances. The coordinator hands the +/// populated registry to `IndexDatabase::bulk_ingest()`. +class SstArtifactRegistry { + public: + void append(IndexDatabaseSstWriterContext::Artifacts artifacts) { + std::lock_guard lock(mutex_); + auto move_into = [](std::vector& dst, + std::optional& src) { + if (src) dst.push_back(std::move(*src)); + }; + move_into(metadata_, artifacts.metadata_sst); + move_into(checkpoints_, artifacts.checkpoints_sst); + move_into(manifest_, artifacts.manifest_sst); + move_into(chunk_bloom_, artifacts.chunk_bloom_sst); + move_into(file_bloom_, artifacts.file_bloom_sst); + move_into(chunk_stats_, artifacts.chunk_stats_sst); + move_into(chunk_dim_stats_, artifacts.chunk_dim_stats_sst); + move_into(dimensions_, artifacts.dimensions_sst); + move_into(file_scalar_stats_, artifacts.file_scalar_stats_sst); + move_into(file_cat_counts_, artifacts.file_cat_counts_sst); + move_into(file_pid_tid_counts_, artifacts.file_pid_tid_counts_sst); + move_into(file_name_counts_, artifacts.file_name_counts_sst); + move_into(name_dictionary_, artifacts.name_dictionary_sst); + 
move_into(name_file_postings_, artifacts.name_file_postings_sst); + move_into(name_chunk_postings_, artifacts.name_chunk_postings_sst); + move_into(hash_tables_, artifacts.hash_tables_sst); + move_into(aggregation_, artifacts.aggregation_sst); + move_into(system_metrics_, artifacts.system_metrics_sst); + } + + const std::vector& metadata() const { return metadata_; } + const std::vector& checkpoints() const { return checkpoints_; } + const std::vector& manifest() const { return manifest_; } + const std::vector& chunk_bloom() const { return chunk_bloom_; } + const std::vector& file_bloom() const { return file_bloom_; } + const std::vector& chunk_stats() const { return chunk_stats_; } + const std::vector& chunk_dim_stats() const { + return chunk_dim_stats_; + } + const std::vector& dimensions() const { return dimensions_; } + const std::vector& file_scalar_stats() const { + return file_scalar_stats_; + } + const std::vector& file_cat_counts() const { + return file_cat_counts_; + } + const std::vector& file_pid_tid_counts() const { + return file_pid_tid_counts_; + } + const std::vector& file_name_counts() const { + return file_name_counts_; + } + const std::vector& name_dictionary() const { + return name_dictionary_; + } + const std::vector& name_file_postings() const { + return name_file_postings_; + } + const std::vector& name_chunk_postings() const { + return name_chunk_postings_; + } + const std::vector& hash_tables() const { return hash_tables_; } + const std::vector& aggregation() const { return aggregation_; } + const std::vector& system_metrics() const { + return system_metrics_; + } + + private: + std::mutex mutex_; + std::vector metadata_; + std::vector checkpoints_; + std::vector manifest_; + std::vector chunk_bloom_; + std::vector file_bloom_; + std::vector chunk_stats_; + std::vector chunk_dim_stats_; + std::vector dimensions_; + std::vector file_scalar_stats_; + std::vector file_cat_counts_; + std::vector file_pid_tid_counts_; + std::vector 
file_name_counts_; + std::vector name_dictionary_; + std::vector name_file_postings_; + std::vector name_chunk_postings_; + std::vector hash_tables_; + std::vector aggregation_; + std::vector system_metrics_; +}; + +} // namespace dftracer::utils::utilities::indexer + +#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_DATABASE_SST_WRITER_CONTEXT_H diff --git a/include/dftracer/utils/utilities/indexer/index_database_writer_context.h b/include/dftracer/utils/utilities/indexer/index_database_writer_context.h new file mode 100644 index 00000000..2b70f822 --- /dev/null +++ b/include/dftracer/utils/utilities/indexer/index_database_writer_context.h @@ -0,0 +1,184 @@ +#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_DATABASE_WRITER_CONTEXT_H +#define DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_DATABASE_WRITER_CONTEXT_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::indexer { + +class IndexDatabase; + +class IndexDatabaseWriterContext : public IndexBatchSink { + public: + using IndexBatchSink::ChunkDimensionStats; + using IndexBatchSink::ChunkStatistics; + using IndexBatchSink::IndexerCheckpoint; + using IndexBatchSink::insert_event_range; + using IndexBatchSink::insert_metadata_lines; + struct TarFileRecord { + std::string file_name; + std::uint64_t file_size = 0; + std::uint64_t file_mtime = 0; + char typeflag = '\0'; + std::uint64_t data_offset = 0; + std::uint64_t uncompressed_offset = 0; + }; + + IndexDatabaseWriterContext(IndexDatabaseWriterContext&&) noexcept; + IndexDatabaseWriterContext& operator=( + IndexDatabaseWriterContext&&) noexcept; + IndexDatabaseWriterContext(const IndexDatabaseWriterContext&) = delete; + IndexDatabaseWriterContext& operator=(const IndexDatabaseWriterContext&) = + delete; + ~IndexDatabaseWriterContext() override; + + void commit(); + + // Read-through queries (needed by 
visitors during write) + bool has_file_scalar_stats(int file_id) const; + + // Schema initialisation + void init_schema(); + + // Registry/capability writes + int get_or_create_file_info( + std::string_view path, std::uint64_t file_hash, + IndexFileEntryCapability caps = IndexFileEntryCapability::NONE); + void set_file_capabilities(int file_id, IndexFileEntryCapability caps); + void set_file_capabilities_by_path(std::string_view logical_path, + IndexFileEntryCapability caps); + void add_file_capability(int file_id, IndexFileEntryCapability cap); + + // Metadata + void insert_file_metadata(int file_id, std::uint64_t checkpoint_size, + std::uint64_t total_lines, + std::uint64_t total_uc_size) override; + + // Bloom inserts + void insert_chunk_bloom_filter(int file_id, std::uint64_t checkpoint_idx, + std::string_view dimension, + std::span blob_data, + std::uint64_t num_entries) override; + + void insert_chunk_bloom_filter(int file_id, std::uint64_t checkpoint_idx, + std::string_view dimension, + const void* blob_data, int blob_size, + std::uint64_t num_entries); + + void insert_file_bloom_filter(int file_id, std::string_view dimension, + std::span blob_data, + std::uint64_t num_entries) override; + + void insert_file_bloom_filter(int file_id, std::string_view dimension, + const void* blob_data, int blob_size, + std::uint64_t num_entries); + + void insert_chunk_statistics(int file_id, std::uint64_t checkpoint_idx, + const ChunkStatistics& stats) override; + void insert_file_scalar_stats(int file_id, const ChunkStatistics& stats, + std::uint64_t num_chunks) override; + void insert_file_category_counts( + int file_id, const StringViewMap& counts) override; + void insert_file_pid_tid_counts( + int file_id, const StringViewMap& counts) override; + void insert_file_name_counts( + int file_id, const StringViewMap& counts) override; + std::uint64_t get_or_create_name_id(std::string_view name); + void insert_name_dictionary_entry(std::uint64_t name_id, + std::string_view 
name) override; + void insert_name_file_posting(std::uint64_t name_id, int file_id) override; + void insert_name_chunk_posting(std::uint64_t name_id, int file_id, + std::uint64_t checkpoint_idx) override; + void refresh_root_summaries_after_file_write( + int file_id, const ChunkStatistics& stats, std::uint64_t num_chunks, + bool had_existing_file_summary, std::uint64_t file_lines = 0, + std::uint64_t file_uncompressed_bytes = 0); + void rebuild_root_summaries(); + void insert_checkpoint(int file_id, + const IndexerCheckpoint& checkpoint) override; + + void insert_index_dimension(int file_id, + std::string_view dimension) override; + + /// Insert a hash table entry with bidirectional storage. + /// Forward: [type][hash] -> name (for output resolution) + /// Reverse: [type+4][name] -> hash (for query DSL) + /// Type: 0=FILE, 1=HOST, 2=STRING, 3=PROC + void insert_hash_table_entry(std::uint8_t type, std::string_view hash, + std::string_view name) override; + + // Aggregation / system-metrics CF writes. 
+ void insert_aggregation_merge(std::string_view key, + std::string_view operand) override; + + void insert_aggregation_put(std::string_view key, + std::string_view value) override; + + void insert_system_metrics_merge(std::string_view key, + std::string_view operand) override; + + void insert_chunk_dimension_stats( + int file_id, std::uint64_t checkpoint_idx, + const ChunkDimensionStats& stats, + std::size_t value_counts_cap = 4096) override; + void insert_tar_archive_metadata(int file_id, std::string_view archive_name, + std::uint64_t checkpoint_size, + std::uint64_t total_lines, + std::uint64_t total_uc_size, + std::uint64_t total_files); + void insert_tar_file(int file_id, const TarFileRecord& record); + + // Deletes + void delete_chunk_bloom_filters(int file_id, std::string_view dimension); + void delete_file_bloom_filter(int file_id, std::string_view dimension); + void delete_chunk_statistics(int file_id); + void delete_chunk_dimension_stats(int file_id); + void delete_file_contents(int file_id); + void delete_event_ranges(int file_id); + void delete_metadata_lines(int file_id); + + // Manifest inserts + void insert_event_range( + int file_id, std::uint64_t checkpoint_idx, std::string_view cat, + std::string_view name, + std::span line_numbers) override; + + void insert_metadata_lines( + int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type, + std::span line_numbers) override; + + /// Insert the set of PIDs observed in a file (for distributed aggregation) + void insert_file_pids( + int file_id, const std::unordered_set& pids) override; + + private: + friend class IndexDatabase; + explicit IndexDatabaseWriterContext( + std::shared_ptr db); + + std::shared_ptr db_; + dftracer::utils::rocksdb::RocksDatabase::Batch batch_; + bool committed_ = false; + std::int64_t cached_next_file_id_ = -1; +}; + +} // namespace dftracer::utils::utilities::indexer + +#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_DATABASE_WRITER_CONTEXT_H diff --git 
a/include/dftracer/utils/utilities/indexer/index_file_entry_capability.h b/include/dftracer/utils/utilities/indexer/index_file_entry_capability.h new file mode 100644 index 00000000..d7ae0973 --- /dev/null +++ b/include/dftracer/utils/utilities/indexer/index_file_entry_capability.h @@ -0,0 +1,39 @@ +#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_FILE_ENTRY_CAPABILITY_H +#define DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_FILE_ENTRY_CAPABILITY_H + +#include + +namespace dftracer::utils::utilities::indexer { + +enum class IndexFileEntryCapability : std::uint8_t { + NONE = 0, + BLOOM = 1 << 0, + MANIFEST = 1 << 1, + FILE_SUMMARY = 1 << 2, + CHECKPOINTS = 1 << 3, + INDEXING_COMPLETE = 1 << 4, +}; + +inline IndexFileEntryCapability operator|(IndexFileEntryCapability a, + IndexFileEntryCapability b) { + return static_cast(static_cast(a) | + static_cast(b)); +} +inline IndexFileEntryCapability operator&(IndexFileEntryCapability a, + IndexFileEntryCapability b) { + return static_cast(static_cast(a) & + static_cast(b)); +} +inline IndexFileEntryCapability& operator|=(IndexFileEntryCapability& a, + IndexFileEntryCapability b) { + a = a | b; + return a; +} +inline bool has_capability(IndexFileEntryCapability caps, + IndexFileEntryCapability flag) { + return (caps & flag) != IndexFileEntryCapability::NONE; +} + +} // namespace dftracer::utils::utilities::indexer + +#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_FILE_ENTRY_CAPABILITY_H diff --git a/include/dftracer/utils/utilities/indexer/index_types.h b/include/dftracer/utils/utilities/indexer/index_types.h new file mode 100644 index 00000000..845e6ecc --- /dev/null +++ b/include/dftracer/utils/utilities/indexer/index_types.h @@ -0,0 +1,81 @@ +#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_TYPES_H +#define DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_TYPES_H + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace dftracer::utils::utilities::indexer { + +using ChunkBloomResult 
= composites::dft::indexing::queries::ChunkBloomResult; +using FileBloomResult = composites::dft::indexing::queries::FileBloomResult; +using ChunkStatisticsResult = + composites::dft::indexing::queries::ChunkStatisticsResult; +using TimeBounds = composites::dft::indexing::queries::TimeBounds; +using EventRangeResult = composites::dft::indexing::queries::EventRangeResult; +using MetadataLinesResult = + composites::dft::indexing::queries::MetadataLinesResult; +using ChunkStatistics = composites::dft::indexing::ChunkStatistics; +using ChunkDimensionStats = composites::dft::indexing::ChunkDimensionStats; +using ChunkDimensionStatsResult = + composites::dft::indexing::ChunkDimensionStatsResult; +using IndexerCheckpoint = internal::IndexerCheckpoint; + +struct MergedStatisticsResult { + ChunkStatistics stats; + std::uint64_t num_chunks = 0; +}; + +struct RootStatisticsResult { + ChunkStatistics stats; + std::uint64_t num_chunks = 0; + std::uint64_t num_files = 0; + std::uint64_t total_lines = 0; + std::uint64_t total_uncompressed_bytes = 0; +}; + +struct NameSummaryResult { + StringViewMap counts; + std::uint64_t other_count = 0; + std::uint64_t unique_count = 0; +}; + +struct FileMetadataResult { + std::uint64_t checkpoint_size = 0; + std::uint64_t num_lines = 0; + std::uint64_t max_bytes = 0; +}; + +struct FileRegistryEntry { + int file_id = -1; + IndexFileEntryCapability capabilities = IndexFileEntryCapability::NONE; +}; + +struct TarArchiveMetadata { + std::string archive_name; + std::uint64_t checkpoint_size = 0; + std::uint64_t total_lines = 0; + std::uint64_t total_uc_size = 0; + std::uint64_t total_files = 0; +}; + +struct TarFileRecord { + std::string file_name; + std::uint64_t file_size = 0; + std::uint64_t file_mtime = 0; + char typeflag = '\0'; + std::uint64_t data_offset = 0; + std::uint64_t uncompressed_offset = 0; +}; + +} // namespace dftracer::utils::utilities::indexer + +#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_TYPES_H diff --git 
a/include/dftracer/utils/utilities/indexer/index_visitor.h b/include/dftracer/utils/utilities/indexer/index_visitor.h index f7dae313..13d815d0 100644 --- a/include/dftracer/utils/utilities/indexer/index_visitor.h +++ b/include/dftracer/utils/utilities/indexer/index_visitor.h @@ -1,12 +1,21 @@ #ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_VISITOR_H #define DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_VISITOR_H +#include + #include +#include +#include +#include #include namespace dftracer::utils::utilities::indexer { -class IndexDatabase; +class IndexDatabaseWriterContext; + +/// Shared buffer for zero-copy line passing. The string_view passed to +/// on_line points into this buffer; storing the shared_ptr keeps it alive. +using SharedLineBuffer = std::shared_ptr; class IndexVisitor { public: @@ -14,11 +23,41 @@ class IndexVisitor { virtual void begin(std::size_t num_checkpoints) = 0; - virtual void on_checkpoint(std::size_t checkpoint_idx) = 0; + virtual coro::CoroTask on_checkpoint(std::size_t checkpoint_idx) = 0; + + virtual coro::CoroTask on_chunk(const char* data, std::size_t len, + std::size_t checkpoint_idx) { + auto buffer = std::make_shared(data, len); + std::size_t pos = 0; + while (pos < len) { + const void* nl = std::memchr(data + pos, '\n', len - pos); + if (!nl) break; + std::size_t end = static_cast(nl) - data; + on_line(std::string_view(buffer->data() + pos, end - pos), buffer, + checkpoint_idx); + pos = end + 1; + } + co_return; + } + + /// Called for each line. The line string_view points into buffer. + /// Implementations that need the data to outlive this call should + /// store the buffer shared_ptr (zero-copy) rather than copying line. + virtual void on_line(std::string_view line, SharedLineBuffer buffer, + std::size_t checkpoint_idx) = 0; + + virtual coro::CoroTask flush() { co_return; } + + /// Cheap hint that drain_pending() should be called to apply + /// backpressure. Default false. Polled after each on_line call. 
+ virtual bool wants_drain() const noexcept { return false; } - virtual void on_line(std::string_view line, std::size_t checkpoint_idx) = 0; + /// Drain accumulated work via async ops (e.g. channel send). Suspends + /// the calling coroutine when downstream is full -- real backpressure + /// without blocking an executor thread. + virtual coro::CoroTask drain_pending() { co_return; } - virtual void finalize(IndexDatabase& db, int file_id) = 0; + virtual void finalize(IndexDatabaseWriterContext& writer, int file_id) = 0; }; } // namespace dftracer::utils::utilities::indexer diff --git a/include/dftracer/utils/utilities/indexer/internal/index_encoding.h b/include/dftracer/utils/utilities/indexer/internal/index_encoding.h new file mode 100644 index 00000000..6f9a7a4f --- /dev/null +++ b/include/dftracer/utils/utilities/indexer/internal/index_encoding.h @@ -0,0 +1,133 @@ +#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_INDEX_ENCODING_H +#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_INDEX_ENCODING_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::indexer::internal::encoding { + +std::string prefix_for_file(int file_id); + +/// DEFAULT-CF key holding the monotonically increasing counter for the next +/// file_id to assign. Used by both `get_or_create_file_info` (single-file +/// path) and `IndexDatabase::reserve_file_id_range` (distributed pre-alloc). 
+inline constexpr std::string_view NEXT_FILE_ID_KEY = "_next_file_id"; + +std::string metadata_key(int file_id); + +std::string checkpoint_key(int file_id, std::uint64_t uc_offset, + std::uint64_t checkpoint_idx); + +std::string manifest_event_key(int file_id, std::uint64_t checkpoint_idx, + std::string_view cat, std::string_view name); + +std::string manifest_metadata_key(int file_id, std::uint64_t checkpoint_idx, + std::string_view meta_type); + +std::string encode_metadata_record(std::uint64_t checkpoint_size, + std::uint64_t total_lines, + std::uint64_t total_uc_size); + +std::string encode_checkpoint_value(const IndexerCheckpoint& checkpoint); + +std::string encode_event_range_value(std::span lines); + +std::string encode_metadata_value(std::span lines); + +std::string file_pids_key(int file_id); + +std::string encode_file_pids_value( + const std::unordered_set& pids); + +// Bloom / stats / dimension CFs -------------------------------------------- + +std::string make_dimension_key(int file_id, std::string_view dimension); + +std::string chunk_bloom_key(int file_id, std::string_view dimension, + std::uint64_t checkpoint_idx); + +std::string file_bloom_key(int file_id, std::string_view dimension); + +std::string chunk_stats_key(int file_id, std::uint64_t checkpoint_idx); + +std::string file_scalar_stats_key(int file_id); +std::string file_category_counts_key(int file_id); +std::string file_pid_tid_counts_key(int file_id); +std::string file_name_counts_key(int file_id); + +std::string chunk_dim_stats_key(int file_id, std::uint64_t checkpoint_idx, + std::string_view dimension); + +// Name dictionary + postings (name_id is a 64-bit FNV1a hash of the name) --- + +std::string name_lookup_key(std::string_view name); +std::string name_reverse_key(std::uint64_t name_id); + +std::string name_file_posting_key(std::uint64_t name_id, int file_id); +std::string name_file_owner_key(int file_id, std::uint64_t name_id); +std::string name_file_owner_prefix(int file_id); + 
+std::string name_chunk_posting_key(std::uint64_t name_id, int file_id, + std::uint64_t checkpoint_idx); +std::string name_chunk_owner_key(int file_id, std::uint64_t name_id, + std::uint64_t checkpoint_idx); +std::string name_chunk_owner_prefix(int file_id); + +// Hash tables (content-addressed) ------------------------------------------ + +std::string hash_table_forward_key(std::uint8_t type, std::string_view hash); +std::string hash_table_reverse_key(std::uint8_t type, std::string_view name); + +std::string encode_bloom_value(std::span blob, + std::uint64_t num_entries); + +std::string encode_chunk_statistics_value( + const composites::dft::indexing::ChunkStatistics& stats); + +std::string encode_chunk_dimension_stats_value( + const composites::dft::indexing::ChunkDimensionStats& stats, + std::size_t value_counts_cap); + +// Count map and name summary encoders are templated so they can accept any +// map type exposing string keys and uint64 values. +template +std::string encode_count_map_value(const Map& counts) { + std::string value; + dftracer::utils::rocksdb::KeyCodec::append_be32( + value, static_cast(counts.size())); + for (const auto& [key, count] : counts) { + append_string(value, key); + append_u64(value, count); + } + return value; +} + +template +std::string encode_name_summary_value(const Map& counts, + std::uint64_t other_count, + std::uint64_t unique_count) { + std::string value; + dftracer::utils::rocksdb::KeyCodec::append_be32( + value, static_cast(counts.size())); + append_u64(value, other_count); + append_u64(value, unique_count); + for (const auto& [key, count] : counts) { + append_string(value, key); + append_u64(value, count); + } + return value; +} + +} // namespace dftracer::utils::utilities::indexer::internal::encoding + +#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_INDEX_ENCODING_H diff --git a/include/dftracer/utils/utilities/indexer/internal/payload_codec.h b/include/dftracer/utils/utilities/indexer/internal/payload_codec.h new 
file mode 100644 index 00000000..ea244c3a --- /dev/null +++ b/include/dftracer/utils/utilities/indexer/internal/payload_codec.h @@ -0,0 +1,140 @@ +#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_PAYLOAD_CODEC_H +#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_PAYLOAD_CODEC_H + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::indexer::internal { + +inline constexpr std::size_t DECODE_CONTEXT_BUF_SIZE = 256; +inline thread_local char g_decode_context[DECODE_CONTEXT_BUF_SIZE] = {}; + +struct DecodeContextGuard { + template + explicit DecodeContextGuard(const char* fmt, Args... args) { + std::memcpy(previous_, g_decode_context, DECODE_CONTEXT_BUF_SIZE); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-nonliteral" + std::snprintf(g_decode_context, DECODE_CONTEXT_BUF_SIZE, fmt, args...); +#pragma GCC diagnostic pop + } + + ~DecodeContextGuard() { + std::memcpy(g_decode_context, previous_, DECODE_CONTEXT_BUF_SIZE); + } + + DecodeContextGuard(const DecodeContextGuard&) = delete; + DecodeContextGuard& operator=(const DecodeContextGuard&) = delete; + + private: + char previous_[DECODE_CONTEXT_BUF_SIZE]; +}; + +inline void append_u8(std::string& out, std::uint8_t value) { + out.push_back(static_cast(value)); +} + +inline void append_u32(std::string& out, std::uint32_t value) { + dftracer::utils::rocksdb::KeyCodec::append_be32(out, value); +} + +inline void append_u64(std::string& out, std::uint64_t value) { + dftracer::utils::rocksdb::KeyCodec::append_be64(out, value); +} + +inline void append_i64(std::string& out, std::int64_t value) { + dftracer::utils::rocksdb::KeyCodec::append_be64( + out, static_cast(value)); +} + +inline void append_double(std::string& out, double value) { + static_assert(sizeof(double) == sizeof(std::uint64_t)); + std::uint64_t bits = 0; + std::memcpy(&bits, &value, sizeof(bits)); + append_u64(out, bits); +} + +inline void 
append_string(std::string& out, std::string_view value) { + append_u32(out, static_cast(value.size())); + out.append(value.data(), value.size()); +} + +inline void append_blob(std::string& out, std::span blob) { + append_u32(out, static_cast(blob.size())); + out.append(reinterpret_cast(blob.data()), blob.size()); +} + +class Cursor { + public: + explicit Cursor(std::string_view data) : data_(data) {} + + std::uint8_t u8() { return static_cast(take(1)[0]); } + + std::uint32_t u32() { + return dftracer::utils::rocksdb::KeyCodec::decode_be32(take(4)); + } + + std::uint64_t u64() { + return dftracer::utils::rocksdb::KeyCodec::decode_be64(take(8)); + } + + std::int64_t i64() { return static_cast(u64()); } + + double f64() { + std::uint64_t bits = u64(); + double value = 0.0; + std::memcpy(&value, &bits, sizeof(value)); + return value; + } + + std::string_view str_view() { + auto len = static_cast(u32()); + return take(len); + } + + std::string str() { + auto bytes = str_view(); + return std::string(bytes.data(), bytes.size()); + } + + std::vector blob() { + auto len = static_cast(u32()); + auto bytes = take(len); + return std::vector(bytes.begin(), bytes.end()); + } + + std::size_t offset() const { return offset_; } + bool eof() const { return offset_ >= data_.size(); } + + private: + std::string_view take(std::size_t len) { + if (offset_ + len > data_.size()) { + char err[DECODE_CONTEXT_BUF_SIZE + 64]; + if (g_decode_context[0] != '\0') { + std::snprintf(err, sizeof(err), "Corrupt RocksDB payload [%s]", + g_decode_context); + } else { + std::snprintf(err, sizeof(err), "Corrupt RocksDB payload"); + } + throw std::runtime_error(err); + } + auto chunk = data_.substr(offset_, len); + offset_ += len; + return chunk; + } + + std::string_view data_; + std::size_t offset_ = 0; +}; + +} // namespace dftracer::utils::utilities::indexer::internal + +#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_PAYLOAD_CODEC_H diff --git 
a/include/dftracer/utils/utilities/indexer/internal/statistics_codec.h b/include/dftracer/utils/utilities/indexer/internal/statistics_codec.h new file mode 100644 index 00000000..003d5ea0 --- /dev/null +++ b/include/dftracer/utils/utilities/indexer/internal/statistics_codec.h @@ -0,0 +1,26 @@ +#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_STATISTICS_CODEC_H +#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_STATISTICS_CODEC_H + +#include + +#include +#include +#include + +namespace dftracer::utils::utilities::indexer::internal { + +std::string encode_file_scalar_stats_value(const ChunkStatistics& stats, + std::uint64_t num_chunks); + +std::string encode_root_scalar_stats_value( + const ChunkStatistics& stats, std::uint64_t num_chunks, + std::uint64_t num_files, std::uint64_t total_lines = 0, + std::uint64_t total_uncompressed_bytes = 0); + +MergedStatisticsResult decode_file_scalar_stats_value(std::string_view value); + +RootStatisticsResult decode_root_scalar_stats_value(std::string_view value); + +} // namespace dftracer::utils::utilities::indexer::internal + +#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_STATISTICS_CODEC_H diff --git a/include/dftracer/utils/utilities/indexer/provenance_database.h b/include/dftracer/utils/utilities/indexer/provenance_database.h index 974eb771..2353f552 100644 --- a/include/dftracer/utils/utilities/indexer/provenance_database.h +++ b/include/dftracer/utils/utilities/indexer/provenance_database.h @@ -71,8 +71,8 @@ class ProvenanceDatabase { std::string_view predicate); void insert_segment(int file_info_id, int source_idx, int source_checkpoint, - int output_line_start, int output_line_end, - int event_count); + int segment_seq, int output_line_start, + int output_line_end, int event_count); // ----------------------------------------------------------------------- // Provenance query operations diff --git a/include/dftracer/utils/utilities/indexer/visitors/bloom_visitor.h 
b/include/dftracer/utils/utilities/indexer/visitors/bloom_visitor.h deleted file mode 100644 index ee450aed..00000000 --- a/include/dftracer/utils/utilities/indexer/visitors/bloom_visitor.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_VISITORS_BLOOM_VISITOR_H -#define DFTRACER_UTILS_UTILITIES_INDEXER_VISITORS_BLOOM_VISITOR_H - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -namespace dftracer::utils::utilities::indexer { - -class BloomVisitor : public IndexVisitor { - public: - using BloomFilterMap = std::unordered_map< - std::string, - dftracer::utils::utilities::composites::dft::indexing::BloomFilter>; - using HashResolutions = - dftracer::utils::utilities::composites::dft::indexing::HashResolutions; - using ChunkStatistics = - dftracer::utils::utilities::composites::dft::indexing::ChunkStatistics; - using ChunkDimensionStats = dftracer::utils::utilities::composites::dft:: - indexing::ChunkDimensionStats; - using ChunkIndexerConfig = dftracer::utils::utilities::composites::dft:: - indexing::ChunkIndexerConfig; - - struct ChunkState { - BloomFilterMap bloom_filters; - ChunkStatistics statistics; - HashResolutions hash_resolutions; - std::unordered_map dimension_stats; - std::size_t events_processed = 0; - }; - - BloomVisitor(ChunkIndexerConfig config, - std::vector dimensions); - - void begin(std::size_t num_checkpoints) override; - void on_checkpoint(std::size_t checkpoint_idx) override; - void on_line(std::string_view line, std::size_t checkpoint_idx) override; - void finalize(IndexDatabase& db, int file_id) override; - - std::size_t num_chunks() const { return chunks_.size(); } - - private: - void ensure_chunk(std::size_t checkpoint_idx); - - ChunkIndexerConfig config_; - std::vector dimensions_; - std::vector chunks_; - - std::array yy_buf_{}; - yyjson_alc yy_alc_{}; - bool yy_alc_initialized_ = false; -}; - -} // namespace 
dftracer::utils::utilities::indexer - -#endif // DFTRACER_UTILS_UTILITIES_INDEXER_VISITORS_BLOOM_VISITOR_H diff --git a/include/dftracer/utils/utilities/indexer/visitors/manifest_visitor.h b/include/dftracer/utils/utilities/indexer/visitors/manifest_visitor.h deleted file mode 100644 index 4458e6ce..00000000 --- a/include/dftracer/utils/utilities/indexer/visitors/manifest_visitor.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_VISITORS_MANIFEST_VISITOR_H -#define DFTRACER_UTILS_UTILITIES_INDEXER_VISITORS_MANIFEST_VISITOR_H - -#include - -#include -#include -#include -#include -#include -#include - -namespace dftracer::utils::utilities::indexer { - -class ManifestVisitor : public IndexVisitor { - public: - ManifestVisitor() = default; - - void begin(std::size_t num_checkpoints) override; - void on_checkpoint(std::size_t checkpoint_idx) override; - void on_line(std::string_view line, std::size_t checkpoint_idx) override; - void finalize(IndexDatabase& db, int file_id) override; - - private: - void ensure_chunk(std::size_t checkpoint_idx); - - using EventKey = std::pair; - using LineVec = std::vector; - - std::vector> event_lines_; - std::vector> metadata_lines_; - std::uint32_t chunk_line_ = 0; -}; - -} // namespace dftracer::utils::utilities::indexer - -#endif // DFTRACER_UTILS_UTILITIES_INDEXER_VISITORS_MANIFEST_VISITOR_H diff --git a/include/dftracer/utils/utilities/reader/internal/stream_config.h b/include/dftracer/utils/utilities/reader/internal/stream_config.h index 188e063f..85416302 100644 --- a/include/dftracer/utils/utilities/reader/internal/stream_config.h +++ b/include/dftracer/utils/utilities/reader/internal/stream_config.h @@ -106,6 +106,12 @@ class StreamConfig { end_(end), buffer_size_(buffer_size) {} + bool extend_to_line_boundary() const { return extend_to_line_boundary_; } + StreamConfig& extend_to_line_boundary(bool v) { + extend_to_line_boundary_ = v; + return *this; + } + static constexpr std::size_t 
DEFAULT_BUFFER_SIZE = 4 * 1024 * 1024; // 4MB // ======================================================================== // Fluent API - Basic Setters @@ -226,6 +232,8 @@ class StreamConfig { * Larger buffers improve I/O performance but use more memory. */ std::size_t buffer_size_ = 4 * 1024 * 1024; // 4MB default + + bool extend_to_line_boundary_ = false; }; } // namespace dftracer::utils::utilities::reader::internal diff --git a/include/dftracer/utils/utilities/reader/trace_reader.h b/include/dftracer/utils/utilities/reader/trace_reader.h index ccd2341a..297899db 100644 --- a/include/dftracer/utils/utilities/reader/trace_reader.h +++ b/include/dftracer/utils/utilities/reader/trace_reader.h @@ -2,11 +2,16 @@ #define DFTRACER_UTILS_UTILITIES_READER_TRACE_READER_H #include +#include #include #include +#include #include #include #include +#ifdef DFTRACER_UTILS_ENABLE_ARROW +#include +#endif #include #include @@ -15,17 +20,21 @@ namespace dftracer::utils::utilities::reader { +using common::json::JsonParser; using fileio::lines::Line; +struct JsonLine { + std::string_view content; + std::size_t line_number; + JsonParser* parser; +}; + /// File-level configuration for TraceReader. struct TraceReaderConfig { std::string file_path; ///< Path to trace file (.pfw.gz or plain). std::string index_dir; ///< Directory containing `.dftindex` roots. std::size_t checkpoint_size = 32 * 1024 * 1024; ///< Checkpoint interval. bool auto_build_index = false; ///< Auto-build index if missing. - std::size_t index_threshold = - constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD; ///< Min size for - ///< auto-index. }; /// Per-read configuration for range, buffering, and query filtering. @@ -42,9 +51,30 @@ struct ReadConfig { /// Query DSL string for event filtering (empty = no filter). /// When set and an index exists, chunk pruning skips non-matching - /// chunks. Per-event filtering always applies. + /// chunks. Per-event filtering always applies unless chunk_prune_only + /// is set. 
std::string query; + /// When true, the query is used only for chunk-level pruning via + /// the index. Per-line filtering is skipped (caller handles it). + bool chunk_prune_only = false; + + /// When true, the reader skips its own chunk pruner pass entirely and + /// trusts the caller's start_line/end_line window. Intended for the + /// checkpoint-level work-item dispatcher, which already pruned once + /// per file at enumeration time. Without this the pruner would + /// re-run per work item (hundreds-of-thousands of RocksDB opens). + bool skip_pruning = false; + + bool start_at_checkpoint = false; + bool end_at_checkpoint = false; + + /// When true, top-level object values (e.g. `args`) are expanded one + /// level into `parent.child` columns with native Arrow types instead + /// of being serialized as a JSON string column. One-level only; deeper + /// nesting still round-trips as JSON text under the flattened key. + bool flatten_objects = false; + bool has_line_range() const { return start_line > 0 || end_line > 0; } bool has_byte_range() const { return start_byte > 0 || end_byte > 0; } }; @@ -58,10 +88,25 @@ class TraceReader { /// Read lines with optional query filtering and chunk pruning. coro::AsyncGenerator read_lines(ReadConfig config = {}); + /// Read parsed JSON lines. Parses each line once with simdjson ondemand, + /// applies query filtering, and yields the parsed document. + /// The yielded JsonParser is valid until the next next() call. + coro::AsyncGenerator read_json(ReadConfig config = {}); + /// Read raw byte chunks. coro::AsyncGenerator> read_raw( ReadConfig config = {}); +#ifdef DFTRACER_UTILS_ENABLE_ARROW + /// Direct Arrow batch pipeline: chunk-prune + line-level prefilter + + /// simdjson iterate_many + inline row build. Yields complete Arrow + /// record batches sized at `batch_size` rows. Emits the final + /// partial batch on generator close. Non-normalized schema only + /// (dynamic columns follow the first row seen). 
+ coro::AsyncGenerator read_arrow( + ReadConfig config = {}, std::size_t batch_size = 10000); +#endif + /// True if a `.dftindex` database was found at construction time. bool has_index() const; /// Decompressed size (0 if no index for compressed files). diff --git a/include/dftracer/utils/utilities/replay/replay.h b/include/dftracer/utils/utilities/replay/replay.h index fc2383c7..988dfec6 100644 --- a/include/dftracer/utils/utilities/replay/replay.h +++ b/include/dftracer/utils/utilities/replay/replay.h @@ -2,8 +2,10 @@ #define DFTRACER_UTILS_UTILITIES_REPLAY_REPLAY_H #include -#include -#include +#include +#include +#include +#include #include #include @@ -90,6 +92,13 @@ struct ReplayConfig { // MPI options int mpi_rank = 0; // MPI rank of this process int mpi_size = 1; // Total number of MPI processes + + // Optional observation hook fired in dispatch_trace after apply_timing + // returns and before the executor runs. Used by fidelity tests to + // measure dispatch lateness vs. the trace's wall-clock anchor; left + // unset in production so the per-event branch is the only cost. + std::function + on_dispatch; }; /** @@ -102,8 +111,9 @@ struct ReplayResult { std::size_t failed_events = 0; std::chrono::microseconds total_duration{0}; std::chrono::microseconds execution_duration{0}; - std::unordered_map function_counts; - std::unordered_map category_counts; + // Keys are non-owning views into the replay StringIntern pool. + std::unordered_map function_counts; + std::unordered_map category_counts; std::vector error_messages; // Extended statistics @@ -164,7 +174,10 @@ class PosixExecutor : public TraceExecutor { std::string get_name() const override { return "POSIX"; } private: - std::unordered_map open_files_; + // Keys are interned via the replay StringIntern pool. + std::unordered_map open_files_; + // Scratch buffer reused across reads and writes. 
+ std::vector io_buffer_; bool execute_open(const Trace& trace, const ReplayConfig& config); bool execute_close(const Trace& trace, const ReplayConfig& config); @@ -172,6 +185,9 @@ class PosixExecutor : public TraceExecutor { bool execute_write(const Trace& trace, const ReplayConfig& config); bool execute_seek(const Trace& trace, const ReplayConfig& config); bool execute_stat(const Trace& trace, const ReplayConfig& config); + + // Ensure io_buffer_ has at least `size` bytes; grow with 'A' fill. + void ensure_io_buffer(std::size_t size); }; /** @@ -188,9 +204,6 @@ class DFTracerExecutor : public TraceExecutor { void sleep_for_duration(double duration_microseconds); }; -// Forward declaration -class ReplayLineProcessor; - /** * Main replay engine that coordinates trace reading and execution * @@ -205,8 +218,6 @@ class ReplayLineProcessor; * result.print_summary(); */ class ReplayEngine { - friend class ReplayLineProcessor; - public: /** * Construct replay engine with configuration @@ -255,6 +266,43 @@ class ReplayEngine { ReplayResult replay_with_call_tree(const std::string& trace_dir, const std::string& pattern = "*.pfw.gz"); + /** + * Process a single trace event already loaded into a JsonParser. + * Public so callers driving their own TraceReader::read_json loop can + * feed events in directly without going through replay(file). + */ + bool process_trace_line( + dftracer::utils::utilities::common::json::JsonParser& parser, + ReplayResult& result); + + /** + * Stream parsed Trace events from the given trace files. Drives + * TraceReader::read_json under the hood; each call yields one event. + * Used to plug replay into a producer task inside a Pipeline. + */ + coro::AsyncGenerator stream_traces( + const std::vector& files); + + /** + * Drive a producer/consumer pipeline that decouples read+parse from + * timing+execute. 
The producer fills a bounded channel from + * stream_traces; a single consumer drains it and dispatches events + * (apply_timing → executor->execute). Read latency is hidden behind + * the consumer's per-event work + sleep_for, eliminating the + * dispatch lateness that the sequential path accumulates on large + * gz-compressed traces. + * + * @param scope Parent CoroScope (typically a Pipeline task scope). + * @param files Trace files to replay in order. + * @param result Aggregated counts and per-event stats are written + * here. Must outlive the awaited coroutine. + * @param channel_capacity Max in-flight parsed Traces. Default 4096. + */ + coro::CoroTask run_pipelined(dftracer::utils::CoroScope& scope, + const std::vector& files, + ReplayResult& result, + std::size_t channel_capacity = 4096); + private: ReplayConfig config_; std::vector> executors_; @@ -263,14 +311,18 @@ class ReplayEngine { bool first_timestamp_set_ = false; /** - * Process a single trace line (JSON) + * Update result counts and execute one already-parsed Trace. + * Extracted from process_trace_line so the pipeline consumer and the + * sync per-line path share the same dispatch semantics. */ - bool process_trace_line(const std::string& line, ReplayResult& result); + void dispatch_trace(const Trace& trace, ReplayResult& result); /** - * Parse JSON trace into Trace structure + * Populate a Trace from a parsed JsonParser document. 
*/ - bool parse_trace_json(const std::string& json_line, Trace& trace); + bool parse_trace_json( + dftracer::utils::utilities::common::json::JsonParser& parser, + Trace& trace); /** * Apply timing logic before executing trace @@ -314,21 +366,6 @@ class ReplayEngine { ReplayResult& result); }; -/** - * Line processor for handling trace lines during replay - */ -class ReplayLineProcessor - : public dftracer::utils::utilities::reader::internal::LineProcessor { - public: - explicit ReplayLineProcessor(ReplayEngine& engine, ReplayResult& result); - - coro::CoroTask process(const char* data, std::size_t length) override; - - private: - ReplayEngine& engine_; - ReplayResult& result_; -}; - } // namespace dftracer::utils::utilities::replay #endif // DFTRACER_UTILS_UTILITIES_REPLAY_REPLAY_H diff --git a/include/dftracer/utils/utilities/replay/trace.h b/include/dftracer/utils/utilities/replay/trace.h index c148c4f5..c4c80d7c 100644 --- a/include/dftracer/utils/utilities/replay/trace.h +++ b/include/dftracer/utils/utilities/replay/trace.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -34,11 +35,15 @@ using ViewFields = std::unordered_map; * Contains all information needed for replay operations */ struct Trace { - // Category and function identification - std::string cat; // Category (e.g., "posix", "stdio", "h5py") - std::string io_cat; // I/O category (read, write, metadata) - std::string acc_pat; // Access pattern - std::string func_name; // Function name (e.g., "read", "write", "open") + // Category and function identification. + // Short-lived enum-like strings (cat/func_name) and per-event hashes + // (fhash/hhash) are non-owning views into a process-wide StringIntern + // pool; the pool keeps them alive for the program lifetime so the + // views remain valid past the parser that produced them. 
+ std::string_view cat; // Category (e.g., "posix", "stdio", "h5py") + std::string io_cat; // I/O category (read, write, metadata) + std::string acc_pat; // Access pattern + std::string_view func_name; // Function name (e.g., "read", "write") // Timing information double duration; // Duration in microseconds @@ -53,8 +58,8 @@ struct Trace { std::uint64_t tid; // Thread ID // File identification - std::string fhash; // File hash - std::string hhash; // Host hash + std::string_view fhash; // File hash (interned) + std::string_view hhash; // Host hash (interned) std::uint64_t image_id; // Image ID // Trace type diff --git a/pyproject.toml b/pyproject.toml index 5fb51f6e..a1af280f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,20 @@ dev = [ requires = ["scikit-build-core >=0.10"] build-backend = "scikit_build_core.build" +[tool.uv] +cache-keys = [ + { file = "pyproject.toml" }, + { file = "CMakeLists.txt" }, + { file = "CMakePresets.json" }, + { file = "cmake/**/*.cmake" }, + { file = "cmake/modules/**/*.cmake" }, + { file = "include/**/*.h" }, + { file = "src/**/*.h" }, + { file = "src/**/*.cpp" }, + { file = "python/**/*.py" }, + { file = "python/**/*.pyi" }, +] + [tool.scikit-build] metadata.version.provider = "scikit_build_core.metadata.setuptools_scm" minimum-version = "build-system.requires" diff --git a/python/dftracer/utils/__init__.py b/python/dftracer/utils/__init__.py index 25f9b8ae..c95eb0ad 100644 --- a/python/dftracer/utils/__init__.py +++ b/python/dftracer/utils/__init__.py @@ -1,11 +1,11 @@ from importlib.metadata import PackageNotFoundError, version from typing import Optional +from .arrow import read_arrow, write_arrow # noqa: F401 from .dftracer_utils_ext import ( - JSON, # noqa: F401 - Indexer, # noqa: F401 + CheckpointIndexer, # noqa: F401 IndexerCheckpoint, # noqa: F401 - TraceReader, # noqa: F401 + JsonDictValue, # noqa: F401 ) from .dftracer_utils_ext import ( get_default_runtime as _get_default_native_runtime, @@ -13,8 +13,14 @@ 
from .dftracer_utils_ext import ( set_default_runtime as _set_default_native_runtime, ) +from .indexer import ( # noqa: F401 + AggregationConfig, + Indexer, + IndexStatus, +) from .query import Expr, Field # noqa: F401 from .runtime import Runtime, TaskHandle # noqa: F401 +from .trace_reader import TraceReader # noqa: F401 _default_wrapper: Optional["Runtime"] = None @@ -46,13 +52,19 @@ def set_default_runtime(runtime: Optional["Runtime"]) -> None: __all__ = [ + "AggregationConfig", + "CheckpointIndexer", "Expr", "Field", "Indexer", "IndexerCheckpoint", + "IndexStatus", + "JsonDictValue", "TraceReader", "Runtime", "TaskHandle", "get_default_runtime", + "read_arrow", "set_default_runtime", + "write_arrow", ] diff --git a/python/dftracer/utils/arrow.py b/python/dftracer/utils/arrow.py index e7558e89..37846e33 100644 --- a/python/dftracer/utils/arrow.py +++ b/python/dftracer/utils/arrow.py @@ -1,8 +1,11 @@ -"""Arrow data interchange wrappers for DFTracer. +"""Arrow data interchange and I/O for DFTracer. -Provides ArrowBatch and ArrowTable classes that wrap Arrow C Data Interface -objects (PyCapsules) with convenience methods for conversion to pandas and -polars DataFrames. +Provides: +- ArrowBatch and ArrowTable classes that wrap Arrow C Data Interface + objects (PyCapsules) with convenience methods for conversion to pandas + and polars DataFrames. +- write_arrow() and read_arrow() for Arrow IPC file I/O with Runtime + parallelization. These wrappers are pure Python. The actual Arrow data is produced by the C extension (TraceReader.iter_arrow, utility to_arrow methods). 
Conversion @@ -12,7 +15,36 @@ from __future__ import annotations -from typing import Any, Iterator, Optional, Tuple +import os +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple, Union + +if TYPE_CHECKING: + import pyarrow as pa + import pyarrow.ipc as ipc + + from dftracer.utils.dftracer_utils_ext import ( + read_arrow_files_parallel as _cpp_read_parallel, + ) + +_HAS_PYARROW = False +_HAS_CPP_READER = False + +try: + import pyarrow as pa + import pyarrow.ipc as ipc + + _HAS_PYARROW = True +except ImportError: + pass + +try: + from dftracer.utils.dftracer_utils_ext import ( + read_arrow_files_parallel as _cpp_read_parallel, + ) + + _HAS_CPP_READER = True +except ImportError: + pass class ArrowBatch: @@ -98,32 +130,62 @@ def to_polars(self) -> Any: class ArrowTable: """Wrapper around a collection of Arrow RecordBatches. - Returned by read_arrow() and utility process() methods. Holds - multiple batches with a shared schema. Supports the Arrow PyCapsule - stream protocol (__arrow_c_stream__) for zero-copy interchange. + Returned by read_arrow() and utility process() methods. Supports + the Arrow PyCapsule stream protocol (__arrow_c_stream__) for + zero-copy interchange. - The pyarrow Table is cached on first conversion so subsequent calls - to ``to_pandas()``, ``to_polars()``, or ``__arrow_c_stream__()`` - are safe. + Accepts either a pre-built list of batches or a lazy iterator. + When constructed from an iterator, batches are not materialized + until data access (to_pandas, to_polars, batches, etc.). - Empty results (no events matched) return an ArrowTable with - num_batches=0 and no columns. + ``num_rows`` does not stream: if the batches have not been + materialized yet, it materializes them first (retaining every + batch in memory) and then sums their row counts. Data access + still works afterwards -- use ``iter_arrow`` directly if you need + to process very large datasets without holding all batches.
""" def __init__( self, - batches: list[Any], + batches: Any, schema_capsule: Optional[Any] = None, ) -> None: - self._batches = list(batches) + self._stream: Any = None + if isinstance(batches, list): + self._batches: Optional[list[Any]] = batches + self._iter: Optional[Iterator[Any]] = None + elif hasattr(batches, "__arrow_c_stream__"): + self._batches = None + self._iter = None + self._stream = batches + else: + self._batches = None + self._iter = iter(batches) self._schema_capsule = schema_capsule - self._pa_table: Any = None # cached pyarrow.Table + self._pa_table: Any = None + + def _materialize(self) -> list[Any]: + if self._batches is not None: + return self._batches + if self._stream is not None: + self._to_pa_table() + if self._pa_table is not None: + self._batches = list(self._pa_table.to_batches()) + return self._batches + if self._iter is not None: + self._batches = list(self._iter) + self._iter = None + return self._batches + self._batches = [] + return self._batches def _to_pa_table(self) -> Any: """Convert to pyarrow Table, caching the result. Arrow C Data Interface export is single-use (ownership transfer), - so we cache the pyarrow table on first conversion. + so we cache the pyarrow table on first conversion. After + conversion the batch capsule references are cleared since pyarrow + now owns the underlying buffers. Returns: pyarrow.Table: The converted table. @@ -137,7 +199,13 @@ def _to_pa_table(self) -> Any: import pyarrow as pa except ImportError: raise ImportError("pyarrow is required. 
Install with: pip install pyarrow") from None - pa_batches = [pa.record_batch(b) for b in self._batches] + if self._stream is not None: + self._pa_table = pa.table(self._stream) + self._stream = None + return self._pa_table + batches = self._materialize() + pa_batches = [pa.record_batch(b) for b in batches] + self._batches = None if not pa_batches: schema = pa.schema([]) if self._schema_capsule is not None: @@ -154,25 +222,29 @@ def __arrow_c_stream__(self, requested_schema: Any = None) -> Any: @property def num_batches(self) -> int: """Number of batches.""" - return len(self._batches) + return len(self._materialize()) @property def num_rows(self) -> int: """Total number of rows across all batches.""" - return sum(b.num_rows for b in self._batches) + if self._pa_table is not None: + return self._pa_table.num_rows + return sum(b.num_rows for b in self._materialize()) @property def empty(self) -> bool: """True if there are no batches.""" - return len(self._batches) == 0 + if self._pa_table is not None: + return self._pa_table.num_rows == 0 + return len(self._materialize()) == 0 def batch(self, i: int) -> Any: """Get the i-th batch.""" - return self._batches[i] + return self._materialize()[i] def batches(self) -> Iterator[Any]: """Iterate over batches.""" - return iter(self._batches) + return iter(self._materialize()) def to_pandas(self) -> Any: """Convert all batches to a single pandas DataFrame. @@ -200,6 +272,203 @@ def to_polars(self) -> Any: raise ImportError( "polars is required for to_polars(). 
Install with: pip install polars" ) from None - if not self._batches: + table = self._to_pa_table() + if table.num_rows == 0: return pl.DataFrame() - return pl.concat([pl.from_arrow(self._to_pa_table())]) + return pl.from_arrow(table) + + +def write_arrow( + file_path: str, + output_dir: str, + view: Optional[Union[str, Dict]] = None, + index_dir: str = "", + checkpoint_size: int = 32 * 1024 * 1024, + compression: str = "zstd", + batch_size: int = 10000, + chunks: Optional[List[Dict]] = None, + parallel: bool = True, +) -> Dict: + """Write trace data to Arrow IPC files. + + If chunks is provided, writes those specific chunks. Otherwise, gets + all candidate chunks from the file after bloom filter pruning. + + Args: + file_path: Path to the trace file. + output_dir: Directory for output Arrow IPC files. + view: View definition - string ('io', 'compute', 'dlio') or + dict with 'name' and optional 'query'. + index_dir: Directory for index files. + checkpoint_size: Checkpoint size for indexing. + compression: 'zstd' or 'none'. + batch_size: Events per batch. + chunks: Optional list of specific chunks to write. If None, + gets all candidate chunks from the file. + parallel: If True (default), process chunks in parallel via Runtime. + + Returns: + dict with: + - files: List of written Arrow IPC file paths + - total_chunks: Number of chunks processed + - skipped_chunks: Number of chunks skipped by bloom filter + - total_rows: Total rows written + - total_events_matched: Total events matched + + Example: + >>> from dftracer.utils.arrow import write_arrow + >>> result = write_arrow( + ... "trace.pfw.gz", + ... "/output/io_view", + ... view="io", + ... 
) + >>> print(f"Wrote {len(result['files'])} files") + """ + from dftracer.utils import TraceReader, get_default_runtime + + os.makedirs(output_dir, exist_ok=True) + + reader = TraceReader(file_path, index_dir=index_dir, checkpoint_size=checkpoint_size) + + if chunks is None: + chunks_result = reader.get_view_chunks(view=view) + + if not chunks_result["file_may_match"]: + return { + "files": [], + "total_chunks": 0, + "skipped_chunks": chunks_result["skipped_checkpoints"], + "total_rows": 0, + "total_events_matched": 0, + } + + chunks = chunks_result["chunks"] + skipped_chunks = chunks_result["skipped_checkpoints"] + else: + skipped_chunks = 0 + + if not chunks: + return { + "files": [], + "total_chunks": 0, + "skipped_chunks": skipped_chunks, + "total_rows": 0, + "total_events_matched": 0, + } + + if parallel and len(chunks) > 1: + runtime = get_default_runtime() + + def write_chunk(chunk: Dict) -> Dict: + r = TraceReader(file_path, index_dir=index_dir, checkpoint_size=checkpoint_size) + return r.write_view_chunks( + chunks=[chunk], + output_dir=output_dir, + view=view, + compression=compression, + batch_size=batch_size, + ) + + handles = [ + runtime.submit(write_chunk, chunk, name=f"write:chunk_{i}") + for i, chunk in enumerate(chunks) + ] + batch_results = [h.get() for h in handles] + else: + result = reader.write_view_chunks( + chunks=chunks, + output_dir=output_dir, + view=view, + compression=compression, + batch_size=batch_size, + ) + batch_results = [result] + + files = [] + total_rows = 0 + total_events_matched = 0 + for br in batch_results: + for r in br.get("results", []): + if r.get("rows_written", 0) > 0: + files.append(r["output_file"]) + total_rows += br.get("total_rows", 0) + total_events_matched += br.get("total_events_matched", 0) + + return { + "files": files, + "total_chunks": len(chunks), + "skipped_chunks": skipped_chunks, + "total_rows": total_rows, + "total_events_matched": total_events_matched, + } + + +def read_arrow( + files: List[str], + 
parallel: bool = True, +): + """Read Arrow IPC files and return a combined pyarrow Table. + + Uses pyarrow for reading with optional parallelization via Runtime. + Falls back to C++ reader if pyarrow is not available. + + Args: + files: List of Arrow IPC file paths. + parallel: If True (default), read files in parallel using Runtime. + + Returns: + pyarrow.Table with all data combined, or list of batch objects if + pyarrow is not available. Returns None when ``files`` is empty or no listed file exists with non-zero size. + + Example: + >>> from dftracer.utils.arrow import read_arrow + >>> table = read_arrow(["file1.arrow", "file2.arrow"]) + >>> print(f"Read {table.num_rows} rows") + """ + from dftracer.utils import get_default_runtime + + if not files: + return None + + valid_files = [f for f in files if os.path.exists(f) and os.path.getsize(f) > 0] + if not valid_files: + return None + + if _HAS_PYARROW: + + def read_one(path: str) -> pa.Table: + return ipc.open_file(path).read_all() + + if parallel and len(valid_files) > 1: + runtime = get_default_runtime() + handles = [ + runtime.submit(read_one, f, name=f"read:{os.path.basename(f)}") for f in valid_files + ] + tables = [h.get() for h in handles] + else: + tables = [read_one(f) for f in valid_files] + + if not tables: + return None + + return pa.concat_tables(tables) + + if not _HAS_CPP_READER: + raise ImportError( + "Neither pyarrow nor C++ Arrow IPC reader available. " + "Install pyarrow or build dftracer-utils with Arrow IPC support."
+ ) + + from dftracer.utils import Runtime as PyRuntime + + runtime = get_default_runtime() + native_rt = runtime._native if isinstance(runtime, PyRuntime) else runtime + + result = _cpp_read_parallel(valid_files, runtime=native_rt) + + batches = [] + for fr in result.get("file_results", []): + if fr.get("success"): + batches.extend(fr.get("batches", [])) + + return batches diff --git a/python/dftracer/utils/dask.py b/python/dftracer/utils/dask.py index 36365e20..f48ad108 100644 --- a/python/dftracer/utils/dask.py +++ b/python/dftracer/utils/dask.py @@ -1,32 +1,42 @@ """Dask distributed integration for dftracer-utils.""" -from typing import Any, Optional +import os +from collections import defaultdict +from typing import Any, Dict, List, Optional, Union try: - from dask.distributed import WorkerPlugin + from dask.distributed import Client, WorkerPlugin except ImportError: + Client: Optional[Any] = None WorkerPlugin: Optional[Any] = None -from dftracer.utils import Runtime, get_default_runtime, set_default_runtime +try: + import dask + import dask.dataframe as dd +except ImportError: + dask = None # type: ignore[assignment] # ty: ignore[invalid-assignment] + dd = None # type: ignore[assignment] # ty: ignore[invalid-assignment] + +try: + import pyarrow as pa +except ImportError: + pa = None # type: ignore[assignment] # ty: ignore[invalid-assignment] + +from dftracer.utils import Runtime, TraceReader, get_default_runtime, set_default_runtime +from dftracer.utils.indexer import AggregationConfig, Indexer if WorkerPlugin is not None: class DFTracerUtilsDaskWorkerPlugin(WorkerPlugin): - """Creates a persistent Runtime per Dask worker. 
- - Usage: - client = Client("scheduler:8786") - client.register_plugin( - DFTracerUtilsDaskWorkerPlugin(threads=48) - ) - """ + """Creates a persistent Runtime per Dask worker.""" - def __init__(self, threads=0): + def __init__(self, threads=0, io_threads=0): self.threads = threads + self.io_threads = io_threads def setup(self, worker): worker._dftracer_prev_runtime = get_default_runtime() - rt = Runtime(threads=self.threads) + rt = Runtime(threads=self.threads, io_threads=self.io_threads) worker.dftracer_utils_runtime = rt set_default_runtime(rt) @@ -35,5 +45,1110 @@ def teardown(self, worker): set_default_runtime(worker._dftracer_prev_runtime) del worker._dftracer_prev_runtime if hasattr(worker, "dftracer_utils_runtime"): - worker.dftracer_utils_runtime.shutdown() + # wait=False: don't block on pending tasks during teardown. + # Dask may be tearing down because of timeout/cancel, and a + # stuck task would hang the worker process indefinitely. + try: + worker.dftracer_utils_runtime.shutdown(wait=False) + except Exception: + pass del worker.dftracer_utils_runtime + + +def _write_arrow_task( + file_path: str, + output_dir: str, + view: Optional[Union[str, Dict]], + index_dir: str, + checkpoint_size: int, + compression: str, + batch_size: int, + chunks: List[Dict], +) -> Dict: + """Task function for writing chunks on a Dask worker.""" + from dftracer.utils.arrow import write_arrow + + return write_arrow( + file_path=file_path, + output_dir=output_dir, + view=view, + index_dir=index_dir, + checkpoint_size=checkpoint_size, + compression=compression, + batch_size=batch_size, + chunks=chunks, + parallel=True, + ) + + +def distributed_write_arrow( + file_path: str, + output_dir: str, + view: Optional[Union[str, Dict]] = None, + index_dir: str = "", + checkpoint_size: int = 32 * 1024 * 1024, + compression: str = "zstd", + batch_size: int = 10000, + chunks_per_task: int = 0, +) -> Dict: + """Write trace data to Arrow IPC files using Dask distributed. 
+ + This function: + 1. Gets candidate chunks after bloom filter pruning (coordinator) + 2. Distributes chunk processing to Dask workers + 3. Each worker writes its chunks to Arrow IPC files + 4. Returns paths to all written files for pyarrow reading + + Args: + file_path: Path to the trace file. + output_dir: Directory for output Arrow IPC files. + view: View definition - string ('io', 'compute', 'dlio') or + dict with 'name' and optional 'query'. + index_dir: Directory for index files. + checkpoint_size: Checkpoint size for indexing. + compression: 'zstd' or 'none'. + batch_size: Events per batch. + chunks_per_task: Number of chunks per Dask task. If 0, uses 1 chunk + per task. Higher values batch chunks per worker, processing + them in parallel on the worker's Runtime thread pool. + + Returns: + dict with: + - files: List of written Arrow IPC file paths + - total_chunks: Number of chunks processed + - skipped_chunks: Number of chunks skipped by bloom filter + - total_rows: Total rows written + - total_events_matched: Total events matched + + Example: + >>> import pyarrow.ipc as ipc + >>> import pyarrow as pa + >>> from dftracer.utils.dask import distributed_write_arrow + >>> + >>> result = distributed_write_arrow( + ... "trace.pfw.gz", + ... "/output/io_view", + ... view="io", + ... chunks_per_task=8, # batch 8 chunks per worker + ... 
) + >>> # Read back with pyarrow + >>> tables = [ipc.open_file(f).read_all() for f in result["files"]] + >>> combined = pa.concat_tables(tables) + """ + if dask is None: + raise ImportError("dask is required for distributed_write_arrow") + + os.makedirs(output_dir, exist_ok=True) + + reader = TraceReader(file_path, index_dir=index_dir, checkpoint_size=checkpoint_size) + chunks_result = reader.get_view_chunks(view=view) + + if not chunks_result["file_may_match"]: + return { + "files": [], + "total_chunks": 0, + "skipped_chunks": chunks_result["skipped_checkpoints"], + "total_rows": 0, + "total_events_matched": 0, + } + + chunks = chunks_result["chunks"] + if not chunks: + return { + "files": [], + "total_chunks": 0, + "skipped_chunks": chunks_result["skipped_checkpoints"], + "total_rows": 0, + "total_events_matched": 0, + } + + if chunks_per_task <= 0: + chunks_per_task = 1 + + batches = [chunks[i : i + chunks_per_task] for i in range(0, len(chunks), chunks_per_task)] + + delayed_tasks = [ + dask.delayed(_write_arrow_task)( + file_path, + output_dir, + view, + index_dir, + checkpoint_size, + compression, + batch_size, + batch, + ) + for batch in batches + ] + + batch_results = dask.compute(*delayed_tasks) + + files = [] + total_rows = 0 + total_events_matched = 0 + for br in batch_results: + for r in br.get("results", []): + if r.get("rows_written", 0) > 0: + files.append(r["output_file"]) + total_rows += br.get("total_rows", 0) + total_events_matched += br.get("total_events_matched", 0) + + return { + "files": files, + "total_chunks": len(chunks), + "skipped_chunks": chunks_result["skipped_checkpoints"], + "total_rows": total_rows, + "total_events_matched": total_events_matched, + } + + +def _assign_files_by_pid( + file_pids: Dict[int, set], + n_workers: int, +) -> Dict[int, List[int]]: + """Assign files to workers based on majority PID affinity. 
+ + Files whose smallest PID is the same are assigned to the same worker to + reduce cross-worker aggregation during the merge phase. + + Args: + file_pids: Dict mapping file_id to set of PIDs in that file. + n_workers: Number of workers to distribute to. + + Returns: + Dict mapping worker_id to list of file_ids. + """ + if n_workers <= 0: + n_workers = 1 + + # Assign each file to worker hash(min(pids)) % n_workers; no PID counting is done + worker_assignments: Dict[int, List[int]] = defaultdict(list) + + for file_id, pids in file_pids.items(): + if not pids: + # No PIDs known, round-robin assignment + worker_id = file_id % n_workers + else: + # Use the hash of the smallest PID for deterministic assignment + # Files with the same smallest PID go to the same worker + majority_pid = min(pids) # Use min for determinism + worker_id = hash(majority_pid) % n_workers + worker_assignments[worker_id].append(file_id) + + return dict(worker_assignments) + + +def _aggregate_files_task( + files: List[str], + index_path: str, + time_granularity: float, + time_resolution: float, + data_type: str, +) -> List[bytes]: + """Worker task: read pre-indexed data and return Arrow IPC buffers. + + This runs on a Dask worker. It reads from an already-indexed database + and returns Arrow IPC buffers for the specified files. + + Args: + files: List of trace file paths (used for filtering, not re-indexing). + index_path: Path to the .dftindex store (already built by coordinator). + time_granularity: Output time bucket width in seconds. + time_resolution: Microseconds per output time unit. + data_type: 'events', 'profiles', or 'system'. + + Returns: + List of Arrow IPC buffer bytes.
+ """ + if not files: + return [] + + # Use existing index (read-only) - coordinator already built it + indexer = Indexer( + files=files, + index_dir=os.path.dirname(index_path) if index_path else "", + require_checkpoint=False, # Don't rebuild + require_bloom=False, + require_manifest=False, + require_aggregation=False, # Already aggregated + force_rebuild=False, + ) + + # Collect Arrow batches as IPC buffers + ipc_buffers = [] + for batch_capsule in indexer.iter_arrow_dfanalyzer( + data_type, + time_granularity=time_granularity, + time_resolution=time_resolution, + ): + batch = pa.record_batch(batch_capsule) + # Serialize to IPC buffer for transfer + sink = pa.BufferOutputStream() + writer = pa.ipc.new_stream(sink, batch.schema) + writer.write_batch(batch) + writer.close() + ipc_buffers.append(sink.getvalue().to_pybytes()) + + return ipc_buffers + + +def _aggregate_files_task_all( + files: List[str], + index_path: str, + time_granularity: float, + time_resolution: float, + query: Optional[str] = None, +) -> Dict[str, List[bytes]]: + """Worker task: read all aggregation types and return Arrow IPC buffers. + + This runs on a Dask worker. It reads from an already-indexed database + and returns Arrow IPC buffers for all types in a single scan. + + Args: + files: List of trace file paths (used for filtering, not re-indexing). + index_path: Path to the .dftindex store (already built by coordinator). + time_granularity: Output time bucket width in seconds. + time_resolution: Microseconds per output time unit. + query: Optional query filter string (e.g., "pid == 1234 or pid == 5678"). + + Returns: + Dict with 'events', 'profiles', 'system' keys, each containing + a list of Arrow IPC buffer bytes. 
+ """ + if not files: + return {"events": [], "profiles": [], "system": []} + + indexer = Indexer( + files=files, + index_dir=os.path.dirname(index_path) if index_path else "", + require_checkpoint=False, + require_bloom=False, + require_manifest=False, + require_aggregation=False, + force_rebuild=False, + ) + + all_batches = indexer.iter_arrow_dfanalyzer_all( + time_granularity=time_granularity, + time_resolution=time_resolution, + query=query, + ) + + # Convert to IPC buffers for each type + result = {} + for data_type in ("events", "profiles", "system"): + ipc_buffers = [] + for batch_capsule in all_batches.get(data_type, []): + batch = pa.record_batch(batch_capsule) + sink = pa.BufferOutputStream() + writer = pa.ipc.new_stream(sink, batch.schema) + writer.write_batch(batch) + writer.close() + ipc_buffers.append(sink.getvalue().to_pybytes()) + result[data_type] = ipc_buffers + + return result + + +def _merge_welford(group): + """Merge mean/variance using parallel Welford algorithm. + + Used for re-aggregating overlapping keys across workers. + """ + n_total = group["count"].sum() + if n_total == 0: + return { + "count": 0, + "time": 0.0, + "size": 0, + "time_min": 0.0, + "time_max": 0.0, + "size_min": 0, + "size_max": 0, + } + + # Sum aggregation for count, time, size + result = { + "count": n_total, + "time": group["time"].sum(), + "size": group["size"].sum(), + "time_min": group["time_min"].min(), + "time_max": group["time_max"].max(), + "size_min": group["size_min"].min(), + "size_max": group["size_max"].max(), + } + return result + + +def distributed_aggregate( + directory: str = "", + files: Optional[List[str]] = None, + client: Optional["Client"] = None, + time_interval_ms: float = 5000.0, + time_granularity: float = 1.0, + time_resolution: float = 1e6, + index_dir: str = "", + data_type: str = "events", +) -> "pa.Table": + """Aggregate trace data using Dask distributed workers. + + This function: + 1. 
Indexes all files on coordinator to get PID manifests + 2. Assigns files to workers by PID affinity (minimize cross-worker overlap) + 3. Each worker aggregates its files using iter_arrow_dfanalyzer + 4. Gathers partial Arrow tables from workers + 5. Re-aggregates overlapping keys (same PID/time_range across files) + + Args: + directory: Directory containing trace files (.pfw/.pfw.gz). + files: Explicit list of files (alternative to directory). + client: Dask distributed Client. If None, uses dask.delayed locally. + time_interval_ms: Aggregation time bucket in milliseconds. + time_granularity: Output time bucket width in seconds. + time_resolution: Microseconds per output time unit. + index_dir: Directory for index storage. + data_type: Type of data to aggregate - 'events', 'profiles', or 'system'. + + Returns: + PyArrow Table with aggregated data. + + Example: + >>> from dask.distributed import Client + >>> from dftracer.utils.dask import distributed_aggregate + >>> + >>> client = Client("scheduler:8786") + >>> client.register_plugin(DFTracerUtilsDaskWorkerPlugin(threads=48)) + >>> + >>> table = distributed_aggregate( + ... directory="/traces", + ... client=client, + ... time_interval_ms=5000, + ... 
) + >>> df = table.to_pandas() + """ + if dask is None: + raise ImportError("dask is required for distributed_aggregate") + if pa is None: + raise ImportError("pyarrow is required for distributed_aggregate") + + # Step 1: Index on coordinator + indexer = Indexer( + directory=directory, + files=files, + index_dir=index_dir, + require_checkpoint=True, + require_bloom=True, + require_manifest=True, + require_aggregation=AggregationConfig( + time_interval_ms=time_interval_ms, + compute_percentiles=False, + ), + force_rebuild=False, + ) + status = indexer.ensure_indexed() + + if status.total_files == 0: + return pa.table({}) + + # For local execution (no client), just use iter_arrow_dfanalyzer directly + # This avoids RocksDB locking issues when running in a single process + if client is None: + all_batches = [] + for batch_capsule in indexer.iter_arrow_dfanalyzer( + data_type, + time_granularity=time_granularity, + time_resolution=time_resolution, + ): + all_batches.append(pa.record_batch(batch_capsule)) + + if not all_batches: + return pa.table({}) + + return pa.Table.from_batches(all_batches) + + # Distributed execution: assign files to workers by PID affinity + all_files = status.ready + status.needs_work + file_id_to_path, file_pids = indexer.query_file_info() + index_path = status.index_path + + # Close indexer before distributing (release RocksDB lock) + indexer.close() + + worker_nthreads = client.nthreads() + n_workers = len(worker_nthreads) or 1 + + all_file_ids = set(file_id_to_path.keys()) + full_file_pids = {fid: file_pids.get(fid, set()) for fid in all_file_ids} + worker_file_ids = _assign_files_by_pid(full_file_pids, n_workers) + + worker_files: Dict[int, List[str]] = {} + for worker_id, fids in worker_file_ids.items(): + worker_files[worker_id] = [file_id_to_path[fid] for fid in fids if fid in file_id_to_path] + + futures = [] + worker_list = list(worker_nthreads.keys()) + for worker_id, wfiles in worker_files.items(): + if not wfiles: + continue + 
worker_addr = worker_list[worker_id % len(worker_list)] if worker_list else None + future = client.submit( + _aggregate_files_task, + wfiles, + index_path, + time_granularity, + time_resolution, + data_type, + workers=[worker_addr] if worker_addr else None, + pure=False, + ) + futures.append(future) + + # Gather results + all_ipc_buffers = client.gather(futures) + + # Deserialize IPC buffers and combine + all_batches = [] + for ipc_buffers in all_ipc_buffers: + for buf_bytes in ipc_buffers: + reader = pa.ipc.open_stream(pa.BufferReader(buf_bytes)) + for batch in reader: + all_batches.append(batch) + + if not all_batches: + return pa.table({}) + + combined_table = pa.Table.from_batches(all_batches) + + # Step 6: Re-aggregate overlapping keys using Dask DataFrame + # This handles cases where the same (pid, tid, time_range, func_name) appears + # across multiple files assigned to different workers + if data_type == "system": + # System metrics: group by host_hash, time_range + group_cols = ["host_hash", "time_range"] + agg_dict = { + "sys_cpu_iowait_pct": "mean", + "sys_cpu_user_pct": "mean", + "sys_cpu_system_pct": "mean", + "sys_cpu_idle_pct": "mean", + "sys_core_iowait_pct_max": "max", + "sys_core_iowait_pct_p95": "max", + "sys_mem_dirty": "mean", + "sys_mem_cached": "mean", + "sys_mem_available": "mean", + } + else: + # Events/Profiles: group by all key columns + group_cols = [ + "cat", + "func_name", + "pid", + "tid", + "file_hash", + "host_hash", + "time_range", + ] + agg_dict = { + "count": "sum", + "time": "sum", + "size": "sum", + "time_min": "min", + "time_max": "max", + "size_min": "min", + "size_max": "max", + } + + # Check if re-aggregation is needed (more than one file) + if len(all_files) > 1: + df = combined_table.to_pandas() + + # Preserve non-aggregated columns + first_cols = {} + for col in df.columns: + if col not in group_cols and col not in agg_dict: + first_cols[col] = "first" + + agg_dict.update(first_cols) + + # Group and aggregate + result_df 
= df.groupby(group_cols, as_index=False).agg(agg_dict) + return pa.Table.from_pandas(result_df, preserve_index=False) + + return combined_table + + +def distributed_aggregate_all( + directory: str = "", + files: Optional[List[str]] = None, + client: Optional["Client"] = None, + time_interval_ms: float = 5000.0, + time_granularity: float = 1.0, + time_resolution: float = 1e6, + index_dir: str = "", +) -> Dict[str, "pa.Table"]: + """Aggregate all trace data types in a single scan. + + This is ~3x faster than calling distributed_aggregate separately for + events, profiles, and system because it scans the index only once. + + Args: + directory: Directory containing trace files (.pfw/.pfw.gz). + files: Explicit list of files (alternative to directory). + client: Dask distributed Client. If None, uses local execution. + time_interval_ms: Aggregation time bucket in milliseconds. + time_granularity: Output time bucket width in seconds. + time_resolution: Microseconds per output time unit. + index_dir: Directory for index storage. + + Returns: + Dict with 'events', 'profiles', 'system' keys, each containing a + PyArrow Table with aggregated data. 
+ + Example: + >>> from dftracer.utils.dask import distributed_aggregate_all + >>> tables = distributed_aggregate_all("/traces") + >>> events_df = tables["events"].to_pandas() + >>> profiles_df = tables["profiles"].to_pandas() + """ + if dask is None: + raise ImportError("dask is required for distributed_aggregate_all") + if pa is None: + raise ImportError("pyarrow is required for distributed_aggregate_all") + + # Index on coordinator + indexer = Indexer( + directory=directory, + files=files, + index_dir=index_dir, + require_checkpoint=True, + require_bloom=True, + require_manifest=True, + require_aggregation=AggregationConfig( + time_interval_ms=time_interval_ms, + compute_percentiles=False, + ), + force_rebuild=False, + ) + status = indexer.ensure_indexed() + + if status.total_files == 0: + return {"events": pa.table({}), "profiles": pa.table({}), "system": pa.table({})} + + # Use fused API for local execution + if client is None: + result = indexer.iter_arrow_dfanalyzer_all( + time_granularity=time_granularity, + time_resolution=time_resolution, + ) + + tables = {} + for key in ("events", "profiles", "system"): + batches = [pa.record_batch(cap) for cap in result.get(key, [])] + tables[key] = pa.Table.from_batches(batches) if batches else pa.table({}) + + return tables + + # Distributed execution: assign files to workers by PID affinity + file_id_to_path, file_pids = indexer.query_file_info() + index_path = status.index_path + + # Close indexer before distributing (release RocksDB lock) + indexer.close() + + worker_nthreads = client.nthreads() + n_workers = len(worker_nthreads) or 1 + + all_file_ids = set(file_id_to_path.keys()) + full_file_pids = {fid: file_pids.get(fid, set()) for fid in all_file_ids} + worker_file_ids = _assign_files_by_pid(full_file_pids, n_workers) + + worker_files: Dict[int, List[str]] = {} + worker_pids: Dict[int, set] = {} + for worker_id, fids in worker_file_ids.items(): + worker_files[worker_id] = [file_id_to_path[fid] for fid in fids 
if fid in file_id_to_path] + pids = set() + for fid in fids: + if fid in file_pids: + pids.update(file_pids[fid]) + worker_pids[worker_id] = pids + + futures = [] + worker_list = list(worker_nthreads.keys()) + for worker_id, wfiles in worker_files.items(): + if not wfiles: + continue + # Build query filter for this worker's PIDs + pids = worker_pids.get(worker_id, set()) + query = None + if pids: + pid_conditions = " or ".join(f"pid == {pid}" for pid in sorted(pids)) + query = f"({pid_conditions})" + worker_addr = worker_list[worker_id % len(worker_list)] if worker_list else None + future = client.submit( + _aggregate_files_task_all, + wfiles, + index_path, + time_granularity, + time_resolution, + query, + workers=[worker_addr] if worker_addr else None, + pure=False, + ) + futures.append(future) + + # Gather results (each is a dict with events/profiles/system) + all_results = client.gather(futures) + + # Collect batches by type + batches_by_type: Dict[str, List] = {"events": [], "profiles": [], "system": []} + for result_dict in all_results: + for data_type in ("events", "profiles", "system"): + for buf_bytes in result_dict.get(data_type, []): + reader = pa.ipc.open_stream(pa.BufferReader(buf_bytes)) + for batch in reader: + batches_by_type[data_type].append(batch) + + tables = {} + for data_type in ("events", "profiles", "system"): + batches = batches_by_type[data_type] + if not batches: + tables[data_type] = pa.table({}) + continue + table = pa.Table.from_batches(batches) + # Unify dictionary columns from different workers to plain strings + for i, field in enumerate(table.schema): + if pa.types.is_dictionary(field.type): + table = table.set_column(i, field.name, table.column(i).cast(pa.string())) + tables[data_type] = table + + return tables + + +# --------------------------------------------------------------------------- +# Distributed index build (SST sink path) +# --------------------------------------------------------------------------- + + +def 
_build_sst_task( + files: List[str], + file_ids: List[int], + file_slices: Optional[List[Any]], + local_staging: str, + lustre_staging: str, + batch_id: str, + index_dir: str, + checkpoint_size: int, + bloom_dimensions: Optional[List[str]], + build_manifest: bool, + force_rebuild: bool, + parallelism: int, + flush_every_files: int, + aggregation_config: Optional[Any] = None, + enable_det_ids: bool = False, +) -> tuple: + """Dask worker task: build per-worker SSTs and relocate to shared FS. + + Returns ``(artifact_dicts, tracker_blob)``.""" + import logging as _logging + import socket as _socket + import time as _time + + from .dftracer_utils_ext import build_sst_batch, move_artifacts + + _log = _logging.getLogger("dftracer.utils.dask._build_sst_task") + _host = _socket.gethostname() + + t0 = _time.monotonic() + if enable_det_ids: + from .dftracer_utils_ext import enable_aggregation_deterministic_ids + + enable_aggregation_deterministic_ids() + + artifact_dicts, tracker_blob = build_sst_batch( + files, + file_ids, + local_staging, + batch_id, + index_dir, + checkpoint_size, + build_manifest, + force_rebuild, + bloom_dimensions, + parallelism, + flush_every_files, + None, + aggregation_config, + file_slices, + ) + t_build = _time.monotonic() + + n_moved = 0 + if lustre_staging and lustre_staging != local_staging: + # Keep per-sink subdir to avoid aggregation.sst collisions. 
def _scan_gzip_members_task(paths: List[str]) -> List[List[tuple]]:
    """Worker task: scan gzip member offsets for its file subset.

    Args:
        paths: Trace file paths assigned to this worker.

    Returns:
        The result of ``enumerate_gzip_members(paths, None)``; the caller
        pairs its entries positionally with ``paths``.
    """
    from .dftracer_utils_ext import enumerate_gzip_members

    return enumerate_gzip_members(paths, None)


def distributed_index(
    directory: str = "",
    files: Optional[List[str]] = None,
    index_path: str = "",
    local_staging: str = "",
    lustre_staging: str = "",
    client: Optional["Client"] = None,
    checkpoint_size: int = 32 * 1024 * 1024,
    bloom_dimensions: Optional[List[str]] = None,
    build_manifest: bool = True,
    force_rebuild: bool = False,
    partition: str = "lpt",
    rebuild_root_summaries: bool = True,
    parallelism_per_worker: int = 0,
    flush_every_files: int = 0,
    aggregation_config: Optional[Any] = None,
) -> Dict[str, Any]:
    """Index a set of trace files using Dask workers writing SSTs in parallel.

    Steps:
      1. Enumerate files + sizes (directory scan, or ``os.path.getsize`` on
         the explicit list).
      2. Register all files once on the coordinator's IndexDatabase, which
         yields the file_ids used by every worker.
      3. Scan gzip members, fanned out round-robin across workers.
      4. Plan per-worker work units ("lpt" or "round_robin").
      5. Run one ``_build_sst_task`` per non-empty worker (inline when
         ``client`` is None), writing SSTs to ``local_staging`` and (if
         different) moving them to ``lustre_staging``.
      6. Collect artifact dicts into an SstArtifactRegistry; coordinator
         calls bulk_ingest, optionally rebuild_root_summaries, and writes
         aggregation metadata when aggregation artifacts were produced.

    Args:
        directory: Directory containing trace files.
        files: Explicit file list (alternative to directory).
        index_path: Target .dftindex path (coordinator-writable).
        local_staging: Per-worker SST build dir. If equal to lustre_staging,
            no post-build move.
        lustre_staging: Shared FS dir the coordinator reads SSTs from during
            ingest. Defaults to ``local_staging`` when empty.
        client: Dask distributed Client. None -> run tasks inline.
        checkpoint_size: Forwarded to the SST build task.
        bloom_dimensions: Forwarded to the SST build task.
        build_manifest: Forwarded to ``register_files`` and the build task.
        force_rebuild: Forwarded to the SST build task.
        partition: "lpt" (work units planned by ``plan_work_units``) or
            "round_robin" (whole files, no intra-file slicing).
        rebuild_root_summaries: If True, call ``rebuild_root_summaries``
            after ingest.
        parallelism_per_worker: Forwarded to the SST build task.
        flush_every_files: Forwarded to the SST build task.
        aggregation_config: Optional aggregation settings forwarded to the
            build task; its ``time_interval_ms`` attribute (if any) is
            written to the index after ingest.

    Returns:
        dict with ``total_files``, ``per_worker`` (distinct file count per
        worker), ``index_path`` and ``artifact_batches``.

    Raises:
        ImportError: If dask is not available.
        ValueError: If ``index_path`` or ``local_staging`` is empty, if
            ``partition`` is unknown, or if neither ``directory`` nor
            ``files`` is given.
    """
    if dask is None:
        raise ImportError("dask is required for distributed_index")
    if not index_path:
        raise ValueError("index_path is required")
    if not local_staging:
        raise ValueError("local_staging is required")
    # Fail fast: rejecting a bad partition only at planning time would leave
    # files already registered in the IndexDatabase with nothing ingested.
    if partition not in ("lpt", "round_robin"):
        raise ValueError(f"unknown partition={partition}")
    if not lustre_staging:
        lustre_staging = local_staging

    import logging as _logging
    import time as _time

    from .dftracer_utils_ext import IndexDatabase as _IndexDatabase
    from .dftracer_utils_ext import SstArtifactRegistry as _SstArtifactRegistry
    from .dftracer_utils_ext import enumerate_gzip_members as _enumerate_gzip_members
    from .dftracer_utils_ext import plan_work_units as _plan_work_units
    from .dftracer_utils_ext import scan_files as _scan_files

    _log = _logging.getLogger("dftracer.utils.dask.distributed_index")
    if not _log.handlers:
        _log.setLevel(_logging.INFO)

    # 1. Enumerate files + sizes.
    _t0 = _time.monotonic()
    if files is None:
        if not directory:
            raise ValueError("either directory or files is required")
        _log.info("distributed_index: scan_files(%s)", directory)
        entries = _scan_files(directory, [".pfw", ".pfw.gz"], True, None)
    else:
        _log.info("distributed_index: sizing %d pre-listed files", len(files))
        entries = [(p, os.path.getsize(p)) for p in files]
    _log.info("distributed_index: scanned %d files in %.1fs", len(entries), _time.monotonic() - _t0)

    if not entries:
        # Same keys as the normal return so callers can index unconditionally.
        return {
            "total_files": 0,
            "per_worker": [],
            "index_path": index_path,
            "artifact_batches": 0,
        }

    n_workers = 1
    if client is not None:
        n_workers = len(client.nthreads()) or 1
    _log.info("distributed_index: %d workers visible", n_workers)

    all_paths = [p for (p, _) in entries]

    # 2. Register all files once on coordinator (one register_files call;
    #    file_ids are then parallel to `entries`).
    _t1 = _time.monotonic()
    _log.info("distributed_index: opening IndexDatabase at %s", index_path)
    db = _IndexDatabase(index_path)
    db.init_schema()
    all_file_ids = db.register_files(all_paths, build_manifest)
    _log.info(
        "distributed_index: register_files done (%d files, %.1fs)",
        len(all_paths),
        _time.monotonic() - _t1,
    )

    # 3. SCAN: distribute gzip-member scan across workers (round-robin
    #    per file_idx). Each worker sends back only its 1/N member maps;
    #    coordinator stitches into the full map.
    _t2 = _time.monotonic()
    member_map: List[List[tuple]] = [[] for _ in range(len(entries))]
    if client is None:
        member_map = list(_enumerate_gzip_members(all_paths, None))
    else:
        worker_addrs = list(client.nthreads().keys())
        scan_buckets: List[List[int]] = [[] for _ in range(n_workers)]
        for i in range(len(all_paths)):
            scan_buckets[i % n_workers].append(i)
        scan_futs = []
        scan_idx_lists: List[List[int]] = []
        for w, idxs in enumerate(scan_buckets):
            if not idxs:
                continue
            sub_paths = [all_paths[i] for i in idxs]
            target = [worker_addrs[w % len(worker_addrs)]] if worker_addrs else None
            scan_idx_lists.append(idxs)
            scan_futs.append(
                client.submit(_scan_gzip_members_task, sub_paths, workers=target, pure=False)
            )
        scan_results = client.gather(scan_futs)
        for idxs, res in zip(scan_idx_lists, scan_results):
            for i, members in zip(idxs, res):
                member_map[i] = list(members)
    _log.info(
        "distributed_index: gzip-member scan done in %.1fs",
        _time.monotonic() - _t2,
    )

    # 4. PLAN: deterministic LPT of work units across workers (mirrors MPI).
    _t3 = _time.monotonic()
    if partition == "lpt":
        per_worker_units = _plan_work_units(member_map, n_workers, 0)
    else:
        # "round_robin" (validated above): whole-file fallback, no
        # intra-file slicing.
        per_worker_units = [[] for _ in range(n_workers)]
        for i, mv in enumerate(member_map):
            mlen = max(1, len(mv))
            per_worker_units[i % n_workers].append((i, 0, mlen, 0))
    _log.info(
        "distributed_index: planned in %.2fs (per-worker units=%s)",
        _time.monotonic() - _t3,
        [len(u) for u in per_worker_units],
    )

    # 5. BUILD: each worker receives its (paths, file_ids, file_slices)
    #    parallel lists. A file split across workers appears once per slice.
    index_dir = os.path.dirname(index_path.rstrip("/"))
    os.makedirs(local_staging, exist_ok=True)
    os.makedirs(lustre_staging, exist_ok=True)

    worker_file_lists: List[List[str]] = []
    worker_file_ids: List[List[int]] = []
    worker_slices: List[List[Any]] = []
    CKPT_STRIDE = 1 << 20
    for units in per_worker_units:
        paths_w: List[str] = []
        ids_w: List[int] = []
        slices_w: List[Any] = []
        for file_idx, mb, me, _csz in units:
            paths_w.append(all_paths[file_idx])
            ids_w.append(int(all_file_ids[file_idx]))
            members = member_map[file_idx] or [(0, 0)]
            # Clamp [mb, me) into the actual member vector. plan_work_units
            # may have synthesised a single (0, 0) for a non-gzip file; in
            # that case mb=0, me=1 and the slice is "whole file".
            if me > len(members):
                me = len(members)
            if mb > me:
                mb = me
            slices_w.append(
                (
                    int(mb),
                    int(me),
                    int(mb) * CKPT_STRIDE,
                    bool(mb != 0),
                    [(int(mo), int(ms)) for (mo, ms) in members],
                )
            )
        worker_file_lists.append(paths_w)
        worker_file_ids.append(ids_w)
        worker_slices.append(slices_w)

    _t_build = _time.monotonic()
    worker_ids: List[int] = []
    # Each entry is (artifact_dicts, tracker_blob) returned by _build_sst_task.
    worker_results: List[Any] = []
    if client is None:
        for w, (paths_w, ids_w, slices_w) in enumerate(
            zip(worker_file_lists, worker_file_ids, worker_slices)
        ):
            if not paths_w:
                continue
            worker_ids.append(w)
            worker_results.append(
                _build_sst_task(
                    paths_w,
                    ids_w,
                    slices_w,
                    local_staging,
                    lustre_staging,
                    f"worker_{w}",
                    index_dir,
                    checkpoint_size,
                    bloom_dimensions,
                    build_manifest,
                    force_rebuild,
                    parallelism_per_worker,
                    flush_every_files,
                    aggregation_config,
                    False,
                )
            )
    else:
        worker_addrs = list(client.nthreads().keys())
        futures = []
        for w, (paths_w, ids_w, slices_w) in enumerate(
            zip(worker_file_lists, worker_file_ids, worker_slices)
        ):
            if not paths_w:
                continue
            target = [worker_addrs[w % len(worker_addrs)]] if worker_addrs else None
            worker_ids.append(w)
            futures.append(
                client.submit(
                    _build_sst_task,
                    paths_w,
                    ids_w,
                    slices_w,
                    local_staging,
                    lustre_staging,
                    f"worker_{w}",
                    index_dir,
                    checkpoint_size,
                    bloom_dimensions,
                    build_manifest,
                    force_rebuild,
                    parallelism_per_worker,
                    flush_every_files,
                    aggregation_config,
                    True,
                    workers=target,
                    pure=False,
                )
            )
        worker_results = client.gather(futures)
    _log.info(
        "distributed_index: build dispatch+gather done in %.1fs (%d workers)",
        _time.monotonic() - _t_build,
        len(worker_ids),
    )

    # 6. Bulk-ingest on coordinator (all CFs).
    _t_collect = _time.monotonic()
    registry = _SstArtifactRegistry()
    total_artifacts = 0
    tracker_blobs: List[bytes] = []
    has_aggregation = False
    for wres in worker_results:
        # Accept both the (artifact_dicts, tracker_blob) pair and a bare
        # list of artifact dicts.
        if isinstance(wres, tuple) and len(wres) == 2:
            dicts, tracker_blob = wres
        else:
            dicts, tracker_blob = wres, b""
        if tracker_blob:
            tracker_blobs.append(tracker_blob)
        for d in dicts:
            registry.append(d)
            total_artifacts += 1
            if isinstance(d, dict) and (d.get("aggregation_sst") or d.get("system_metrics_sst")):
                has_aggregation = True
    _log.info(
        "distributed_index: collected %d artifacts in %.2fs",
        total_artifacts,
        _time.monotonic() - _t_collect,
    )

    _t_ingest = _time.monotonic()
    db.bulk_ingest(registry)
    _log.info(
        "distributed_index: bulk_ingest done in %.1fs (%d artifacts)",
        _time.monotonic() - _t_ingest,
        total_artifacts,
    )
    if rebuild_root_summaries:
        _t_root = _time.monotonic()
        db.rebuild_root_summaries()
        _log.info(
            "distributed_index: rebuild_root_summaries done in %.1fs",
            _time.monotonic() - _t_root,
        )

    if aggregation_config is not None and has_aggregation:
        _t_meta = _time.monotonic()
        time_interval_ms = getattr(aggregation_config, "time_interval_ms", 0) or 0
        time_interval_us = int(round(time_interval_ms * 1000.0))
        db.write_agg_global_config(time_interval_us=time_interval_us)
        if all_file_ids:
            db.write_agg_file_markers(list(all_file_ids))
        if tracker_blobs:
            db.write_aggregation_tracker(tracker_blobs)
        _log.info(
            "distributed_index: agg meta writes done in %.2fs (markers=%d, trackers=%d)",
            _time.monotonic() - _t_meta,
            len(all_file_ids),
            len(tracker_blobs),
        )

    # A file sliced across several units on one worker counts once.
    per_worker_file_counts = [len(set(ids)) for ids in worker_file_ids]
    return {
        "total_files": len(entries),
        "per_worker": per_worker_file_counts,
        "index_path": index_path,
        "artifact_batches": total_artifacts,
    }
100644 --- a/python/dftracer/utils/dftracer_utils_ext.pyi +++ b/python/dftracer/utils/dftracer_utils_ext.pyi @@ -1,7 +1,44 @@ """Type stubs for dftracer_utils_ext module.""" from types import TracebackType -from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union +from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union + +from .arrow import ArrowTable + +class _ArrowBatchCapsule: + """Internal Arrow batch wrapper implementing __arrow_c_array__ protocol.""" + + @property + def num_rows(self) -> int: ... + @property + def num_columns(self) -> int: ... + def __arrow_c_array__(self, requested_schema: Any = None) -> Tuple[Any, Any]: ... + +class _ArrowBatchStream: + """Zero-iteration Arrow stream backed by the C++ coroutine channel. + + Implements the Arrow C Data Interface stream protocol. Pass directly + to ``pyarrow.RecordBatchReader.from_stream()`` or ``pyarrow.table()``. + Single-use: consuming ``__arrow_c_stream__`` once exhausts the object. + """ + + def __arrow_c_stream__(self, requested_schema: Any = None) -> Any: ... + +class JsonDictValue: + """Zero-copy wrapper over a parsed DFTracer JSON event. + + Supports dict-like access: ``event['name']``, ``event['args']['ret']``. + Call ``.to_dict()`` to materialize a regular Python dict. + """ + + def __getitem__(self, key: str) -> Any: ... + def __len__(self) -> int: ... + def __contains__(self, key: str) -> bool: ... + def keys(self) -> List[str]: ... + def values(self) -> List[Any]: ... + def items(self) -> List[Tuple[str, Any]]: ... + def get(self, key: str, default: Any = None) -> Any: ... + def to_dict(self) -> Dict[str, Any]: ... 
# ========== INDEXER ========== @@ -17,7 +54,185 @@ class IndexerCheckpoint: num_lines: int class Indexer: - """Indexer for creating and managing root-local ``.dftindex`` stores.""" + """Indexer with resolve/build pattern for tiered indexing.""" + + def __init__( + self, + directory: str = "", + files: Optional[List[str]] = None, + index_dir: str = "", + require_checkpoint: bool = True, + require_bloom: bool = True, + require_manifest: bool = True, + require_aggregation: bool = False, + time_interval_ms: float = 5000.0, + group_keys: Optional[List[str]] = None, + custom_metric_fields: Optional[List[str]] = None, + compute_percentiles: bool = False, + checkpoint_size: int = 32 * 1024 * 1024, + parallelism: int = 0, + force_rebuild: bool = False, + runtime: Optional["Runtime"] = None, + ) -> None: + """Create an indexer for trace files. + + At least one of 'directory' or 'files' must be provided. + + Args: + directory: Path to the directory containing trace files. + files: List of specific file paths to index. + index_dir: Directory for `.dftindex` stores. If empty, uses + directory-local paths. + require_checkpoint: If True, build checkpoint index (tier 1). + require_bloom: If True, build bloom filter data (tier 2). + require_manifest: If True, build manifest data (tier 2). + require_aggregation: If True, build aggregation data (tier 3). + time_interval_ms: Time interval for aggregation in milliseconds. + group_keys: Keys to group by for aggregation. + custom_metric_fields: Custom metric fields for aggregation. + compute_percentiles: If True, compute percentiles during aggregation. + parallelism: Number of parallel indexers. 0 = auto. + force_rebuild: If True, rebuild indices even if they exist. + runtime: Runtime instance for thread pool control. + """ + ... + + def resolve(self) -> Dict[str, Any]: + """Resolve which files need indexing. + + Returns: + Dictionary with 'ready' and 'needs_work' file lists. + """ + ... 
+ + def build(self) -> Dict[str, Any]: + """Build indices for files that need work. + + Returns: + Dictionary with build status and statistics. + """ + ... + + def ensure_indexed(self) -> Dict[str, Any]: + """Ensure all files are indexed by calling resolve then build if needed. + + Returns: + Dictionary with 'ready' and 'needs_work' file lists after indexing. + """ + ... + + def get_checkpoint_indexer(self, file_path: str) -> "CheckpointIndexer": + """Get a checkpoint indexer for a specific file. + + Args: + file_path: Path to the trace file (.pfw/.pfw.gz). + + Returns: + CheckpointIndexer instance for checkpoint-level operations. + """ + ... + + def get_hash_table(self, hash_type: str) -> Dict[str, str]: + """Get hash table mapping hash values to original strings. + + Args: + hash_type: Type of hash table ('file', 'host', or 'string'). + + Returns: + Dict mapping hash strings to original values. + + Raises: + ValueError: If hash_type is not valid. + """ + ... + + def query_file_pids(self, file_id: int) -> set: + """Query PIDs observed in a specific file. + + Args: + file_id: File identifier (0-based index). + + Returns: + Set of PIDs (int) observed in the file. + """ + ... + + def query_all_file_pids(self) -> Dict[int, set]: + """Query all file-to-PIDs mappings. + + Returns: + Dict mapping file_id to set of PIDs observed in that file. + """ + ... + + def query_file_info(self) -> Tuple[Dict[int, str], Dict[int, set]]: + """Query file ID to path mapping and per-file PIDs in one call. + + Returns: + Tuple of (file_id_to_path, file_pids). + """ + ... + + def iter_aggregation(self, type: str = "events", batch_size: int = 10000) -> Iterator[Any]: + """Iterate over aggregation data as Arrow batches. + + Args: + type: 'events', 'profiles', or 'system' + batch_size: Number of entries per batch (default 10000) + + Returns: + Iterator over Arrow batch capsules. + """ + ... 
+ + def iter_arrow_dfanalyzer( + self, + type: str = "events", + batch_size: int = 10000, + time_granularity: float = 1.0, + time_resolution: float = 1e6, + query: Optional[str] = None, + ) -> Iterator[Any]: + """Iterate over aggregation data as dfanalyzer-compatible Arrow batches. + + Args: + type: 'events', 'profiles', or 'system' + batch_size: Number of entries per batch (default 10000) + time_granularity: Bucket width in seconds (default 1.0) + time_resolution: Microseconds per output time unit (default 1e6) + query: Optional query filter (e.g., "pid == 1234 or pid == 5678") + + Returns: + Iterator over Arrow batch capsules with dfanalyzer schema. + """ + ... + + def iter_arrow_dfanalyzer_all( + self, + batch_size: int = 10000, + time_granularity: float = 1.0, + time_resolution: float = 1e6, + query: Optional[str] = None, + group_by: Optional[List[str]] = None, + ) -> Dict[str, List[Any]]: + """Iterate over all aggregation types in a single scan. + + Args: + batch_size: Number of entries per batch (default 10000) + time_granularity: Bucket width in seconds (default 1.0) + time_resolution: Microseconds per output time unit (default 1e6) + query: Optional query filter (e.g., "pid == 1234 or pid == 5678") + group_by: Optional list of columns to group by for coarse in-scan + aggregation. When provided, output schema is reduced to the + requested group columns plus aggregated metrics. + + Returns: + Dict with 'events', 'profiles', 'system' keys containing Arrow batches. + """ + ... + +class CheckpointIndexer: + """Checkpoint indexer for single-file checkpoint-level operations.""" def __init__( self, @@ -27,10 +242,9 @@ class Indexer: force_rebuild: bool = False, build_bloom: bool = False, build_manifest: bool = False, - index_threshold: int = 8388608, runtime: Optional["Runtime"] = None, ) -> None: - """Create an indexer for a gzip file. + """Create a checkpoint indexer for a gzip file. Args: gz_path: Path to the gzip trace file. 
@@ -40,9 +254,6 @@ class Indexer: force_rebuild: If True, rebuild the index even if it exists. build_bloom: If True, build bloom filter data in the index. build_manifest: If True, build manifest data in the index. - index_threshold: Skip bloom/manifest for files smaller than this - (bytes). Set to 0 to disable the threshold and force - indexing regardless of file size. runtime: Runtime instance for thread pool control. If None, uses the default global Runtime. """ @@ -109,7 +320,7 @@ class Indexer: """Whether manifest data exists in the `.dftindex` store.""" ... - def __enter__(self) -> "Indexer": + def __enter__(self) -> "CheckpointIndexer": """Enter the runtime context for the with statement.""" ... @@ -126,140 +337,6 @@ class Indexer: """ ... -# ========== JSON ========== - -# Type aliases for JSON values -_JSONPrimitive = Union[str, int, float, bool, None] - -class JSON: - """Lazy JSON object that parses on demand using yyjson. - - This implementation provides lazy nested navigation for memory efficiency: - - Nested objects/arrays return JSON wrappers (lazy, no conversion) - - Primitives (str, int, float, bool, None) are converted immediately - - Example: - json_obj = JSON('{"args": {"hhash": "abc"}, "pid": 42}') - args = json_obj["args"] # Returns JSON wrapper (lazy, ~48 bytes) - hhash = args["hhash"] # Returns str (converted) - pid = json_obj["pid"] # Returns int (converted) - """ - - def __init__(self, json_str: str) -> None: - """Create a JSON object from a JSON string. - - The JSON string is stored but not parsed until first access. - """ - ... - - def __contains__(self, key: str) -> bool: - """Check if key exists in JSON object.""" - ... - - def __getitem__(self, key: str) -> Union[_JSONPrimitive, "JSON"]: - """Get value by key, raises KeyError if not found. 
- - Returns: - - JSON wrapper for nested objects/arrays (lazy evaluation) - - Primitive Python types for values (str, int, float, bool, None) - - Example: - obj["nested_object"] # Returns JSON (lazy wrapper) - obj["string_field"] # Returns str - obj["number_field"] # Returns int or float - """ - ... - - def get( - self, - key: str, - default: Union[_JSONPrimitive, "JSON"] = None, - ) -> Union[_JSONPrimitive, "JSON"]: - """Get value by key with optional default. - - Returns: - - JSON wrapper for nested objects/arrays (lazy evaluation) - - Primitive Python types for values - - default if key not found - """ - ... - - def keys(self) -> List[str]: - """Get all keys from JSON object (only for object types).""" - ... - - def values(self) -> List[Union[_JSONPrimitive, "JSON"]]: - """Get all values from JSON object (only for object types). - - Returns: - - List of values, where nested objects/arrays are JSON wrappers (lazy) - - Primitives are converted to Python types - """ - ... - - def items(self) -> List[Tuple[str, Union[_JSONPrimitive, "JSON"]]]: - """Get all key-value pairs from JSON object (only for object types). - - Returns: - - List of (key, value) tuples - - Nested objects/arrays are JSON wrappers (lazy) - - Primitives are converted to Python types - """ - ... - - def __len__(self) -> int: - """Return the number of key-value pairs in the JSON object. - - Returns 0 if the root is not an object. - """ - ... - - def __bool__(self) -> bool: - """Return True if the JSON object is non-empty, False otherwise. - - Returns: - - True if object has at least one key-value pair - - False if object is empty or root is not an object - """ - ... - - def unwrap(self) -> Union[Dict[str, Any], List[Any], _JSONPrimitive]: - """Unwrap the lazy JSON object into native Python dict/list. 
- - Unlike lazy access via obj[key], this method fully converts the entire - JSON structure to native Python objects: - - JSON objects -> Python dicts - - JSON arrays -> Python lists - - Primitives -> Python types (str, int, float, bool, None) - - Returns: - Fully converted Python object (dict, list, or primitive) - """ - ... - - def copy(self) -> "JSON": - """Return a shallow copy of the JSON object. - - For subtree wrappers: Creates a new wrapper pointing to the same subtree - For top-level objects: Creates a new JSON object from the same data - - Returns: - New JSON object - """ - ... - - def __str__(self) -> str: - """Return the JSON string representation. - - For top-level objects: returns original JSON string - For subtrees: serializes the subtree to JSON - """ - ... - - def __repr__(self) -> str: - """Return string representation of the object.""" - ... - # ========== TASK HANDLE ========== class TaskHandle: @@ -296,7 +373,7 @@ class Runtime: which adds submit(), Python callable support, and error handling. """ - def __init__(self, threads: int = 0) -> None: ... + def __init__(self, threads: int = 0, io_threads: int = 0) -> None: ... def shutdown(self) -> None: ... def wait_all(self) -> None: ... def get_progress(self) -> Dict[str, Any]: ... @@ -305,6 +382,8 @@ class Runtime: def set_default_task_timeout(self, ms: int = 0) -> None: ... @property def threads(self) -> int: ... + @property + def io_threads(self) -> int: ... def __enter__(self) -> "Runtime": ... def __exit__( self, @@ -323,26 +402,25 @@ class TraceReader: def __init__( self, - file_path: str, + path: str, index_dir: str = "", checkpoint_size: int = 33554432, auto_build_index: bool = False, - index_threshold: int = 8388608, - runtime: Optional[Runtime] = None, + runtime: Optional[Union[Runtime, object]] = None, ) -> None: """Create a TraceReader. Args: - file_path: Path to the trace file (.pfw.gz or plain text). + path: Path to a trace file (.pfw/.pfw.gz) or a directory. 
+ When a directory is given, all iter/read methods discover + .pfw and .pfw.gz files recursively and process them in + parallel on the Runtime thread pool. index_dir: Directory to search for ``.dftindex`` stores. Empty string (default) searches next to the trace file. checkpoint_size: Checkpoint interval in bytes for index building (default 32 MB). auto_build_index: If True, automatically build an index - when none exists and the file exceeds *index_threshold*. - index_threshold: Minimum file size in bytes before - auto-indexing is triggered (default 8 MB). Set to 0 - to disable the threshold and always build an index. + when none exists. runtime: Runtime instance for thread pool control. If None, uses the default global Runtime. @@ -359,7 +437,7 @@ class TraceReader: end_byte: int = 0, buffer_size: int = 4194304, query: Optional[str] = None, - ) -> List[str]: + ) -> List[memoryview]: """Read lines from the trace file and return as a list. Lines are 1-indexed. Pass ``start_line=0, end_line=0`` (the @@ -376,7 +454,8 @@ class TraceReader: end_byte: int = 0, buffer_size: int = 4194304, query: Optional[str] = None, - ) -> Iterator[str]: + memory_budget: int = 0, + ) -> Iterator[memoryview]: """Return a streaming iterator over decoded lines. The C++ coroutine runs on the Runtime thread pool and pushes @@ -384,70 +463,74 @@ class TraceReader: """ ... - def iter_raw( + def iter_json( self, start_line: int = 0, end_line: int = 0, start_byte: int = 0, end_byte: int = 0, - line_aligned: bool = True, - multi_line: bool = True, buffer_size: int = 4194304, query: Optional[str] = None, - ) -> Iterator[bytes]: - """Return a streaming iterator over raw byte chunks. + batch_size: int = 1024, + memory_budget: int = 0, + ) -> Iterator["JsonDictValue"]: + """Return a streaming iterator over parsed JSON events. - When ``query`` is set and an index exists, chunk-level pruning - skips non-matching chunks. No per-event filtering is applied. 
+ Each event is parsed once in C++ and yielded as a zero-copy + :class:`JsonDictValue` wrapper. No double-parsing overhead. """ ... - def read_raw( + def read_json( self, start_line: int = 0, end_line: int = 0, start_byte: int = 0, end_byte: int = 0, - line_aligned: bool = True, - multi_line: bool = True, buffer_size: int = 4194304, query: Optional[str] = None, - ) -> List[bytes]: - """Read raw byte chunks and return as a list. + batch_size: int = 1024, + ) -> List["JsonDictValue"]: + """Read all events as parsed :class:`JsonDictValue` wrappers (list). - When ``query`` is set and an index exists, chunk-level pruning - skips non-matching chunks. No per-event filtering is applied. + Equivalent to ``list(iter_json(...))``. """ ... - def iter_lines_json( + def iter_raw( self, start_line: int = 0, end_line: int = 0, start_byte: int = 0, end_byte: int = 0, + line_aligned: bool = True, + multi_line: bool = True, buffer_size: int = 4194304, query: Optional[str] = None, - ) -> Iterator["JSON"]: - """Return iterator over parsed JSON objects. + memory_budget: int = 0, + ) -> Iterator[memoryview]: + """Return a streaming iterator over raw byte chunks. - Skips non-JSON lines (array delimiters like ``[`` and ``]``). - Each yielded item is a lazy :class:`JSON` object. + When ``query`` is set and an index exists, chunk-level pruning + skips non-matching chunks. No per-event filtering is applied. """ ... - def read_lines_json( + def read_raw( self, start_line: int = 0, end_line: int = 0, start_byte: int = 0, end_byte: int = 0, + line_aligned: bool = True, + multi_line: bool = True, buffer_size: int = 4194304, query: Optional[str] = None, - ) -> List["JSON"]: - """Read lines and return as list of parsed JSON objects. + ) -> List[memoryview]: + """Read raw byte chunks and return as a list. - Equivalent to ``list(self.iter_lines_json(...))``. + When ``query`` is set and an index exists, chunk-level pruning + skips non-matching chunks. No per-event filtering is applied. """ ... 
@@ -460,21 +543,42 @@ class TraceReader: end_byte: int = 0, buffer_size: int = 4194304, query: Optional[str] = None, - ) -> Iterator[Any]: + flatten_objects: bool = False, + normalize: bool = False, + memory_budget: int = 0, + ) -> Iterator["_ArrowBatchCapsule"]: """Return iterator over Arrow record batches. Each batch is an ``_ArrowBatchCapsule`` implementing the Arrow PyCapsule protocol (``__arrow_c_array__``). Wrap with :class:`~dftracer.utils.arrow.ArrowBatch` for convenience methods, or pass directly to ``pyarrow.record_batch()``. + """ + ... - Args: - batch_size (int): Maximum rows per Arrow batch. - start_line (int): First line (0 = beginning). - end_line (int): Last line (0 = end of file). - start_byte (int): First byte offset (0 = beginning). - end_byte (int): Last byte offset (0 = end of file). - buffer_size (int): Internal read buffer size in bytes. + def iter_arrow_stream( + self, + batch_size: int = 10000, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: Optional[str] = None, + flatten_objects: bool = False, + normalize: bool = False, + memory_budget: int = 0, + ) -> "_ArrowBatchStream": + """Return an Arrow C Data Interface stream over record batches. + + PyArrow can drain the producer channel in a single C-side call: + + rbr = pa.RecordBatchReader.from_stream(reader.iter_arrow_stream()) + for batch in rbr: + ... + + Equivalent data to :meth:`iter_arrow`, but without per-batch + Python ↔ C transitions. """ ... @@ -487,19 +591,13 @@ class TraceReader: end_byte: int = 0, buffer_size: int = 4194304, query: Optional[str] = None, - ) -> Any: + flatten_objects: bool = False, + normalize: bool = False, + ) -> "ArrowTable": """Read all events as an ArrowTable. Equivalent to collecting all batches from :meth:`iter_arrow` into an :class:`~dftracer.utils.arrow.ArrowTable`. - - Args: - batch_size (int): Maximum rows per Arrow batch. - start_line (int): First line (0 = beginning). 
- end_line (int): Last line (0 = end of file). - start_byte (int): First byte offset (0 = beginning). - end_byte (int): Last byte offset (0 = end of file). - buffer_size (int): Internal read buffer size in bytes. """ ... @@ -521,8 +619,8 @@ class TraceReader: ... @property - def file_path(self) -> str: - """Path to the trace file.""" + def path(self) -> str: + """Path to the trace file or directory.""" ... @property @@ -540,6 +638,95 @@ class TraceReader: """Total line count (reads all lines to compute if needed).""" ... + def write_arrow( + self, + path: str, + views: Optional[List[Union[str, Dict[str, Any]]]] = None, + chunk_size_mb: int = 32, + compression: str = "zstd", + batch_size: int = 10000, + ) -> Dict[str, Any]: + """Write trace data to Arrow IPC files with optional view-based partitioning. + + Args: + path: Output directory for Arrow IPC files. + views: List of view definitions. Each can be: + - A string: predefined view name ('io', 'compute', 'dlio') + - A dict with 'name' and optional 'query', 'include_metadata' + If None, writes all events to 'all' partition. + chunk_size_mb: Maximum uncompressed size per file in MB. + compression: 'zstd' or 'none'. + batch_size: Events per Arrow batch. + + Returns: + Dict with partitions, total_rows, total_bytes, chunks_scanned, chunks_skipped. + """ + ... + + def get_view_chunks( + self, + view: Optional[Union[str, Dict[str, Any]]] = None, + ) -> Dict[str, Any]: + """Get candidate chunks for a view after bloom filter pruning. + + Args: + view: View definition (string or dict with 'name' and optional 'query'). + + Returns: + Dict with chunks list, total_checkpoints, skipped_checkpoints, file_may_match. + """ + ... + + def write_view_chunk( + self, + output_file: str, + checkpoint_idx: int, + start_byte: int, + end_byte: int, + view: Optional[Union[str, Dict[str, Any]]] = None, + compression: str = "zstd", + batch_size: int = 10000, + ) -> Dict[str, Any]: + """Write a single chunk to an Arrow IPC file. 
+ + Args: + output_file: Path to output Arrow IPC file. + checkpoint_idx: Checkpoint index. + start_byte: Start byte offset. + end_byte: End byte offset. + view: View definition. + compression: 'zstd' or 'none'. + batch_size: Events per batch. + + Returns: + Dict with output_file, events_matched, rows_written, bytes_written. + """ + ... + + def write_view_chunks( + self, + chunks: List[Dict[str, Any]], + output_dir: str, + view: Optional[Union[str, Dict[str, Any]]] = None, + compression: str = "zstd", + batch_size: int = 10000, + ) -> Dict[str, Any]: + """Write multiple chunks to Arrow IPC files in parallel. + + All chunks are processed concurrently on the Runtime thread pool. + + Args: + chunks: List of dicts with checkpoint_idx, start_byte, end_byte. + output_dir: Directory for output Arrow IPC files. + view: View definition. + compression: 'zstd' or 'none'. + batch_size: Events per batch. + + Returns: + Dict with results list, total_rows, total_events_matched. + """ + ... + def __enter__(self) -> "TraceReader": """Enter the runtime context for the with statement.""" ... @@ -733,3 +920,205 @@ class ComparatorUtility: force_rebuild: bool = False, config: str = "", ) -> str: ... + +# ========== ARROW PARALLEL READER ========== + +def read_arrow_files_parallel( + paths: List[str], + runtime: Optional[Runtime] = None, +) -> Dict[str, Any]: + """Read multiple Arrow IPC files in parallel using the Runtime. + + Args: + paths: List of file paths to read. + runtime: Optional Runtime object. Uses default if not provided. 
+ + Returns: + dict with: + - file_results: List of per-file results, each with: + - path: File path + - success: True if read succeeded + - error: Error message if failed, else None + - total_rows: Number of rows in file + - batches: List of ArrowBatch objects + - total_rows: Total rows across all files + - total_batches: Total batches across all files + - files_read: Number of files read successfully + - files_failed: Number of files that failed + """ + ... + +# ========== DISTRIBUTED INDEX (SST-based) ========== + +class IndexDatabase: + """Handle to a .dftindex RocksDB store. + + Used by the distributed indexer coordinator to pre-register files, + reserve file_id ranges, bulk-ingest worker-produced SSTs, and rebuild + root summaries. + """ + + def __init__(self, index_path: str) -> None: ... + def init_schema(self) -> None: ... + def register_files(self, paths: List[str], build_manifest: bool = False) -> List[int]: + """Register each path in the DEFAULT-CF file registry and return + the assigned file_ids (parallel to `paths`). Idempotent for files + with matching hash.""" + ... + + def reserve_file_id_range(self, count: int) -> int: + """Atomically reserve `count` contiguous file_ids; return first.""" + ... + + def bulk_ingest( + self, + registry: "SstArtifactRegistry", + skip_cfs: Optional[Iterable[str]] = None, + ) -> None: + """Ingest all SSTs collected in the registry. + + skip_cfs is an optional iterable of CF names whose SSTs are left + outside the unified DB. Distributed builds pass + {"aggregation", "system_metrics"} to keep per-worker AGG/SYS SSTs + addressable via `agg_manifest.json` for parallel reads at analyze + time. See `dftracer.utils.dask.consolidate_index` to fold them + back into the unified DB later. + """ + ... + + def rebuild_root_summaries(self) -> None: + """Recompute ROOT_* summary column families from per-file CFs.""" + ... 
+ + def write_agg_global_config(self, time_interval_us: int, config_hash: int = 0) -> None: + """Write the aggregation global-config marker into the AGGREGATION CF. + + Required for `Indexer.iter_arrow_dfanalyzer_all` on distributed + builds (which never materialise the key via worker SSTs) and + post-consolidate indices. + """ + ... + + def write_agg_file_markers(self, file_ids: Iterable[int]) -> None: + """Write per-file aggregation completion markers into the AGGREGATION CF. + + Each marker is ``\\xFF\\xFF + file_id_be32``. The index resolver uses + their presence to decide whether each file has aggregated data; if + missing, ``ensure_indexed()`` concludes the aggregation tier is + incomplete and re-runs the entire build. Distributed_index must + call this after ``bulk_ingest`` so subsequent ``read_trace`` calls + do not redundantly re-aggregate. + """ + ... + + def write_aggregation_tracker(self, blobs: List[bytes]) -> None: + """Merge serialized AssociationTracker blobs and write the result + to the AGGREGATION CF under the ``__tracker__`` key.""" + ... + +class SstArtifactRegistry: + """Thread-safe collector for SST artifact paths produced by workers.""" + + def __init__(self) -> None: ... + def append(self, artifacts_dict: Dict[str, Optional[str]]) -> None: + """Add a per-batch Artifacts dict as returned by `build_sst_batch`.""" + ... + +def build_sst_batch( + files: List[str], + file_ids: List[int], + staging_dir: str, + batch_id: str, + index_dir: str = "", + checkpoint_size: int = 33554432, + build_manifest: bool = False, + force_rebuild: bool = False, + bloom_dimensions: Optional[List[str]] = None, + parallelism: int = 0, + flush_every_files: int = 0, + runtime: Optional[Union[Runtime, object]] = None, + aggregation_config: Optional[Any] = None, + file_slices: Optional[List[Optional[Tuple[int, int, int, bool, List[Tuple[int, int]]]]]] = None, +) -> Tuple[List[Dict[str, Optional[str]]], bytes]: + """Run the indexer pipeline with an SST sink. 
Returns + `(artifact_dicts, tracker_blob)`. `tracker_blob` is the serialized + merged AssociationTracker for the batch (empty bytes when + `aggregation_config` is None). `file_slices` enables intra-file + parallelism; entries are `None` (whole file) or + `(member_begin, member_end, checkpoint_idx_base, + skip_file_scoped_writes, members)`.""" + ... + +def plan_lpt_partition( + entries: List[Tuple[str, int]], num_workers: int +) -> List[List[Tuple[str, int]]]: + """Greedy LPT bin-packing of (path, size) tuples into num_workers + buckets, minimising the maximum per-worker total size.""" + ... + +def scan_files( + directory: str, + patterns: Optional[List[str]] = None, + recursive: bool = False, + runtime: Optional[Union[Runtime, object]] = None, +) -> List[Tuple[str, int]]: + """Parallel directory scan returning (path, size) tuples for regular + files matching the patterns.""" + ... + +def enable_aggregation_deterministic_ids() -> None: + """Flip the global aggregation StringIntern into deterministic-id mode + so the same string maps to the same 32-bit id in every worker process.""" + ... + +def move_artifacts(artifacts: Dict[str, Optional[str]], dest_dir: str) -> Dict[str, Optional[str]]: + """Move every populated SST in `artifacts` into `dest_dir` via the + C++ rename/copy helper, returning a fresh dict with the new paths.""" + ... + +def enumerate_gzip_members( + files: List[str], + runtime: Optional[Union[Runtime, object]] = None, +) -> List[List[Tuple[int, int]]]: + """Cooperative async scan of gzip member offsets. Returns lists of + `(c_offset, c_size)` parallel to `files`; empty for non-gzip files.""" + ... + +def plan_work_units( + member_map: List[List[Tuple[int, int]]], + num_workers: int, + target_c_size: int = 0, +) -> List[List[Tuple[int, int, int, int]]]: + """Deterministic LPT assignment of intra-file gzip-member slices across + workers. Returns per-worker lists of + `(file_idx, member_begin, member_end, c_size)`.""" + ... 
+ +def scan_aggregation_manifest( + agg_ssts: List[str], + sys_ssts: List[str], + scratch_dir: str, + meta_index_path: str, + batch_size: int = 10000, + time_granularity: float = 1.0, + time_resolution: float = 1e6, + query: Optional[str] = None, + group_by: Optional[List[str]] = None, + shard_begin: int = 0, + shard_end: int = 4096, + runtime: Optional[Union[Runtime, object]] = None, + file_hashes: Optional[Dict[str, str]] = None, + host_hashes: Optional[Dict[str, str]] = None, +) -> Dict[str, List[_ArrowBatchCapsule]]: + """Scan a worker's slice of the distributed aggregation manifest. + + Ingests `agg_ssts` + `sys_ssts` into a scratch IndexDatabase at + `scratch_dir` (caller owns the directory lifecycle) and runs the + dfanalyzer aggregation scan over `[shard_begin, shard_end)`. + `meta_index_path` is the unified .dftindex used to resolve file / + host hashes. + + Returns the same dict shape as `Indexer.iter_arrow_dfanalyzer_all`: + `{"events": [...], "profiles": [...], "system": [...]}`. + """ + ... diff --git a/python/dftracer/utils/indexer.py b/python/dftracer/utils/indexer.py new file mode 100644 index 00000000..c36754f6 --- /dev/null +++ b/python/dftracer/utils/indexer.py @@ -0,0 +1,371 @@ +"""Indexer utilities for building and managing trace indexes.""" + +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Set, Tuple, Union + +from .dftracer_utils_ext import CheckpointIndexer as _NativeCheckpointIndexer +from .dftracer_utils_ext import Indexer as _NativeIndexer +from .runtime import Runtime + +DEFAULT_CHECKPOINT_SIZE = 32 * 1024 * 1024 # 32MB + +FileInfo = Tuple[Dict[int, str], Dict[int, Set[int]]] + + +@dataclass +class AggregationConfig: + """Configuration for aggregation tier indexing. + + Attributes: + time_interval_ms: Time bucket size in milliseconds (default 5000). + group_keys: Extra grouping dimensions (default None). + custom_metric_fields: Extra numeric args fields to aggregate (default None). 
+ compute_percentiles: Enable percentile sketch collection (default False). + """ + + time_interval_ms: float = 5000.0 + group_keys: Optional[List[str]] = None + custom_metric_fields: Optional[List[str]] = None + compute_percentiles: bool = False + + +@dataclass +class IndexStatus: + """Status of index resolution. + + Attributes: + total_files: Total number of files discovered. + ready: Files that are fully indexed for requested tiers. + needs_work: Files that need indexing. + index_path: Path to the .dftindex store. + """ + + total_files: int + ready: List[str] = field(default_factory=list) + needs_work: List[str] = field(default_factory=list) + index_path: str = "" + + +class Indexer: + """High-level indexer for building and managing trace indexes. + + Supports tiered indexing: + - Tier 1: Checkpoints (for random access) + - Tier 2: Bloom filters and manifests (for fast filtering) + - Tier 3: Aggregation data (config-dependent) + + At least one of 'directory' or 'files' must be provided. + + Args: + directory: Directory containing trace files (.pfw/.pfw.gz). + files: List of specific file paths to index. + index_dir: Directory for .dftindex stores (default: next to files). + require_checkpoint: Build checkpoint tier (default True). + require_bloom: Build bloom filter tier (default True). + require_manifest: Build manifest tier (default True). + require_aggregation: Aggregation config or True for defaults (default None). + parallelism: Number of parallel workers (0 = all cores). + force_rebuild: Force rebuild even if index exists. + runtime: Runtime for executor parallelism (default: global runtime). + + Example: + >>> indexer = Indexer("/path/to/traces") + >>> indexer.ensure_indexed() # builds checkpoint, bloom, manifest + + >>> # With explicit file list + >>> indexer = Indexer(files=["/path/to/trace1.pfw.gz", "/path/to/trace2.pfw.gz"]) + >>> indexer.ensure_indexed() + + >>> # With aggregation + >>> indexer = Indexer( + ... "/path/to/traces", + ... 
require_aggregation=AggregationConfig(time_interval_ms=1000), + ... ) + >>> indexer.ensure_indexed() # fused pass with aggregation + """ + + def __init__( + self, + directory: str = "", + files: Optional[List[str]] = None, + index_dir: str = "", + require_checkpoint: bool = True, + require_bloom: bool = True, + require_manifest: bool = True, + require_aggregation: Optional[Union[bool, AggregationConfig]] = None, + checkpoint_size: int = DEFAULT_CHECKPOINT_SIZE, + parallelism: int = 0, + force_rebuild: bool = False, + runtime: Optional[Runtime] = None, + ): + # Normalize aggregation config + if require_aggregation is True: + agg_config = AggregationConfig() + elif isinstance(require_aggregation, AggregationConfig): + agg_config = require_aggregation + else: + agg_config = None + + # Build native indexer + native_runtime = runtime._native if runtime else None + self._native = _NativeIndexer( + directory=directory, + files=files, + index_dir=index_dir, + require_checkpoint=require_checkpoint, + require_bloom=require_bloom, + require_manifest=require_manifest, + require_aggregation=agg_config is not None, + time_interval_ms=agg_config.time_interval_ms if agg_config else 5000.0, + group_keys=agg_config.group_keys if agg_config else None, + custom_metric_fields=agg_config.custom_metric_fields if agg_config else None, + compute_percentiles=agg_config.compute_percentiles if agg_config else False, + checkpoint_size=checkpoint_size, + parallelism=parallelism, + force_rebuild=force_rebuild, + runtime=native_runtime, + ) + self._aggregation_config = agg_config + self._file_info_cache: Optional[FileInfo] = None + self._closed = False + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + return False + + def close(self): + """Release resources.""" + self._closed = True + + @property + def aggregation_config(self) -> Optional[AggregationConfig]: + """Aggregation configuration, if enabled.""" + return self._aggregation_config + 
+ + def resolve(self) -> IndexStatus: + """Check what files exist vs need indexing. + + Returns: + IndexStatus with total_files, ready, and needs_work lists. + """ + result = self._native.resolve() + return IndexStatus( + total_files=result["total_files"], + ready=result["ready"], + needs_work=result["needs_work"], + index_path=result.get("index_path", ""), + ) + + def build(self) -> None: + """Build all missing index tiers based on require_* flags. + + This method builds indexes in parallel using the Runtime executor. + When aggregation is enabled, it performs a fused pass for efficiency. + """ + self._native.build() + + def ensure_indexed(self) -> IndexStatus: + """Resolve and build if needed. + + Convenience method that calls resolve() then build() if needed. + + Returns: + IndexStatus after building. + """ + result = self._native.ensure_indexed() + return IndexStatus( + total_files=result["total_files"], + ready=result["ready"], + needs_work=result["needs_work"], + index_path=result.get("index_path", ""), + ) + + def get_checkpoint_indexer(self, file_path: str) -> _NativeCheckpointIndexer: + """Get a checkpoint indexer for a specific file. + + Returns an indexer for checkpoint-level operations on a single file, + such as finding checkpoints for random access. + + Args: + file_path: Path to the trace file (.pfw/.pfw.gz). + + Returns: + CheckpointIndexer instance for checkpoint operations (checkpoints, find_checkpoint, etc). + """ + return self._native.get_checkpoint_indexer(file_path) + + def get_hash_table(self, hash_type: str) -> dict: + """Query hash table mappings. + + Returns a dictionary mapping hash values to resolved names for the + given hash type. This is useful for resolving fhash/hhash values in + aggregated data. + + Args: + hash_type: One of 'file', 'host', 'string', or 'proc'. + + Returns: + dict mapping hash values (str) to resolved names (str). 
+ + Example: + >>> indexer = Indexer("/path/to/traces") + >>> indexer.ensure_indexed() + >>> file_names = indexer.get_hash_table("file") + >>> # file_names = {"abc123": "/path/to/data.h5", ...} + """ + return self._native.get_hash_table(hash_type) + + def query_file_pids(self, file_id: int) -> set: + """Query PIDs observed in a specific file. + + Args: + file_id: Integer file ID from index. + + Returns: + set of PIDs (int) observed in the file. + """ + return self._native.query_file_pids(file_id) + + def query_all_file_pids(self) -> dict: + """Query PIDs for all indexed files. + + Returns a dictionary mapping file_id to the set of PIDs observed + in that file. This is useful for distributed aggregation to assign + files to workers by PID affinity. + + Returns: + dict mapping file_id (int) to set of PIDs (int). + """ + return self._native.query_all_file_pids() + + def query_file_info(self) -> FileInfo: + """Query file distribution info in a single DB open. + + Returns: + Tuple of (file_id_to_path, file_pids) where: + - file_id_to_path: dict[int, str] mapping DB file ID to path + - file_pids: dict[int, set[int]] mapping file ID to PIDs + """ + if self._file_info_cache is None: + self._file_info_cache = self._native.query_file_info() + return self._file_info_cache + + def iter_aggregation(self, type: str = "events", batch_size: int = 10000): + """Iterate over aggregation data as Arrow batches. + + Requires that the index was built with require_aggregation=True. + Returns Arrow batches that can be converted to pandas or pyarrow. 
+ + Args: + type: Type of aggregation data - 'events', 'profiles', or 'system' + batch_size: Number of entries per Arrow batch (default 10000) + + Yields: + Arrow batch capsules implementing __arrow_c_array__ + + Example: + >>> import pyarrow as pa + >>> indexer = Indexer("/traces", require_aggregation=True) + >>> indexer.ensure_indexed() + >>> batches = [pa.record_batch(b) for b in indexer.iter_aggregation("events")] + >>> table = pa.concat_tables([pa.Table.from_batches([b]) for b in batches]) + """ + return self._native.iter_aggregation(type, batch_size) + + def iter_arrow_dfanalyzer( + self, + type: str = "events", + batch_size: int = 10000, + time_granularity: float = 1.0, + time_resolution: float = 1e6, + query: Optional[str] = None, + ): + """Iterate over aggregation data as dfanalyzer-compatible Arrow batches. + + Returns Arrow batches with columns matching dfanalyzer schema: + + - Events/Profiles: cat, func_name, pid, tid, file_hash, host_hash, + file_name, host_name, proc_name, io_cat, acc_pat, count, time, size, + time_min, time_max, size_min, size_max, time_range, time_start, time_end + - System: host_hash, time_range, ``sys_cpu_*``, ``sys_mem_*`` + + Hash resolution, time normalization, and computed columns (proc_name, + io_cat) are done in C++ for performance. 
+ + Args: + type: Type of aggregation data - 'events', 'profiles', or 'system' + batch_size: Number of entries per Arrow batch (default 10000) + time_granularity: Bucket width in seconds (default 1.0) + time_resolution: Microseconds per output time unit (default 1e6) + query: Optional query filter string (e.g., "pid == 1234 or pid == 5678") + + Yields: + Arrow batch capsules implementing __arrow_c_array__ + + Example: + >>> import pyarrow as pa + >>> indexer = Indexer("/traces", require_aggregation=True) + >>> indexer.ensure_indexed() + >>> batches = list(indexer.iter_arrow_dfanalyzer("events")) + >>> table = pa.concat_tables([pa.Table.from_batches([pa.record_batch(b)]) for b in batches]) + """ + if query is not None: + return self._native.iter_arrow_dfanalyzer( + type, batch_size, time_granularity, time_resolution, query + ) + return self._native.iter_arrow_dfanalyzer( + type, batch_size, time_granularity, time_resolution + ) + + def iter_arrow_dfanalyzer_all( + self, + batch_size: int = 10000, + time_granularity: float = 1.0, + time_resolution: float = 1e6, + query: Optional[str] = None, + group_by: Optional[List[str]] = None, + ): + """Iterate over all aggregation types in a single scan. + + This is ~3x faster than calling iter_arrow_dfanalyzer separately for + events, profiles, and system because it scans the index only once. + + When ``group_by`` is provided, aggregation collapses dimensions during + the scan and emits a reduced schema containing only the requested + group columns plus aggregated metrics (``count``, ``time``, ``size``, + ``time_sq``, ``size_sq``, ``time_min``, ``time_max``, ``size_min``, + ``size_max``, ``time_call_min``, ``time_call_max``, ``size_call_min``, + ``size_call_max``, ``time_start``, ``time_end``). 
+ + Args: + batch_size: Number of entries per Arrow batch (default 10000) + time_granularity: Bucket width in seconds (default 1.0) + time_resolution: Microseconds per output time unit (default 1e6) + query: Optional query filter string (e.g., "pid == 1234 or pid == 5678") + group_by: Optional list of columns to group by for coarse in-scan + aggregation. Supported: ``cat``, ``func_name``, ``pid``, + ``tid``, ``file_hash``, ``host_hash``, ``file_name``, + ``host_name``, ``proc_name``, ``io_cat``, ``acc_pat``, + ``time_range``. + + Returns: + Dict with 'events', 'profiles', 'system' keys, each containing + a list of Arrow batch capsules. + + Example: + >>> import pyarrow as pa + >>> indexer = Indexer("/traces", require_aggregation=True) + >>> indexer.ensure_indexed() + >>> all_batches = indexer.iter_arrow_dfanalyzer_all() + >>> events = [pa.record_batch(b) for b in all_batches["events"]] + """ + return self._native.iter_arrow_dfanalyzer_all( + batch_size, + time_granularity, + time_resolution, + query, + group_by, + ) diff --git a/python/dftracer/utils/runtime.py b/python/dftracer/utils/runtime.py index 932f40ce..1dc88b7f 100644 --- a/python/dftracer/utils/runtime.py +++ b/python/dftracer/utils/runtime.py @@ -121,18 +121,19 @@ class Runtime: Example:: - with Runtime(threads=8, python_threads=4) as rt: + with Runtime(threads=8, io_threads=8, python_threads=4) as rt: h = rt.submit(lambda x: x * 2, 21) assert h.get() == 42 Args: threads: Number of C++ executor threads (0 = hardware_concurrency). + io_threads: Number of C++ I/O threads (0 = hardware_concurrency). python_threads: Number of Python ThreadPoolExecutor threads (0 = min(32, threads)). 
""" - def __init__(self, threads: int = 0, python_threads: int = 0) -> None: - self._native = _NativeRuntime(threads) + def __init__(self, threads: int = 0, io_threads: int = 0, python_threads: int = 0) -> None: + self._native = _NativeRuntime(threads=threads, io_threads=io_threads) self._init_fields(python_threads) def _init_fields(self, python_threads: int = 0) -> None: @@ -357,6 +358,11 @@ def threads(self) -> int: """Number of C++ worker threads.""" return self._native.threads + @property + def io_threads(self) -> int: + """Number of C++ I/O threads.""" + return self._native.io_threads + @property def python_threads(self) -> int: """Number of Python worker threads (0 if pool not yet created).""" diff --git a/python/dftracer/utils/trace_reader.py b/python/dftracer/utils/trace_reader.py new file mode 100644 index 00000000..3ec9fd84 --- /dev/null +++ b/python/dftracer/utils/trace_reader.py @@ -0,0 +1,413 @@ +"""Python TraceReader wrapping the native C extension. + +Delegates all native methods and adds iter_lines_json / read_lines_json +as shims over iter_arrow via pyarrow. 
+""" + +from __future__ import annotations + +from types import TracebackType +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Type, Union + +from .dftracer_utils_ext import ( + JsonDictValue, + _ArrowBatchCapsule, +) +from .dftracer_utils_ext import ( + TraceReader as _NativeTraceReader, +) + +if TYPE_CHECKING: + from .arrow import ArrowTable + from .runtime import Runtime + + +class TraceReader: + __slots__ = ("_native",) + + def __init__( + self, + path: str, + index_dir: str = "", + checkpoint_size: int = 33554432, + auto_build_index: bool = False, + runtime: Optional[Runtime] = None, + ) -> None: + if runtime is not None: + self._native = _NativeTraceReader( + path, + index_dir=index_dir, + checkpoint_size=checkpoint_size, + auto_build_index=auto_build_index, + runtime=runtime, + ) + else: + self._native = _NativeTraceReader( + path, + index_dir=index_dir, + checkpoint_size=checkpoint_size, + auto_build_index=auto_build_index, + ) + + # -- properties -- + + @property + def path(self) -> str: + return self._native.path + + @property + def index_dir(self) -> str: + return self._native.index_dir + + @property + def has_index(self) -> bool: + return self._native.has_index + + @property + def num_lines(self) -> int: + return self._native.num_lines + + # -- lines -- + + def read_lines( + self, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: Optional[str] = None, + ) -> List[memoryview]: + return self._native.read_lines( + start_line=start_line, + end_line=end_line, + start_byte=start_byte, + end_byte=end_byte, + buffer_size=buffer_size, + query=query, + ) + + def iter_lines( + self, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: Optional[str] = None, + memory_budget: int = 0, + ) -> Iterator[memoryview]: + return self._native.iter_lines( + start_line=start_line, + end_line=end_line, + 
start_byte=start_byte, + end_byte=end_byte, + buffer_size=buffer_size, + query=query, + memory_budget=memory_budget, + ) + + # -- json -- + + def iter_json( + self, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: Optional[str] = None, + batch_size: int = 1024, + memory_budget: int = 0, + ) -> Iterator[JsonDictValue]: + return self._native.iter_json( + start_line=start_line, + end_line=end_line, + start_byte=start_byte, + end_byte=end_byte, + buffer_size=buffer_size, + query=query, + batch_size=batch_size, + memory_budget=memory_budget, + ) + + def read_json( + self, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: Optional[str] = None, + batch_size: int = 1024, + ) -> List[JsonDictValue]: + return self._native.read_json( + start_line=start_line, + end_line=end_line, + start_byte=start_byte, + end_byte=end_byte, + buffer_size=buffer_size, + query=query, + batch_size=batch_size, + ) + + # -- raw -- + + def read_raw( + self, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + line_aligned: bool = True, + multi_line: bool = True, + buffer_size: int = 4194304, + query: Optional[str] = None, + ) -> List[memoryview]: + return self._native.read_raw( + start_line=start_line, + end_line=end_line, + start_byte=start_byte, + end_byte=end_byte, + line_aligned=line_aligned, + multi_line=multi_line, + buffer_size=buffer_size, + query=query, + ) + + def iter_raw( + self, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + line_aligned: bool = True, + multi_line: bool = True, + buffer_size: int = 4194304, + query: Optional[str] = None, + memory_budget: int = 0, + ) -> Iterator[memoryview]: + return self._native.iter_raw( + start_line=start_line, + end_line=end_line, + start_byte=start_byte, + end_byte=end_byte, + line_aligned=line_aligned, + 
multi_line=multi_line, + buffer_size=buffer_size, + query=query, + memory_budget=memory_budget, + ) + + # -- arrow -- + + def iter_arrow( + self, + batch_size: int = 10000, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: Optional[str] = None, + flatten_objects: bool = False, + normalize: bool = False, + memory_budget: int = 0, + ) -> Iterator[_ArrowBatchCapsule]: + return self._native.iter_arrow( + batch_size=batch_size, + start_line=start_line, + end_line=end_line, + start_byte=start_byte, + end_byte=end_byte, + buffer_size=buffer_size, + query=query, + flatten_objects=flatten_objects, + normalize=normalize, + memory_budget=memory_budget, + ) + + def iter_arrow_stream( + self, + batch_size: int = 10000, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: Optional[str] = None, + flatten_objects: bool = False, + normalize: bool = False, + memory_budget: int = 0, + ) -> Any: + return self._native.iter_arrow_stream( + batch_size=batch_size, + start_line=start_line, + end_line=end_line, + start_byte=start_byte, + end_byte=end_byte, + buffer_size=buffer_size, + query=query, + flatten_objects=flatten_objects, + normalize=normalize, + memory_budget=memory_budget, + ) + + def read_arrow( + self, + batch_size: int = 10000, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: Optional[str] = None, + flatten_objects: bool = False, + normalize: bool = False, + ) -> ArrowTable: + return self._native.read_arrow( + batch_size=batch_size, + start_line=start_line, + end_line=end_line, + start_byte=start_byte, + end_byte=end_byte, + buffer_size=buffer_size, + query=query, + flatten_objects=flatten_objects, + normalize=normalize, + ) + + # -- JSON shims via arrow -- + + def iter_lines_json( + self, + batch_size: int = 10000, + start_line: int = 0, + end_line: 
int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: Optional[str] = None, + ) -> Iterator[Dict[str, Any]]: + try: + import pyarrow as pa + except ImportError: + raise ImportError( + "pyarrow is required for iter_lines_json. Install with: pip install pyarrow" + ) from None + + for capsule in self._native.iter_arrow( + batch_size=batch_size, + start_line=start_line, + end_line=end_line, + start_byte=start_byte, + end_byte=end_byte, + buffer_size=buffer_size, + query=query, + ): + rb = pa.record_batch(capsule) + yield from rb.to_pylist() + + def read_lines_json( + self, + batch_size: int = 10000, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: Optional[str] = None, + ) -> List[Dict[str, Any]]: + return list( + self.iter_lines_json( + batch_size=batch_size, + start_line=start_line, + end_line=end_line, + start_byte=start_byte, + end_byte=end_byte, + buffer_size=buffer_size, + query=query, + ) + ) + + # -- metadata -- + + def get_max_bytes(self) -> int: + return self._native.get_max_bytes() + + def get_num_lines(self) -> int: + return self._native.get_num_lines() + + # -- arrow IPC writing -- + + def write_arrow( + self, + path: str, + views: Optional[List[Union[str, Dict[str, Any]]]] = None, + chunk_size_mb: int = 32, + compression: str = "zstd", + batch_size: int = 10000, + ) -> Dict[str, Any]: + return self._native.write_arrow( + path, + views=views, + chunk_size_mb=chunk_size_mb, + compression=compression, + batch_size=batch_size, + ) + + def get_view_chunks( + self, + view: Optional[Union[str, Dict[str, Any]]] = None, + ) -> Dict[str, Any]: + return self._native.get_view_chunks(view=view) + + def write_view_chunk( + self, + output_file: str, + checkpoint_idx: int, + start_byte: int, + end_byte: int, + view: Optional[Union[str, Dict[str, Any]]] = None, + compression: str = "zstd", + batch_size: int = 10000, + ) -> Dict[str, Any]: + return 
self._native.write_view_chunk( + output_file=output_file, + checkpoint_idx=checkpoint_idx, + start_byte=start_byte, + end_byte=end_byte, + view=view, + compression=compression, + batch_size=batch_size, + ) + + def write_view_chunks( + self, + chunks: List[Dict[str, Any]], + output_dir: str, + view: Optional[Union[str, Dict[str, Any]]] = None, + compression: str = "zstd", + batch_size: int = 10000, + ) -> Dict[str, Any]: + return self._native.write_view_chunks( + chunks=chunks, + output_dir=output_dir, + view=view, + compression=compression, + batch_size=batch_size, + ) + + # -- context manager -- + + def __enter__(self) -> TraceReader: + self._native.__enter__() + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], + ) -> None: + self._native.__exit__(exc_type, exc_val, exc_tb) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 14b9493a..503bb86e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,15 +5,21 @@ add_rpath() need_zlib() -need_lz4() +if(DFTRACER_UTILS_ENABLE_LZ4) + need_lz4() +endif() +if(DFTRACER_UTILS_ENABLE_ZSTD) + need_zstd() +endif() need_rocksdb() need_argparse() need_ghc_filesystem() need_cpplogger() -need_yyjson() +need_simdjson() need_readerwriterqueue() need_concurrentqueue() need_tl_expected() +need_unordered_dense() if(DFTRACER_UTILS_ENABLE_ARROW) need_nanoarrow() @@ -32,6 +38,7 @@ set(DFTRACER_UTILS_CORE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/common/constants.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/common/format_detector.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/common/filesystem.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/common/memory_budget.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/env.cpp # Utilities ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/utils/timer.cpp @@ -62,7 +69,6 @@ set(DFTRACER_UTILS_CORE_SOURCES 
${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/rocksdb/database.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/rocksdb/filesystem.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/rocksdb/key_codec.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/rocksdb/async.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/rocksdb/db_manager.cpp ) @@ -94,6 +100,11 @@ endif() set(DFTRACER_UTILS_UTILITIES_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/metadata_collector_utility.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/fileio/chunk_writer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/fileio/parallel/layout.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/fileio/parallel/striped_writer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/fileio/parallel/sharded_writer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/fileio/parallel/padded_striped_writer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/fileio/parallel/merge.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/chunk_extractor_utility.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/chunk_manifest_mapper_utility.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/event_id_extractor_utility.cpp @@ -105,7 +116,13 @@ set(DFTRACER_UTILS_UTILITIES_SOURCES # Common utilities (shared across modules) ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/ddsketch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/log2_histogram.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/timestamp_histogram.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/json/json_value.cpp +) + +list(APPEND DFTRACER_UTILS_UTILITIES_SOURCES + # JSON parser (On-Demand API) + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/json/parser.cpp # Query language (generic JSON filtering) 
${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/query/ast.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/query/parser.cpp @@ -113,10 +130,17 @@ set(DFTRACER_UTILS_UTILITIES_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/query/query.cpp # DFT Aggregators ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/chunk_mapper_utility.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.cpp @@ -128,17 +152,25 @@ set(DFTRACER_UTILS_UTILITIES_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.cpp # DFT Reorganization ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/reconstruction_planner.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/event_router.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.cpp # DFT Statistics (trace statistics, aggregation, querying) ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/statistics/statistics_query_utility.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/log2_histogram.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/timestamp_histogram.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.cpp @@ -155,14 +187,18 @@ set(DFTRACER_UTILS_UTILITIES_SOURCES # Indexer ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/index_database.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/index_database_sst_writer_context.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/index_database_writer_context.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/index_builder_utility.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/provenance_database.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/visitors/bloom_visitor.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/visitors/manifest_visitor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.cpp # Reader ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/reader/trace_reader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/indexer_c.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/helpers.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/index_encoding.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/checkpoint_size.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/error.cpp # Indexer factory @@ -184,6 +220,8 @@ set(DFTRACER_UTILS_UTILITIES_SOURCES # Call Tree ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/call_tree/call_tree.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/call_tree/call_tree_internal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/call_tree/call_tree_save_binary.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/call_tree/call_tree_save_arrow.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/call_tree/json_serializer.cpp # Replay ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/replay/replay.cpp @@ -217,7 +255,11 @@ endif() if(DFTRACER_UTILS_ENABLE_ARROW_IPC) list(APPEND DFTRACER_UTILS_UTILITIES_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/arrow/ipc_reader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/arrow/ipc_writer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/arrow/parallel_reader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/arrow/partition_writer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/arrow/partition_router.cpp ) endif() @@ -245,7 +287,7 @@ else() endif() # Add other dependencies built with CPM -set(PKG_CONFIG_LIBS_PRIVATE "${PKG_CONFIG_LIBS_PRIVATE} -lyyjson") +set(PKG_CONFIG_LIBS_PRIVATE "${PKG_CONFIG_LIBS_PRIVATE} -lsimdjson") # +++++++++++++++++++++++++++++++++++++++++ # Core Library @@ -314,13 +356,23 @@ foreach(variant shared static) if(TARGET dftracer_utils_core_${variant}) # Link dependencies using helper functions link_cpp_logger(dftracer_utils_core_${variant} ${VARIANT_UPPER}) - link_yyjson(dftracer_utils_core_${variant} ${VARIANT_UPPER}) + link_simdjson(dftracer_utils_core_${variant} ${VARIANT_UPPER}) link_rocksdb(dftracer_utils_core_${variant} ${VARIANT_UPPER}) link_zlib(dftracer_utils_core_${variant} ${VARIANT_UPPER}) + 
link_unordered_dense(dftracer_utils_core_${variant}) # Add stdfs if needed add_stdfs_if_needed(dftracer_utils_core_${variant}) + # mpi_utils.cpp is part of the core library sources when MPI is + # enabled, so core itself needs the MPI include path and runtime. + # Without this the precompiled header pulls in <mpi.h> and fails + # with "mpi.h: No such file or directory". + if(DFTRACER_UTILS_ENABLE_MPI) + target_compile_definitions(dftracer_utils_core_${variant} PUBLIC DFTRACER_UTILS_MPI_ENABLED) + target_link_libraries(dftracer_utils_core_${variant} PUBLIC MPI::MPI_CXX) + endif() + # Set warnings target_set_warnings(dftracer_utils_core_${variant}) @@ -416,6 +468,31 @@ foreach(variant shared static) string(TOUPPER ${variant} VARIANT_UPPER) link_nanoarrow(dftracer_utils_utilities_${variant} ${VARIANT_UPPER}) endif() + + # Link zstd when ENABLE_ZSTD is on so zstd headers resolve for this library's + # own sources (e.g. arrow ipc_writer.cpp guarded by DFTRACER_UTILS_ENABLE_ZSTD). + if(DFTRACER_UTILS_ENABLE_ZSTD) + if(TARGET zstd::libzstd_shared) + target_link_libraries(dftracer_utils_utilities_${variant} + PRIVATE zstd::libzstd_shared) + elseif(TARGET zstd::libzstd_static) + target_link_libraries(dftracer_utils_utilities_${variant} + PRIVATE zstd::libzstd_static) + elseif(TARGET libzstd_shared) + target_link_libraries(dftracer_utils_utilities_${variant} + PRIVATE libzstd_shared) + elseif(TARGET libzstd_static) + target_link_libraries(dftracer_utils_utilities_${variant} + PRIVATE libzstd_static) + endif() + endif() + + # Lustre stripe query (optional). The config header carries the + # DFTRACER_UTILS_HAVE_LUSTREAPI define; we just need to link the lib.
+ if(DFTRACER_UTILS_HAVE_LUSTREAPI) + target_link_libraries(dftracer_utils_utilities_${variant} + PRIVATE ${LUSTREAPI_LIBRARY}) + endif() endif() endforeach() @@ -755,6 +832,13 @@ if(DFTRACER_UTILS_BUILD_BINARIES) ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_server.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_comparator.cpp) + set(DFTRACER_MPI_BINARIES "") + if(DFTRACER_UTILS_ENABLE_MPI) + list(APPEND DFTRACER_MPI_BINARIES + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_aggregator_mpi.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_call_tree_mpi.cpp) + endif() + foreach(bin ${DFTRACER_BINARIES}) string(REPLACE ".cpp" "" bin_exec ${bin}) string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/" "" @@ -791,6 +875,37 @@ if(DFTRACER_UTILS_BUILD_BINARIES) create_python_wrapper(${bin_exec}) endif() endforeach() + + foreach(bin ${DFTRACER_MPI_BINARIES}) + string(REPLACE ".cpp" "" bin_exec ${bin}) + string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/" "" + bin_exec ${bin_exec}) + + add_executable(${bin_exec} ${bin}) + set_target_properties( + ${bin_exec} PROPERTIES OUTPUT_NAME "${bin_exec}" RUNTIME_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}/bin) + target_add_rpath(${bin_exec}) + + target_link_libraries(${bin_exec} + PRIVATE dftracer_utils argparse::argparse + MPI::MPI_CXX) + target_include_directories(${bin_exec} + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../include) + add_stdfs_if_needed(${bin_exec}) + target_set_warnings(${bin_exec}) + + if(DFTRACER_UTILS_COVERAGE AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + target_compile_options(${bin_exec} PRIVATE --coverage -fprofile-arcs + -ftest-coverage) + target_link_libraries(${bin_exec} PRIVATE --coverage) + endif() + + install(TARGETS ${bin_exec} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + if(SKBUILD) + create_python_wrapper(${bin_exec}) + endif() + endforeach() endif() # 
############################################################################## @@ -871,6 +986,12 @@ if(DFTRACER_UTILS_BUILD_PYTHON) ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/dftracer_utils_ext.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/indexer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/indexer.h + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/index_database.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/index_database.h + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/sst_distribution.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/sst_distribution.h + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/batch_indexer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/batch_indexer.h ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/indexer_checkpoint.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/indexer_checkpoint.h ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/trace_reader.cpp @@ -879,13 +1000,19 @@ if(DFTRACER_UTILS_BUILD_PYTHON) ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/runtime.h ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/task_handle.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/task_handle.h + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/memoryview_batch.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/memoryview_batch.h ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/trace_reader_iterator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/trace_reader_iterator.h - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/json.h ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/json.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/json.h ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/statistics_query.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/arrow_helpers.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/arrow_helpers.h + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/arrow_stream_capsule.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/arrow_stream_capsule.h + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/schema_reconcile.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/schema_reconcile.h ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/statistics_query.h ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/statistics_aggregator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/statistics_aggregator.h @@ -898,7 +1025,11 @@ if(DFTRACER_UTILS_BUILD_PYTHON) ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/aggregator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/aggregator.h ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/comparator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/comparator.h) + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/comparator.h + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/streaming_iterator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/streaming_iterator.h + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/arrow_parallel_reader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/arrow_parallel_reader.h) # Link to unified library to test if this fixes the bus error target_link_libraries(dftracer_utils_ext PRIVATE dftracer_utils::shared) diff --git a/src/dftracer/utils/binaries/common_cli.h b/src/dftracer/utils/binaries/common_cli.h new file mode 100644 index 00000000..d4afb592 --- /dev/null +++ b/src/dftracer/utils/binaries/common_cli.h @@ -0,0 +1,329 @@ +#ifndef DFTRACER_UTILS_BINARIES_COMMON_CLI_H +#define DFTRACER_UTILS_BINARIES_COMMON_CLI_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::cli { + +class ArgParse; + +struct CliSchema { + virtual ~CliSchema() = default; + virtual void register_on(argparse::ArgumentParser& p) = 0; + virtual void parse_from(const argparse::ArgumentParser& p) = 0; + 
virtual bool validate() { return true; } +}; + +class ArgParse { + public: + explicit ArgParse(argparse::ArgumentParser& parser) : parser_(parser) {} + virtual ~ArgParse() = default; + + ArgParse(const ArgParse&) = delete; + ArgParse& operator=(const ArgParse&) = delete; + + void setup() { + for (auto* s : schemas_) s->register_on(parser_); + register_args(); + } + + bool parse(int argc, char** argv) { + try { + parser_.parse_args(argc, argv); + } catch (const std::exception& err) { + DFTRACER_UTILS_LOG_ERROR("Error: %s", err.what()); + std::fprintf(stderr, "%s\n", parser_.help().str().c_str()); + return false; + } + for (auto* s : schemas_) s->parse_from(parser_); + post_parse(); + for (auto* s : schemas_) { + if (!s->validate()) return false; + } + return validate(); + } + + template + void schema(Schemas&... args) { + (schemas_.push_back(&args), ...); + } + + protected: + virtual void register_args() {} + virtual void post_parse() {} + virtual bool validate() { return true; } + + argparse::ArgumentParser& parser() { return parser_; } + const argparse::ArgumentParser& parser() const { return parser_; } + + private: + argparse::ArgumentParser& parser_; + std::vector schemas_; +}; + +enum class DirMode { DEFAULT_DOT, DEFAULT_EMPTY, REQUIRED }; + +struct DirectoryArgs : CliSchema { + DirMode mode = DirMode::DEFAULT_DOT; + std::string help = "Directory containing trace files"; + std::string value; + + DirectoryArgs() = default; + explicit DirectoryArgs(DirMode m) : mode(m) {} + DirectoryArgs(DirMode m, std::string h) : mode(m), help(std::move(h)) {} + + void register_on(argparse::ArgumentParser& p) override { + auto& arg = p.add_argument("-d", "--directory").help(help); + switch (mode) { + case DirMode::DEFAULT_DOT: + arg.default_value("."); + break; + case DirMode::DEFAULT_EMPTY: + arg.default_value(""); + break; + case DirMode::REQUIRED: + arg.required(); + break; + } + } + + void parse_from(const argparse::ArgumentParser& p) override { + value = 
p.get("--directory"); + } + + bool validate() override { + if (mode == DirMode::REQUIRED && value.empty()) { + DFTRACER_UTILS_LOG_ERROR("%s", "--directory is required"); + return false; + } + if (!value.empty() && !fs::exists(value)) { + DFTRACER_UTILS_LOG_ERROR("Directory does not exist: %s", + value.c_str()); + return false; + } + return true; + } +}; + +struct FilesArgs : CliSchema { + std::string help = "Trace files (.pfw, .pfw.gz)"; + std::vector<std::string> value; + + FilesArgs() = default; + explicit FilesArgs(std::string h) : help(std::move(h)) {} + + void register_on(argparse::ArgumentParser& p) override { + p.add_argument("--files") + .help(help) + .nargs(argparse::nargs_pattern::any) + .default_value<std::vector<std::string>>({}); + } + + void parse_from(const argparse::ArgumentParser& p) override { + value = p.get<std::vector<std::string>>("--files"); + } +}; + +struct PipelineArgs : CliSchema { + std::size_t executor_threads = 0; + std::size_t io_threads = 0; + bool time_profiling = false; + + PipelineArgs() = default; + + void register_on(argparse::ArgumentParser& p) override { + p.add_group("Pipeline"); + p.add_argument("--executor-threads") + .help( + "Number of worker threads for parallel processing " + "(default: number of CPU cores)") + .scan<'d', std::size_t>() + .default_value(static_cast<std::size_t>( + dftracer_utils_hardware_concurrency())); + p.add_argument("--io-threads") + .help( + "Number of I/O threads " + "(default: number of CPU cores)") + .scan<'d', std::size_t>() + .default_value(static_cast<std::size_t>(dftracer_utils_hardware_concurrency()));  // stored default must have the type read back by get<std::size_t>() + p.add_argument("--time-profiling") + .help("Print stage timing breakdown to stderr") + .flag(); + } + + void parse_from(const argparse::ArgumentParser& p) override { + executor_threads = p.get<std::size_t>("--executor-threads"); + io_threads = p.get<std::size_t>("--io-threads"); + time_profiling = p.get<bool>("--time-profiling"); + } + + bool validate() override { + if (executor_threads == 0) { + DFTRACER_UTILS_LOG_ERROR( + "%s", "--executor-threads must be greater than 0"); + return false; + } + return true; + } + +
void apply(PipelineConfig& config) const { + config.with_compute_threads(executor_threads); + config.with_io_threads(io_threads); + } +}; + +struct IndexingArgs : CliSchema { + std::string index_dir; + std::size_t checkpoint_size = 0; + bool force = false; + + std::string index_dir_help = "Directory for .dftindex stores"; + std::string force_help = "Force index recreation"; + bool with_index_dir = true; + bool with_force = true; + + IndexingArgs() = default; + explicit IndexingArgs(bool f) : with_force(f) {} + + void register_on(argparse::ArgumentParser& p) override { + p.add_group("Indexing"); + if (with_index_dir) { + p.add_argument("--index-dir") + .help(index_dir_help) + .default_value(""); + } + p.add_argument("--checkpoint-size") + .help("Checkpoint size for gzip indexing in bytes (default: " + + std::to_string(constants::indexer::DEFAULT_CHECKPOINT_SIZE) + + ")") + .scan<'d', std::size_t>() + .default_value(static_cast( + constants::indexer::DEFAULT_CHECKPOINT_SIZE)); + if (with_force) { + p.add_argument("-f", "--force").help(force_help).flag(); + } + } + + void parse_from(const argparse::ArgumentParser& p) override { + if (with_index_dir) { + index_dir = p.get("--index-dir"); + } + checkpoint_size = p.get("--checkpoint-size"); + if (with_force) { + force = p.get("--force"); + } + } +}; + +struct QueryArgs : CliSchema { + std::string query; + std::string help = + "Query DSL filter (e.g., 'cat == \"POSIX\" and dur > 1000')"; + + QueryArgs() = default; + explicit QueryArgs(std::string h) : help(std::move(h)) {} + + void register_on(argparse::ArgumentParser& p) override { + p.add_group("Query"); + p.add_argument("--query").help(help).default_value(""); + } + + void parse_from(const argparse::ArgumentParser& p) override { + query = p.get("--query"); + } +}; + +struct WatchdogArgs : CliSchema { + bool disable = false; + int global_timeout = 0; + int task_timeout = 0; + int interval = 1; + int warning_threshold = 300; + int idle_timeout = 300; + int 
deadlock_timeout = 600; + + void register_on(argparse::ArgumentParser& p) override { + p.add_group("Watchdog"); + p.add_argument("--disable-watchdog") + .help("Disable watchdog for hang detection") + .flag(); + p.add_argument("--watchdog-global-timeout") + .help( + "Watchdog global timeout for pipeline execution in " + "seconds (0 = no timeout)") + .scan<'d', int>() + .default_value(0); + p.add_argument("--watchdog-task-timeout") + .help("Watchdog default task timeout in seconds (0 = no timeout)") + .scan<'d', int>() + .default_value(0); + p.add_argument("--watchdog-interval") + .help("Watchdog check interval in seconds") + .scan<'d', int>() + .default_value(1); + p.add_argument("--watchdog-warning-threshold") + .help("Watchdog long-running task warning threshold in seconds") + .scan<'d', int>() + .default_value(300); + p.add_argument("--watchdog-idle-timeout") + .help("Watchdog idle timeout in seconds (0 = use default)") + .scan<'d', int>() + .default_value(300); + p.add_argument("--watchdog-deadlock-timeout") + .help("Watchdog deadlock timeout in seconds (0 = use default)") + .scan<'d', int>() + .default_value(600); + } + + void parse_from(const argparse::ArgumentParser& p) override { + disable = p.get("--disable-watchdog"); + global_timeout = p.get("--watchdog-global-timeout"); + task_timeout = p.get("--watchdog-task-timeout"); + interval = p.get("--watchdog-interval"); + warning_threshold = p.get("--watchdog-warning-threshold"); + idle_timeout = p.get("--watchdog-idle-timeout"); + deadlock_timeout = p.get("--watchdog-deadlock-timeout"); + } + + void apply(PipelineConfig& config) const { + config.with_watchdog(!disable) + .with_global_timeout(std::chrono::seconds(global_timeout)) + .with_task_timeout(std::chrono::seconds(task_timeout)) + .with_watchdog_interval(std::chrono::seconds(interval)) + .with_warning_threshold(std::chrono::seconds(warning_threshold)) + .with_executor_idle_timeout(std::chrono::seconds(idle_timeout)) + .with_executor_deadlock_timeout( + 
std::chrono::seconds(deadlock_timeout)); + } +}; + +inline PipelineConfig build_pipeline_config(const std::string& name, + const PipelineArgs& pipeline) { + auto config = PipelineConfig().with_name(name).with_watchdog(false); + pipeline.apply(config); + return config; +} + +inline PipelineConfig build_pipeline_config(const std::string& name, + const PipelineArgs& pipeline, + const WatchdogArgs& watchdog) { + auto config = PipelineConfig().with_name(name); + pipeline.apply(config); + watchdog.apply(config); + return config; +} + +} // namespace dftracer::utils::cli + +#endif // DFTRACER_UTILS_BINARIES_COMMON_CLI_H diff --git a/src/dftracer/utils/binaries/dftracer_aggregator.cpp b/src/dftracer/utils/binaries/dftracer_aggregator.cpp index c511cabe..6e1a4d12 100644 --- a/src/dftracer/utils/binaries/dftracer_aggregator.cpp +++ b/src/dftracer/utils/binaries/dftracer_aggregator.cpp @@ -1,60 +1,330 @@ #include -#include -#include -#include #include #include -#include #include #include -#include +#include +#include #include -#include -#include -#include +#include #include -#include +#include + +#include "common_cli.h" +#include "dftracer/utils/core/utils/timer.h" #ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC #include #endif -#include - -#include -#include -#include #include -#include +#include using namespace dftracer::utils; using namespace dftracer::utils::utilities; using namespace dftracer::utils::utilities::composites::dft::aggregators; -static coro::CoroTask run_aggregator(argparse::ArgumentParser& program) { - std::string log_dir = program.get("--directory"); - std::string output_file = program.get("--output"); - double time_interval_ms = program.get("--time-interval"); +class AggregatorArgParse : public cli::ArgParse { + public: + cli::DirectoryArgs directory{ + cli::DirMode::DEFAULT_DOT, + "Input directory containing .pfw or .pfw.gz files"}; + cli::PipelineArgs pipeline; + cli::IndexingArgs indexing; + cli::QueryArgs query_args{ + "Query DSL filter (e.g., 'cat == 
\"POSIX\" and dur > 1000')"}; + + std::string output; + double time_interval = 5000.0; + std::string group_keys; + std::string metric_fields; + bool compress = false; + int compression_level = 1; + std::string boundary_events; + bool no_track_parents = false; + std::size_t chunk_size = 4; + std::size_t read_batch_size = 4; + std::string event_format = "counter"; + bool compute_percentiles = false; + std::string percentiles = "0.25,0.5,0.75,0.90"; + double relative_accuracy = 0.01; + std::string format = "json"; + bool no_default_args = false; + + explicit AggregatorArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + indexing.index_dir_help = + "Directory to store index files (default: system temp directory)"; + indexing.force_help = "Force index recreation"; + schema(directory, pipeline, indexing, query_args); + } + + protected: + void register_args() override { + parser() + .add_argument("-o", "--output") + .help("Output file path for aggregated counters") + .default_value("aggregated_output.json"); + + parser() + .add_argument("-t", "--time-interval") + .help("Time interval in milliseconds for bucketing (default: 5000)") + .scan<'g', double>() + .default_value(5000.0); + + parser() + .add_argument("-g", "--group-keys") + .help( + "Comma-separated extra group keys from args (e.g., " + "epoch,step,level)") + .default_value(""); + + parser() + .add_argument("-m", "--metric-fields") + .help( + "Comma-separated custom metric fields from args (e.g., " + "iter_count,num_events)") + .default_value(""); + + parser() + .add_argument("--compress") + .help("Compress output using gzip") + .default_value(false) + .implicit_value(true); + + parser() + .add_argument("--compression-level") + .help("Gzip compression level (0-9, default: 1)") + .scan<'d', int>() + .default_value(1); + + parser() + .add_argument("--boundary-events") + .help( + "Boundary event configuration: " + "event_name:value_field:output_name " + "(e.g., \"epoch.block:iter_count:epoch\")") + 
.default_value(""); + + parser() + .add_argument("--no-track-process-parents") + .help( + "Disable tracking of process parent relationships from " + "fork/spawn") + .default_value(false) + .implicit_value(true); + + parser() + .add_argument("--chunk-size") + .help( + "Target chunk size in MB for parallel processing (default: 4)") + .scan<'d', std::size_t>() + .default_value(static_cast(4)); + + parser() + .add_argument("--read-batch-size") + .help( + "Batch read size in MB for stream processing (default: 4, " + "higher = " + "faster but more memory)") + .scan<'d', std::size_t>() + .default_value(static_cast(4)); + + parser() + .add_argument("--event-format") + .help( + "Perfetto event format: 'counter' (ph=C, point-in-time, " + "default), " + "'async' (ph=b/e, async tracks with overlaps), " + "'regular' (ph=X, duration events with original TID)") + .default_value("counter"); + + parser() + .add_argument("--compute-percentiles") + .help( + "Enable percentile/quantile computation using DDSketch (opt-in " + "due " + "to memory overhead)") + .default_value(false) + .implicit_value(true); + + parser() + .add_argument("--percentiles") + .help( + "Comma-separated percentiles to compute (e.g., " + "\"0.25,0.5,0.75,0.90\" for P25, P50, P75, P90)") + .default_value("0.25,0.5,0.75,0.90"); + + parser() + .add_argument("--relative-accuracy") + .help( + "Relative accuracy for DDSketch percentile estimation " + "(default: 0.01 = 1%)") + .scan<'g', double>() + .default_value(0.01); + + parser() + .add_argument("--format") + .help( + "Output format: 'json' (Perfetto JSON, default) or " + "'arrow' (Arrow IPC file, .arrows extension)") + .default_value("json"); + + parser() + .add_argument("--no-default-args") + .help( + "Disable automatic aggregation of numeric event args " + "(offset, whence, flags, etc.)") + .default_value(false) + .implicit_value(true); + } + + void post_parse() override { + output = parser().get("--output"); + time_interval = parser().get("--time-interval"); + 
group_keys = parser().get("--group-keys"); + metric_fields = parser().get("--metric-fields"); + compress = parser().get("--compress"); + compression_level = parser().get("--compression-level"); + boundary_events = parser().get("--boundary-events"); + no_track_parents = parser().get("--no-track-process-parents"); + chunk_size = parser().get("--chunk-size"); + read_batch_size = parser().get("--read-batch-size"); + event_format = parser().get("--event-format"); + compute_percentiles = parser().get("--compute-percentiles"); + percentiles = parser().get("--percentiles"); + relative_accuracy = parser().get("--relative-accuracy"); + format = parser().get("--format"); + no_default_args = parser().get("--no-default-args"); + } +}; + +// Write global config and per-file tracking entries. +static void write_aggregation_tracking( + dftracer::utils::rocksdb::RocksDatabase* db, + const AggregationConfig& config, + const std::vector& processed_files, + const std::string& index_path) { + namespace rcf = dftracer::utils::rocksdb::cf; + + // Open index database to get file_ids + indexer::IndexDatabase idx_db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + + auto batch = db->begin_batch(); + + // Write global config once + AggGlobalConfig global_cfg; + global_cfg.time_interval_us = config.time_interval_us; + global_cfg.config_hash = 0; + db->put(batch, rcf::AGGREGATION, std::string_view(AGG_GLOBAL_CONFIG_KEY, 2), + serialize_agg_global_config(global_cfg)); + + // Per-file: empty value (presence = aggregated) + for (const auto& file_path : processed_files) { + int file_id = idx_db.find_file(file_path); + if (file_id >= 0) { + auto key = make_agg_file_key(file_id); + db->put(batch, rcf::AGGREGATION, key, ""); + } + } + + db->commit_batch(batch); +} + +static coro::CoroTask batch_index_and_aggregate( + CoroScope* scope, std::vector file_paths, + std::string index_dir, std::size_t checkpoint_size, bool force_rebuild, + std::size_t parallelism, 
AggregationConfig agg_config, + std::shared_ptr agg_db, + std::uint32_t config_hash) { + auto batch_config = std::make_shared(); + batch_config->file_paths = std::move(file_paths); + batch_config->index_dir = std::move(index_dir); + batch_config->checkpoint_size = checkpoint_size; + batch_config->parallelism = parallelism; + batch_config->force_rebuild = force_rebuild; + batch_config->use_batch_write = true; + + auto agg_config_ptr = + std::make_shared(std::move(agg_config)); + batch_config->dft_visitor_factory = + [agg_db, config_hash, agg_config_ptr](const std::string& file_path) + -> std::vector> { + std::vector> visitors; + visitors.push_back(std::make_unique( + agg_db, config_hash, *agg_config_ptr, file_path)); + return visitors; + }; + + co_return co_await indexer::IndexBatchBuilderUtility::process( + scope, std::move(batch_config)); +} + +static PerfettoTraceWriterInput build_streaming_input( + EventAggregator* merger_ptr, const AggregationConfig* agg_config, + const std::string* output_file, bool compress_output, int compression_level, + PerfettoEventFormat event_format) { + auto global_tracker = merger_ptr->build_global_tracker(); + + PerfettoTraceWriterInput input; + input.output_path = *output_file; + input.aggregator = merger_ptr; + input.tracker = global_tracker.get(); + input.agg_config = agg_config; + input.owned_tracker = std::move(global_tracker); + input.root_pids = input.tracker->get_root_pids(); + input.compute_statistics = agg_config->compute_statistics; + input.compute_percentiles = agg_config->compute_percentiles; + input.percentiles = agg_config->percentiles; + input.compress = compress_output; + input.compression_level = compression_level; + input.format = event_format; + + const auto& intervals = input.tracker->get_all_intervals(); + if (!intervals.empty()) { + std::uint64_t global_min = UINT64_MAX; + std::uint64_t global_max = 0; + for (const auto& interval : intervals) { + global_min = std::min(global_min, interval.start_ts); + 
global_max = std::max(global_max, interval.end_ts); + auto& range = input.boundary_ranges[interval.name][interval.value]; + if (range.ts == 0 && range.te == 0) { + range.ts = interval.start_ts; + range.te = interval.end_ts; + } else { + range.ts = std::min(range.ts, interval.start_ts); + range.te = std::max(range.te, interval.end_ts); + } + } + if (global_max > global_min) { + input.trace_duration = global_max - global_min; + } + } + + return input; +} + +static coro::CoroTask run_aggregator(const AggregatorArgParse* cli) { + auto log_dir = cli->directory.value; + auto output_file = cli->output; + auto time_interval_ms = cli->time_interval; std::uint64_t time_interval_us = static_cast(time_interval_ms * 1000.0); - std::string group_keys_str = program.get("--group-keys"); - std::string metric_fields_str = program.get("--metric-fields"); - std::string query_str = program.get("--query"); - bool force_rebuild = program.get("--force"); - std::size_t checkpoint_size = program.get("--checkpoint-size"); - std::size_t executor_threads = - program.get("--executor-threads"); - std::string index_dir = program.get("--index-dir"); - bool compress_output = program.get("--compress"); - int compression_level = program.get("--compression-level"); - std::string boundary_events_str = - program.get("--boundary-events"); - bool no_track_parents = program.get("--no-track-process-parents"); - std::size_t chunk_size_mb = program.get("--chunk-size"); - std::size_t batch_size_mb = program.get("--read-batch-size"); - std::string event_format_str = program.get("--event-format"); - bool compute_percentiles = program.get("--compute-percentiles"); - std::string percentiles_str = program.get("--percentiles"); - double relative_accuracy = program.get("--relative-accuracy"); - std::string output_format = program.get("--format"); + const auto& group_keys_str = cli->group_keys; + const auto& metric_fields_str = cli->metric_fields; + const auto& query_str = cli->query_args.query; + auto force_rebuild = 
cli->indexing.force; + auto checkpoint_size = cli->indexing.checkpoint_size; + auto executor_threads = cli->pipeline.executor_threads; + auto index_dir = cli->indexing.index_dir; + auto compress_output = cli->compress; + auto compression_level = cli->compression_level; + const auto& boundary_events_str = cli->boundary_events; + auto no_track_parents = cli->no_track_parents; + const auto& event_format_str = cli->event_format; + auto compute_percentiles = cli->compute_percentiles; + const auto& percentiles_str = cli->percentiles; + auto relative_accuracy = cli->relative_accuracy; + const auto& output_format = cli->format; if (!AggregationConfig::is_valid_format(output_format)) { DFTRACER_UTILS_LOG_ERROR( @@ -131,31 +401,6 @@ static coro::CoroTask run_aggregator(argparse::ArgumentParser& program) { } } - std::string temp_index_dir; - if (index_dir.empty()) { - try { - auto temp_path = fs::temp_directory_path(); - temp_path /= "dftracer_idx_" + std::to_string(std::time(nullptr)) + - "_" + std::to_string(getpid()); - temp_index_dir = temp_path.string(); - fs::create_directories(temp_index_dir); - index_dir = temp_index_dir; - DFTRACER_UTILS_LOG_INFO("Created temporary index directory: %s", - index_dir.c_str()); - } catch (const fs::filesystem_error& e) { - temp_index_dir = "/tmp/dftracer_idx_" + - std::to_string(std::time(nullptr)) + "_" + - std::to_string(getpid()); - fs::create_directories(temp_index_dir); - index_dir = temp_index_dir; - DFTRACER_UTILS_LOG_WARN( - "Failed to get system temp directory, using /tmp: %s", - e.what()); - DFTRACER_UTILS_LOG_INFO("Created temporary index directory: %s", - index_dir.c_str()); - } - } - log_dir = fs::absolute(log_dir).string(); output_file = fs::absolute(output_file).string(); @@ -220,31 +465,35 @@ static coro::CoroTask run_aggregator(argparse::ArgumentParser& program) { agg_config.percentiles = percentiles; agg_config.boundary_events = boundary_events; agg_config.track_process_parents = !no_track_parents; + 
agg_config.track_default_args = !cli->no_default_args; - using common::query::Query; - std::optional query; if (!query_str.empty()) { - auto result = Query::from_string(query_str); - if (!result) { - DFTRACER_UTILS_LOG_ERROR("Invalid --query: %s", - result.error().format().c_str()); - co_return 1; - } - query = std::move(*result); + DFTRACER_UTILS_LOG_WARN( + "--query is not yet supported in fused mode, ignoring"); } - // Discover input files - filesystem::PatternDirectoryScannerUtility scanner; - filesystem::PatternDirectoryScannerUtilityInput scan_input{ - log_dir, {".pfw", ".pfw.gz"}, false}; - auto matched_entries = co_await scanner.process(scan_input); - - std::vector input_files; - input_files.reserve(matched_entries.size()); - for (const auto& entry : matched_entries) { - input_files.push_back(entry.path.string()); + // Use hash=0 for simplicity (no config-based filtering) + constexpr std::uint32_t config_hash = 0; + + Timer stages_storage("dftracer_aggregator"); + Timer* stages = cli->pipeline.time_profiling ? 
&stages_storage : nullptr; + Timer overall(true); + + namespace idx = composites::dft::indexing; + + auto scan_result = std::make_unique(); + { + ScopedTimer _t(stages, "scan_and_resolve"); + idx::IndexResolverUtility resolver; + idx::ResolverInput input; + input.directory = log_dir; + input.index_dir = index_dir; + input.require_aggregation = !force_rebuild; + input.aggregation_config = agg_config; + *scan_result = co_await resolver.process(input); } + auto& input_files = scan_result->all_files; if (input_files.empty()) { DFTRACER_UTILS_LOG_ERROR("No .pfw or .pfw.gz files found in: %s", log_dir.c_str()); @@ -253,442 +502,234 @@ static coro::CoroTask run_aggregator(argparse::ArgumentParser& program) { DFTRACER_UTILS_LOG_INFO("Found %zu input files", input_files.size()); - auto pipeline_config = PipelineConfig() - .with_name("DFTracer Aggregator") - .with_compute_threads(executor_threads) - .with_watchdog(false); + auto& shared_index_path = scan_result->index_path; + + auto pipeline_config = + cli::build_pipeline_config("DFTracer Aggregator", cli->pipeline); Pipeline pipeline(pipeline_config); - auto start_time = std::chrono::high_resolution_clock::now(); + if (force_rebuild && fs::exists(shared_index_path)) { + DFTRACER_UTILS_LOG_INFO("Clearing shared index store: %s", + shared_index_path.c_str()); + fs::remove_all(shared_index_path); + } - EventAggregatorUtility merger; - std::atomic global_chunk_idx{0}; + std::shared_ptr agg_db; + std::unique_ptr merger; + { + ScopedTimer _t(stages, "open_rocksdb"); + agg_db = EventAggregator::open_with_merge_operator(shared_index_path); + merger = std::make_unique(agg_db, config_hash); + } - if (force_rebuild && !input_files.empty()) { - const std::string shared_index_path = - composites::dft::internal::determine_index_path(input_files.front(), - index_dir); - if (fs::exists(shared_index_path)) { - DFTRACER_UTILS_LOG_INFO("Clearing shared index store: %s", - shared_index_path.c_str()); - fs::remove_all(shared_index_path); + // 
Files to process: needs_checkpoint (index + aggregate) + + // needs_aggregation + const std::size_t num_needing_index = scan_result->needs_checkpoint.size(); + const std::size_t num_needing_agg_only = + force_rebuild ? scan_result->cached.size() + : scan_result->needs_aggregation.size(); + const std::size_t num_cached = + force_rebuild ? 0 : scan_result->total_cached(); + + std::vector files_to_process; + files_to_process.reserve(num_needing_index + num_needing_agg_only); + for (auto& item : scan_result->needs_checkpoint) { + files_to_process.push_back(std::move(item.file_path)); + } + if (force_rebuild) { + for (auto& item : scan_result->cached) { + files_to_process.push_back(std::move(item.file_path)); + } + } else { + for (auto& item : scan_result->needs_aggregation) { + files_to_process.push_back(std::move(item.file_path)); } } - // Streaming aggregation: file producers -> chunk workers -> merger - auto streaming_task = make_task( - [&](CoroScope& ctx) -> coro::CoroTask { - auto chunk_chan = coro::make_channel(0); - auto result_chan = coro::make_channel(2); - - co_await ctx.scope([&](CoroScope& scope) -> coro::CoroTask { - // File producers: one per input file - for (const auto& file_path : input_files) { - auto* global_chunk_idx_ptr = &global_chunk_idx; - scope.spawn([file_path, ch = chunk_chan->producer(), - index_dir, checkpoint_size, force_rebuild, - agg_config, query, chunk_size_mb, - batch_size_mb, global_chunk_idx_ptr]( - CoroScope& /*fctx*/) mutable - -> coro::CoroTask { - [[maybe_unused]] auto producer_guard = ch.guard(); - // Build index - std::string index_path = - composites::dft::internal::determine_index_path( - file_path, index_dir); - auto idx_input = - indexer::IndexBuildConfig::for_file(file_path) - .with_checkpoint_size(checkpoint_size) - .with_force_rebuild(false) - .with_index_dir(index_dir); - co_await indexer::IndexBuilderUtility{}.process( - idx_input); - - // Collect metadata - auto meta_input = - 
composites::dft::MetadataCollectorUtilityInput:: - from_file(file_path) - .with_checkpoint_size(checkpoint_size) - .with_force_rebuild(false) - .with_index(index_path); - auto metadata = - co_await composites::dft::MetadataCollectorUtility{} - .process(meta_input); - - if (!metadata.success) { - DFTRACER_UTILS_LOG_WARN("Skipping file: %s", - file_path.c_str()); - co_return; - } - - // Create chunks for this file - FileChunkMapperUtility file_mapper; - auto mapper_input = - FileChunkMapperInput::from_metadata(metadata) - .with_config(agg_config) - .with_checkpoint_size(checkpoint_size) - .with_target_chunk_size(chunk_size_mb) - .with_batch_size(batch_size_mb * 1024 * 1024); - mapper_input.query = query; - auto file_chunks = - co_await file_mapper.process(mapper_input); - - int start_idx = global_chunk_idx_ptr->fetch_add( - static_cast(file_chunks.size())); - for (int i = 0; - i < static_cast(file_chunks.size()); ++i) { - file_chunks[i].chunk_index = start_idx + i; - } + DFTRACER_UTILS_LOG_INFO( + "Files to process: %zu (%zu need indexing, %zu need aggregation only, " + "%zu cached)", + files_to_process.size(), num_needing_index, num_needing_agg_only, + num_cached); - for (auto& chunk : file_chunks) { - if (!co_await ch.send(std::move(chunk))) { - co_return; + bool write_success = false; + std::size_t total_keys = 0; + std::atomic perfetto_keys_written{0}; + + auto main_task = make_task( + [&](CoroScope& scope) -> coro::CoroTask { + if (!files_to_process.empty()) { + { + ScopedTimer _t(stages, "index_and_aggregate"); + auto batch_result = co_await batch_index_and_aggregate( + &scope, files_to_process, index_dir, checkpoint_size, + force_rebuild, executor_threads, agg_config, agg_db, + config_hash); + + { + ScopedTimer _vd(stages, "visitor_drain"); + for (auto& file_visitors : + batch_result.extra_visitors) { + for (auto& visitor : file_visitors) { + auto* agg_visitor = + dynamic_cast( + visitor.get()); + if (agg_visitor) { + for (const auto& k : + 
agg_visitor->observed_extra_keys()) + merger->add_observed_extra_key(k); + for (const auto& m : + agg_visitor->observed_custom_metrics()) + merger->add_observed_custom_metric(m); + auto output = agg_visitor->take_output(); + merger->merge_chunk(std::move(output)); + } } + file_visitors.clear(); } - - co_return; - }); + } } - // Chunk workers: parallel aggregation - for (std::size_t w = 0; w < executor_threads; ++w) { - (void)w; - scope.spawn( - [chunk_chan, rp = result_chan->producer(), result_chan]( - CoroScope& wctx) mutable -> coro::CoroTask { - [[maybe_unused]] auto producer_guard = rp.guard(); - while (auto input = - co_await wctx.receive(chunk_chan)) { - ChunkAggregatorUtility agg; - auto output = co_await agg.process(*input); - if (!co_await result_chan->send( - std::move(output))) { - co_return; - } - } - co_return; - }); + // Write tracking entries for processed files + { + ScopedTimer _wt(stages, "write_tracking"); + write_aggregation_tracking(agg_db.get(), agg_config, + files_to_process, + shared_index_path); } - - // Streaming merger: incremental merge - auto* merger_ptr = &merger; - scope.spawn([result_chan, merger_ptr]( - CoroScope& mctx) -> coro::CoroTask { - while (auto output = co_await mctx.receive(result_chan)) { - merger_ptr->merge_chunk(std::move(*output)); - } - co_return; - }); - - co_return; - }); - - co_return; - }, - "StreamingAggregate"); - - // Post-processing: finalize, resolve associations, write output - bool write_success = false; - EventAggregatorUtilityOutput agg_results; - - auto post_task = make_task( - [&](CoroScope& /*ctx*/) -> coro::CoroTask { - agg_results = merger.finalize(); - - // Resolve associations - AssociationResolverInput resolver_input; - resolver_input.trackers = std::move(agg_results.trackers); - resolver_input.aggregations = std::move(agg_results); - resolver_input.config = agg_config; - - AssociationResolverUtility resolver; - auto resolver_output = - co_await resolver.process(std::move(resolver_input)); - 
agg_results = std::move(resolver_output.aggregations); - - if (agg_results.aggregations.empty()) { - DFTRACER_UTILS_LOG_WARN("No aggregations to write!"); - co_return false; } - bool success = false; + ScopedTimer _pp(stages, "post_processing"); #ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC if (output_format == AggregationConfig::FORMAT_ARROW) { using namespace utilities::common::arrow; - DFTRACER_UTILS_LOG_INFO( - "Writing %zu aggregation keys to %s (Arrow IPC)...", - agg_results.aggregations.size(), output_file.c_str()); + std::unique_ptr global_tracker; + { + ScopedTimer _bt(stages, "build_global_tracker"); + global_tracker = merger->build_global_tracker(); + } + (void)global_tracker; + + EventAggregator::ObservedColumns obs; + { + ScopedTimer _oc(stages, "observed_columns"); + obs = merger->observed_columns(); + } + auto& global_extra_key_ids = obs.extra_key_ids; + auto& global_custom_metric_names = obs.custom_metric_names; IpcWriter ipc; - if (ipc.open(output_file) != 0) { + if (co_await ipc.open(output_file) != 0) { DFTRACER_UTILS_LOG_ERROR( "Failed to open Arrow IPC file: %s", output_file.c_str()); - co_return false; - } + } else { + ScopedTimer _aw(stages, "arrow_scan_write"); + constexpr std::size_t BATCH_ROWS = 10000; + AggregationBatch batch; + batch.entries.reserve(BATCH_ROWS); + batch.global_extra_key_ids = &global_extra_key_ids; + batch.global_custom_metric_names = + &global_custom_metric_names; + + std::vector pending_batches; + merger->scan([&](AggMapType, const AggregationKey& key, + AggregationMetrics& metrics) { + total_keys++; + batch.entries.emplace_back(key, std::move(metrics)); + if (batch.entries.size() >= BATCH_ROWS) { + pending_batches.push_back(batch.to_arrow()); + batch.entries.clear(); + } + return true; + }); + if (!batch.entries.empty()) { + pending_batches.push_back(batch.to_arrow()); + } - constexpr std::size_t BATCH_ROWS = 10000; - AggregationBatch batch; - batch.entries.reserve(BATCH_ROWS); - - bool arrow_write_failed = false; - for 
(auto& [key, metrics] : agg_results.aggregations) { - batch.entries.emplace_back(key, metrics); - if (batch.entries.size() >= BATCH_ROWS) { - auto arrow_batch = batch.to_arrow(); - if (ipc.write_batch(arrow_batch) != 0) { - DFTRACER_UTILS_LOG_ERROR( - "Arrow IPC write_batch failed"); - arrow_write_failed = true; + write_success = true; + for (auto& ab : pending_batches) { + if (co_await ipc.write_batch(ab) != 0) { + write_success = false; break; } - batch.entries.clear(); } - } - if (arrow_write_failed) { - ipc.close(); - co_return false; - } - if (!batch.entries.empty()) { - auto arrow_batch = batch.to_arrow(); - if (ipc.write_batch(arrow_batch) != 0) { - DFTRACER_UTILS_LOG_ERROR( - "Arrow IPC write_batch (final) failed"); - ipc.close(); - co_return false; + if (write_success) { + write_success = (co_await ipc.close() == 0); + } else { + co_await ipc.close(); } } - - success = (ipc.close() == 0); } else #endif { - // JSON / Perfetto output path - DFTRACER_UTILS_LOG_INFO( - "Writing %zu aggregation keys to %s%s...", - agg_results.aggregations.size(), output_file.c_str(), - compress_output ? 
" (compressed)" : ""); - - PerfettoTraceWriterUtility writer; - PerfettoTraceWriterInput writer_input{ - output_file, - std::move(resolver_output), - agg_config.compute_statistics, - agg_config.compute_percentiles, - agg_config.percentiles, - compress_output, - compression_level, - event_format}; - success = co_await writer.process(writer_input); - } - - if (success) { - DFTRACER_UTILS_LOG_INFO("Output written successfully to: %s", - output_file.c_str()); - if (fs::exists(output_file)) { - auto file_size = fs::file_size(output_file); - DFTRACER_UTILS_LOG_INFO("File exists, size: %zu bytes", - file_size); - } else { - DFTRACER_UTILS_LOG_ERROR( - "File does not exist after write!"); - success = false; + PerfettoTraceWriterInput streaming_input; + { + ScopedTimer _si(stages, "build_streaming_input"); + streaming_input = build_streaming_input( + merger.get(), &agg_config, &output_file, + compress_output, compression_level, event_format); + streaming_input.keys_written = &perfetto_keys_written; + streaming_input.merge_on_sharded = true; } - } else { - DFTRACER_UTILS_LOG_ERROR("Failed to write output file"); + { + ScopedTimer _pw(stages, "perfetto_write"); + PerfettoTraceWriterUtility writer; + write_success = co_await scope.spawn( + writer, std::move(streaming_input)); + } + total_keys = perfetto_keys_written.load(); } - - write_success = success; - co_return success; }, - "PostProcess"); + "AggregatorMain"); - post_task->depends_on(streaming_task); - pipeline.set_source(streaming_task); - pipeline.set_destination(post_task); - pipeline.execute(); + pipeline.set_source(main_task); + { + ScopedTimer _t(stages, "pipeline_execute"); + pipeline.execute(); + } + + { + ScopedTimer _t(stages, "close_rocksdb"); + merger.reset(); + agg_db.reset(); + } - auto end_time = std::chrono::high_resolution_clock::now(); - std::chrono::duration duration = end_time - start_time; + overall.stop(); + double duration_ms = static_cast(overall.elapsed()) / 1e6; std::printf("\n"); 
std::printf("==========================================\n"); std::printf("Aggregation Results\n"); std::printf("==========================================\n"); - std::printf(" Execution time: %.2f seconds\n", duration.count() / 1000.0); - std::printf(" Files processed: %zu\n", agg_results.total_files_processed); - std::printf(" Bytes processed: %.2f MB\n", - static_cast(agg_results.total_bytes_processed) / - (1024.0 * 1024.0)); - std::printf(" Events processed: %zu\n", - agg_results.total_events_processed); - std::printf(" Unique aggregation keys: %zu\n", - agg_results.aggregations.size()); - std::printf(" Throughput: %.2f MB/s, %.2f events/s\n", - (static_cast(agg_results.total_bytes_processed) / - (1024.0 * 1024.0)) / - (duration.count() / 1000.0), - static_cast(agg_results.total_events_processed) / - (duration.count() / 1000.0)); + std::printf(" Execution time: %.2f seconds\n", duration_ms / 1000.0); + std::printf(" Files: %zu total, %zu processed, %zu cached\n", + input_files.size(), files_to_process.size(), num_cached); + std::printf(" Unique aggregation keys: %zu\n", total_keys); std::printf(" Output file: %s\n", output_file.c_str()); std::printf(" Write status: %s\n", write_success ? "SUCCESS" : "FAILED"); std::printf("==========================================\n"); - AggregatorSummaryUtility summary_writer; - summary_writer.process(agg_results); + if (stages) stages->print_stages(); - if (!temp_index_dir.empty() && fs::exists(temp_index_dir)) { - DFTRACER_UTILS_LOG_INFO("Cleaning up temporary index directory: %s", - temp_index_dir.c_str()); - fs::remove_all(temp_index_dir); - } - - co_return agg_results.success&& write_success ? 0 : 1; + co_return write_success ? 
0 : 1; } int main(int argc, char** argv) { DFTRACER_UTILS_LOGGER_INIT(); - auto default_checkpoint_size_str = - std::to_string(indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE) + - " B (" + - std::to_string(indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE / - (1024 * 1024)) + - " MB)"; - argparse::ArgumentParser program("dftracer_aggregator", DFTRACER_UTILS_PACKAGE_VERSION); program.add_description( "Aggregate DFTracer events into time-series counters using streaming " "coroutine pipeline with minimal memory footprint"); - program.add_argument("-d", "--directory") - .help("Input directory containing .pfw or .pfw.gz files") - .default_value("."); - - program.add_argument("-o", "--output") - .help("Output file path for aggregated counters") - .default_value("aggregated_output.json"); - - program.add_argument("-t", "--time-interval") - .help("Time interval in milliseconds for bucketing (default: 5000)") - .scan<'g', double>() - .default_value(5000.0); - - program.add_argument("-g", "--group-keys") - .help( - "Comma-separated extra group keys from args (e.g., " - "epoch,step,level)") - .default_value(""); - - program.add_argument("-m", "--metric-fields") - .help( - "Comma-separated custom metric fields from args (e.g., " - "iter_count,num_events)") - .default_value(""); - - program.add_argument("--query") - .help("Query DSL filter (e.g., 'cat == \"POSIX\" and dur > 1000')") - .default_value(""); - - program.add_argument("-f", "--force").help("Force index recreation").flag(); - - program.add_argument("--checkpoint-size") - .help("Checkpoint size for indexing in bytes (default: " + - default_checkpoint_size_str + ")") - .scan<'d', std::size_t>() - .default_value(static_cast( - indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE)); - - program.add_argument("--executor-threads") - .help( - "Number of executor threads for parallel processing (default: " - "number of CPU cores)") - .scan<'d', std::size_t>() - .default_value( - 
static_cast(dftracer_utils_hardware_concurrency())); - - program.add_argument("--index-dir") - .help("Directory to store index files (default: system temp directory)") - .default_value(""); - - program.add_argument("--compress") - .help("Compress output using gzip") - .default_value(false) - .implicit_value(true); - - program.add_argument("--compression-level") - .help("Gzip compression level (0-9, default: 6)") - .scan<'d', int>() - .default_value(6); - - program.add_argument("--boundary-events") - .help( - "Boundary event configuration: event_name:value_field:output_name " - "(e.g., \"epoch.block:iter_count:epoch\")") - .default_value(""); - - program.add_argument("--no-track-process-parents") - .help( - "Disable tracking of process parent relationships from fork/spawn") - .default_value(false) - .implicit_value(true); - - program.add_argument("--chunk-size") - .help("Target chunk size in MB for parallel processing (default: 4)") - .scan<'d', std::size_t>() - .default_value(static_cast(4)); - - program.add_argument("--read-batch-size") - .help( - "Batch read size in MB for stream processing (default: 4, higher = " - "faster but more memory)") - .scan<'d', std::size_t>() - .default_value(static_cast(4)); - - program.add_argument("--event-format") - .help( - "Perfetto event format: 'counter' (ph=C, point-in-time, default), " - "'async' (ph=b/e, async tracks with overlaps), " - "'regular' (ph=X, duration events with original TID)") - .default_value("counter"); - - program.add_argument("--compute-percentiles") - .help( - "Enable percentile/quantile computation using DDSketch (opt-in due " - "to memory overhead)") - .default_value(false) - .implicit_value(true); - - program.add_argument("--percentiles") - .help( - "Comma-separated percentiles to compute (e.g., " - "\"0.25,0.5,0.75,0.90\" for P25, P50, P75, P90)") - .default_value("0.25,0.5,0.75,0.90"); - - program.add_argument("--relative-accuracy") - .help( - "Relative accuracy for DDSketch percentile estimation " - 
"(default: 0.01 = 1%)") - .scan<'g', double>() - .default_value(0.01); - - program.add_argument("--format") - .help( - "Output format: 'json' (Perfetto JSON, default) or " - "'arrow' (Arrow IPC file, .arrows extension)") - .default_value("json"); - - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - DFTRACER_UTILS_LOG_ERROR("Error occurred: %s", err.what()); - std::fprintf(stderr, "%s\n", program.help().str().c_str()); - return 1; - } + AggregatorArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; - return run_aggregator(program).get(); + return run_aggregator(&cli).get(); } diff --git a/src/dftracer/utils/binaries/dftracer_aggregator_mpi.cpp b/src/dftracer/utils/binaries/dftracer_aggregator_mpi.cpp new file mode 100644 index 00000000..f482a80b --- /dev/null +++ b/src/dftracer/utils/binaries/dftracer_aggregator_mpi.cpp @@ -0,0 +1,1199 @@ +// MPI driver for the distributed-SST aggregator. +// +// Pipeline DAG: +// scan -> phase_a -> phase_b -> phase_c -> merge +// Each stage is its own task wired via depends_on(); MPI collectives +// between stages sit inside the task bodies. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common_cli.h" + +using namespace dftracer::utils; +using namespace dftracer::utils::utilities; +using dftracer::utils::utilities::composites::dft::aggregators:: + AGG_KEY_NUM_SHARDS; +using dftracer::utils::utilities::composites::dft::aggregators:: + AggregationConfig; +using dftracer::utils::utilities::composites::dft::aggregators:: + AggregationVisitor; +using dftracer::utils::utilities::composites::dft::aggregators:: + AssociationTracker; +using dftracer::utils::utilities::composites::dft::aggregators::EventAggregator; +using dftracer::utils::utilities::composites::dft::aggregators:: + PerfettoEventFormat; +using dftracer::utils::utilities::composites::dft::aggregators:: + PerfettoTraceWriterInput; +using dftracer::utils::utilities::composites::dft::aggregators:: + PerfettoTraceWriterUtility; +using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility; +using dftracer::utils::utilities::indexer::IndexBatchSink; +using dftracer::utils::utilities::indexer::IndexBuildBatchConfig; +using dftracer::utils::utilities::indexer::IndexDatabase; +using dftracer::utils::utilities::indexer::IndexDatabaseSstWriterContext; +using dftracer::utils::utilities::indexer::SstArtifactRegistry; +using dftracer::utils::utilities::indexer::internal:: + enumerate_gzip_member_candidates; +using dftracer::utils::utilities::indexer::internal::GzipMember; + +namespace { + +class AggregatorMpiArgParse : public cli::ArgParse { + public: + cli::DirectoryArgs directory{ + cli::DirMode::DEFAULT_DOT, + "Input directory containing .pfw or .pfw.gz files"}; + cli::PipelineArgs pipeline; + 
cli::IndexingArgs indexing; + + std::string output; + std::string staging_dir; + std::string shared_staging_dir; + double time_interval = 5000.0; + bool keep_staging = false; + + explicit AggregatorMpiArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + indexing.index_dir_help = + "Directory to store the final index (shared across ranks)"; + indexing.force_help = "Force index recreation"; + schema(directory, pipeline, indexing); + } + + protected: + void register_args() override { + parser() + .add_argument("-o", "--output") + .help("Output file path for aggregated counters (gzip JSON)") + .default_value("aggregated_output.json.gz"); + + parser() + .add_argument("--staging-dir") + .help( + "Per-rank SST staging root. Defaults to /_staging. " + "Each rank writes to /rank_.") + .default_value(""); + + parser() + .add_argument("--shared-staging") + .help( + "Shared-FS staging root. When set and different from " + "--staging-dir, each rank moves its SSTs + tracker.bin from " + "the (node-local) staging dir to /rank_ " + "before the coordinator ingest. 
Required for multi-node runs " + "where --staging-dir points at node-local NVMe.") + .default_value(""); + + parser() + .add_argument("-t", "--time-interval") + .help("Time interval in milliseconds for bucketing (default: 5000)") + .scan<'g', double>() + .default_value(5000.0); + + parser() + .add_argument("--keep-staging") + .help("Keep per-rank SST staging dirs after successful ingest") + .default_value(false) + .implicit_value(true); + } + + void post_parse() override { + output = parser().get("--output"); + staging_dir = parser().get("--staging-dir"); + shared_staging_dir = parser().get("--shared-staging"); + time_interval = parser().get("--time-interval"); + keep_staging = parser().get("--keep-staging"); + } +}; + +std::vector enumerate_inputs(const std::string& dir) { + std::vector files; + std::error_code ec; + for (const auto& entry : fs::directory_iterator(dir, ec)) { + if (ec) break; + if (!entry.is_regular_file(ec)) continue; + const auto& p = entry.path(); + const auto ext = p.extension().string(); + if (ext == ".pfw" || ext == ".gz") files.push_back(p.string()); + } + std::sort(files.begin(), files.end()); + return files; +} + +std::vector pack_paths(const std::vector& paths) { + std::uint64_t total = sizeof(std::uint64_t); + for (const auto& p : paths) total += sizeof(std::uint64_t) + p.size(); + std::vector buf; + buf.reserve(total); + auto u64 = [&](std::uint64_t v) { + buf.insert(buf.end(), reinterpret_cast(&v), + reinterpret_cast(&v) + sizeof(v)); + }; + u64(paths.size()); + for (const auto& p : paths) { + u64(p.size()); + buf.insert(buf.end(), p.begin(), p.end()); + } + return buf; +} + +std::vector unpack_paths(const std::vector& buf) { + std::vector paths; + if (buf.size() < sizeof(std::uint64_t)) return paths; + const char* p = buf.data(); + const char* end = buf.data() + buf.size(); + auto read_u64 = [&](std::uint64_t& out) -> bool { + if (end - p < static_cast(sizeof(out))) return false; + std::memcpy(&out, p, sizeof(out)); + p += 
sizeof(out); + return true; + }; + std::uint64_t n = 0; + if (!read_u64(n)) return paths; + paths.reserve(n); + for (std::uint64_t i = 0; i < n; ++i) { + std::uint64_t len = 0; + if (!read_u64(len)) break; + if (end - p < static_cast(len)) break; + paths.emplace_back(p, p + len); + p += len; + } + return paths; +} + +void pack_artifacts(const IndexDatabaseSstWriterContext::Artifacts& a, + std::vector& buf) { + auto append_u64 = [&](std::uint64_t v) { + buf.insert(buf.end(), reinterpret_cast(&v), + reinterpret_cast(&v) + sizeof(v)); + }; + auto append_opt = [&](const std::optional& s) { + if (s) { + buf.push_back(1); + append_u64(s->size()); + buf.insert(buf.end(), s->begin(), s->end()); + } else { + buf.push_back(0); + } + }; + append_opt(a.metadata_sst); + append_opt(a.checkpoints_sst); + append_opt(a.manifest_sst); + append_opt(a.chunk_bloom_sst); + append_opt(a.file_bloom_sst); + append_opt(a.chunk_stats_sst); + append_opt(a.chunk_dim_stats_sst); + append_opt(a.dimensions_sst); + append_opt(a.file_scalar_stats_sst); + append_opt(a.file_cat_counts_sst); + append_opt(a.file_pid_tid_counts_sst); + append_opt(a.file_name_counts_sst); + append_opt(a.name_dictionary_sst); + append_opt(a.name_file_postings_sst); + append_opt(a.name_chunk_postings_sst); + append_opt(a.hash_tables_sst); + append_opt(a.aggregation_sst); + append_opt(a.system_metrics_sst); +} + +bool unpack_artifacts(const char*& p, const char* end, + IndexDatabaseSstWriterContext::Artifacts& a) { + auto read_u64 = [&](std::uint64_t& out) -> bool { + if (end - p < static_cast(sizeof(out))) return false; + std::memcpy(&out, p, sizeof(out)); + p += sizeof(out); + return true; + }; + auto read_opt = [&](std::optional& s) -> bool { + if (p == end) return false; + const char flag = *p++; + if (!flag) return true; + std::uint64_t len = 0; + if (!read_u64(len)) return false; + if (end - p < static_cast(len)) return false; + s = std::string(p, p + len); + p += len; + return true; + }; + return 
read_opt(a.metadata_sst) && read_opt(a.checkpoints_sst) && + read_opt(a.manifest_sst) && read_opt(a.chunk_bloom_sst) && + read_opt(a.file_bloom_sst) && read_opt(a.chunk_stats_sst) && + read_opt(a.chunk_dim_stats_sst) && read_opt(a.dimensions_sst) && + read_opt(a.file_scalar_stats_sst) && + read_opt(a.file_cat_counts_sst) && + read_opt(a.file_pid_tid_counts_sst) && + read_opt(a.file_name_counts_sst) && + read_opt(a.name_dictionary_sst) && + read_opt(a.name_file_postings_sst) && + read_opt(a.name_chunk_postings_sst) && read_opt(a.hash_tables_sst) && + read_opt(a.aggregation_sst) && read_opt(a.system_metrics_sst); +} + +std::vector pack_artifact_list( + const std::vector& main_artifacts, + const std::vector& + agg_artifacts) { + std::vector buf; + std::uint64_t count = 0; + for (const auto& a : main_artifacts) + if (!a.empty()) ++count; + for (const auto& a : agg_artifacts) + if (!a.empty()) ++count; + buf.insert(buf.end(), reinterpret_cast(&count), + reinterpret_cast(&count) + sizeof(count)); + for (const auto& a : main_artifacts) + if (!a.empty()) pack_artifacts(a, buf); + for (const auto& a : agg_artifacts) + if (!a.empty()) pack_artifacts(a, buf); + return buf; +} + +bool append_artifact_list(const char* p, const char* end, + SstArtifactRegistry& registry) { + if (end - p < static_cast(sizeof(std::uint64_t))) + return false; + std::uint64_t count = 0; + std::memcpy(&count, p, sizeof(count)); + p += sizeof(count); + for (std::uint64_t i = 0; i < count; ++i) { + IndexDatabaseSstWriterContext::Artifacts a; + if (!unpack_artifacts(p, end, a)) return false; + registry.append(std::move(a)); + } + return true; +} + +coro::CoroTask scan_one_file(const std::string& path, + std::vector& out) { + int fd = ::open(path.c_str(), O_RDONLY); + if (fd < 0) co_return; + struct stat st; + if (::fstat(fd, &st) == 0 && st.st_size >= 18) { + co_await enumerate_gzip_member_candidates( + fd, static_cast(st.st_size), out); + } + ::close(fd); +} + +struct WorkUnit { + std::size_t file_idx; 
+ std::size_t member_begin; + std::size_t member_end; + std::uint64_t c_size; +}; + +// Partition members into slices of ~target bytes. Deterministic across +// ranks so every rank computes identical assignments from the same map. +std::vector build_work_units( + const std::vector>& per_file_members, + std::uint64_t target_c_size) { + std::vector units; + for (std::size_t fi = 0; fi < per_file_members.size(); ++fi) { + const auto& members = per_file_members[fi]; + if (members.empty()) continue; + std::size_t begin = 0; + std::uint64_t accum = 0; + for (std::size_t i = 0; i < members.size(); ++i) { + accum += members[i].c_size; + const bool is_last = (i + 1 == members.size()); + if ((target_c_size > 0 && accum >= target_c_size) || is_last) { + units.push_back({fi, begin, i + 1, accum}); + begin = i + 1; + accum = 0; + } + } + } + return units; +} + +std::vector lpt_assign_units(const std::vector& units, + int num_ranks) { + std::vector order(units.size()); + for (std::size_t i = 0; i < order.size(); ++i) order[i] = i; + std::sort(order.begin(), order.end(), [&](std::size_t a, std::size_t b) { + if (units[a].c_size != units[b].c_size) + return units[a].c_size > units[b].c_size; + if (units[a].file_idx != units[b].file_idx) + return units[a].file_idx < units[b].file_idx; + return units[a].member_begin < units[b].member_begin; + }); + std::vector loads(num_ranks, 0); + std::vector owner(units.size(), 0); + for (std::size_t ord : order) { + int best = 0; + for (int r = 1; r < num_ranks; ++r) + if (loads[r] < loads[best]) best = r; + owner[ord] = best; + loads[best] += std::max(units[ord].c_size, 1); + } + return owner; +} + +// Shared state threaded through the DAG via reference capture. 
+struct RunCtx { + int rank = 0; + int size = 1; + const AggregatorMpiArgParse* cli = nullptr; + + std::string index_dir; + std::string staging_root; + std::string shared_staging_root; + std::string final_output; + std::string perfetto_shards_dir; + std::string my_shard_output; + + std::vector all_files; + std::uint64_t nfiles = 0; + std::vector> member_map; + + std::vector my_files; + std::vector my_file_ids; + std::vector my_slices; + + std::vector main_artifacts; + std::vector agg_artifacts; + + bool failed = false; + double scan_ms = 0.0; + double phase_a_ms = 0.0; + double phase_b_ms = 0.0; + double phase_c_ms = 0.0; +}; + +// ---- scan task: co-operative gzip member pre-scan + LPT assignment ---- +coro::CoroTask task_scan(RunCtx& ctx, CoroScope& scope) { + if (ctx.failed) co_return; + const auto t0 = std::chrono::steady_clock::now(); + + std::vector my_scan_indices; + for (std::uint64_t i = 0; i < ctx.nfiles; ++i) { + if (static_cast(i % static_cast(ctx.size)) == + ctx.rank) { + my_scan_indices.push_back(i); + } + } + std::vector> my_scans(my_scan_indices.size()); + + co_await scope.scope([&](CoroScope& child) -> coro::CoroTask { + for (std::size_t si = 0; si < my_scan_indices.size(); ++si) { + const std::string& path = ctx.all_files[my_scan_indices[si]]; + auto& out = my_scans[si]; + child.spawn([path, &out](CoroScope&) -> coro::CoroTask { + co_await scan_one_file(path, out); + }); + } + co_return; + }); + + std::vector my_packed; + auto u64 = [&](std::uint64_t v) { + my_packed.insert(my_packed.end(), reinterpret_cast(&v), + reinterpret_cast(&v) + sizeof(v)); + }; + u64(my_scan_indices.size()); + for (std::size_t si = 0; si < my_scan_indices.size(); ++si) { + u64(static_cast(my_scan_indices[si])); + u64(static_cast(my_scans[si].size())); + for (const auto& m : my_scans[si]) { + u64(m.c_offset); + u64(m.c_size); + } + } + const int my_bytes = static_cast(my_packed.size()); + std::vector rank_bytes(ctx.size, 0); + MPI_Allgather(&my_bytes, 1, MPI_INT, 
rank_bytes.data(), 1, MPI_INT, + MPI_COMM_WORLD); + std::vector displs(ctx.size, 0); + int total = 0; + for (int r = 0; r < ctx.size; ++r) { + displs[r] = total; + total += rank_bytes[r]; + } + std::vector gathered(total); + MPI_Allgatherv(my_packed.data(), my_bytes, MPI_CHAR, gathered.data(), + rank_bytes.data(), displs.data(), MPI_CHAR, MPI_COMM_WORLD); + + ctx.member_map.assign(ctx.nfiles, {}); + for (int r = 0; r < ctx.size; ++r) { + const char* p = gathered.data() + displs[r]; + const char* end = p + rank_bytes[r]; + auto read_u64 = [&](std::uint64_t& v) -> bool { + if (end - p < static_cast(sizeof(v))) return false; + std::memcpy(&v, p, sizeof(v)); + p += sizeof(v); + return true; + }; + std::uint64_t count = 0; + if (!read_u64(count)) continue; + for (std::uint64_t k = 0; k < count; ++k) { + std::uint64_t fi = 0, mc = 0; + if (!read_u64(fi) || !read_u64(mc)) break; + if (fi >= ctx.nfiles) break; + ctx.member_map[fi].resize(mc); + for (std::uint64_t j = 0; j < mc; ++j) { + if (!read_u64(ctx.member_map[fi][j].c_offset)) break; + if (!read_u64(ctx.member_map[fi][j].c_size)) break; + } + } + } + + // Fallback for plain .pfw / unreadable / non-dftracer gzip. 
+ std::uint64_t total_c = 0; + for (std::uint64_t i = 0; i < ctx.nfiles; ++i) { + if (ctx.member_map[i].empty()) { + std::error_code ec; + std::uint64_t sz = fs::file_size(ctx.all_files[i], ec); + if (ec) sz = 0; + ctx.member_map[i].push_back({0, sz}); + } + for (const auto& m : ctx.member_map[i]) total_c += m.c_size; + } + + const std::uint64_t target_per_rank = + (total_c + static_cast(ctx.size) - 1) / + std::max(static_cast(ctx.size), 1); + const auto units = build_work_units(ctx.member_map, target_per_rank); + const auto owner = lpt_assign_units(units, ctx.size); + + for (std::size_t ui = 0; ui < units.size(); ++ui) { + if (owner[ui] != ctx.rank) continue; + const auto& u = units[ui]; + ctx.my_files.push_back(ctx.all_files[u.file_idx]); + ctx.my_file_ids.push_back(static_cast(u.file_idx + 1)); + IndexBuildBatchConfig::FileSlice s; + s.members = &ctx.member_map[u.file_idx]; + s.member_begin = u.member_begin; + s.member_end = u.member_end; + // Disambiguate (file_id, checkpoint_idx) across slices. + constexpr std::uint64_t CKPT_STRIDE = 1u << 20; + s.checkpoint_idx_base = + static_cast(u.member_begin) * CKPT_STRIDE; + // Only the first slice of a file persists file-scoped CFs. 
+ s.skip_file_scoped_writes = (u.member_begin != 0); + ctx.my_slices.push_back(s); + } + + ctx.scan_ms = std::chrono::duration( + std::chrono::steady_clock::now() - t0) + .count(); + if (ctx.rank == 0) { + std::printf( + "[rank 0] pre-scan %.2f ms: files=%llu work_units=%zu " + "target_per_rank=%llu bytes total=%llu bytes\n", + ctx.scan_ms, static_cast(ctx.nfiles), + units.size(), static_cast(target_per_rank), + static_cast(total_c)); + } + std::printf("[rank %d/%d] files=%zu (work_units)\n", ctx.rank, ctx.size, + ctx.my_files.size()); + std::fflush(stdout); + co_return; +} + +// ---- phase_a task: distributed-SST index + aggregate build ---- +coro::CoroTask task_phase_a(RunCtx& ctx, CoroScope& scope) { + if (ctx.failed) co_return; + + const auto t0 = std::chrono::steady_clock::now(); + bool ok = true; + + if (!ctx.my_files.empty()) { + const std::string rank_staging = + (fs::path(ctx.staging_root) / ("rank_" + std::to_string(ctx.rank))) + .string(); + std::error_code ec; + fs::create_directories(rank_staging, ec); + + auto agg_config = std::make_shared(); + agg_config->time_interval_us = + static_cast(ctx.cli->time_interval * 1000.0); + agg_config->compute_statistics = true; + agg_config->track_process_parents = true; + agg_config->track_default_args = true; + + // Atomic: write_phase spawns N concurrent write workers; a plain + // size_t would let two workers share an idx and clobber each + // other's SSTs ("Bad table magic number" at ingest). 
+ auto batch_counter = std::make_shared>(0); + struct SharedArtifacts { + std::mutex mu; + std::vector list; + }; + auto artifacts_shared = std::make_shared(); + + auto batch_config = std::make_shared(); + batch_config->file_paths = ctx.my_files; + batch_config->preassigned_file_ids = ctx.my_file_ids; + batch_config->file_slices = ctx.my_slices; + batch_config->index_dir = ctx.index_dir; + batch_config->checkpoint_size = ctx.cli->indexing.checkpoint_size; + batch_config->force_rebuild = ctx.cli->indexing.force; + batch_config->build_manifest = false; + batch_config->parallelism = ctx.cli->pipeline.executor_threads; + batch_config->rebuild_root_summaries = false; + + const std::string batch_id = "r" + std::to_string(ctx.rank); + batch_config->dft_visitor_factory = + [rank_staging, batch_id, agg_config](const std::string& file_path) + -> std::vector> { + std::vector> v; + v.push_back(std::make_unique( + rank_staging, batch_id + "_agg", 0, *agg_config, file_path)); + return v; + }; + batch_config->sink_factory = + [rank_staging, batch_id, + batch_counter]() -> std::unique_ptr { + const std::size_t idx = + batch_counter->fetch_add(1, std::memory_order_relaxed); + return std::make_unique( + rank_staging, batch_id + "_" + std::to_string(idx)); + }; + batch_config->sink_commit = [artifacts_shared](IndexBatchSink& sink) { + auto& sst = static_cast(sink); + auto a = sst.commit(); + std::lock_guard lock(artifacts_shared->mu); + if (!a.empty()) artifacts_shared->list.push_back(std::move(a)); + }; + + auto batch_result = + co_await IndexBatchBuilderUtility::process(&scope, batch_config); + + if (batch_result.failed > 0) { + for (const auto& r : batch_result.results) { + if (!r.success) { + std::fprintf( + stderr, "[rank %d] build failed: %s (file=%s)\n", + ctx.rank, r.error_message.c_str(), r.file_path.c_str()); + ok = false; + break; + } + } + } + + if (ok) { + { + std::lock_guard lock(artifacts_shared->mu); + ctx.main_artifacts = std::move(artifacts_shared->list); + } + + 
std::vector seen; + for (auto& file_visitors : batch_result.extra_visitors) { + for (auto& v : file_visitors) { + auto* agg = dynamic_cast(v.get()); + if (!agg) continue; + if (std::find(seen.begin(), seen.end(), agg) != seen.end()) + continue; + seen.push_back(agg); + for (auto& a : agg->aggregation_artifacts()) { + if (!a.empty()) + ctx.agg_artifacts.push_back(std::move(a)); + } + } + } + + AssociationTracker combined; + for (auto* agg : seen) { + auto out = agg->take_output(); + if (out.local_tracker) combined.merge(*out.local_tracker); + } + combined.finalize(); + const std::string serialized = combined.serialize(); + const std::string tracker_local = + (fs::path(rank_staging) / "tracker.bin").string(); + FILE* f = std::fopen(tracker_local.c_str(), "wb"); + if (f) { + std::fwrite(serialized.data(), 1, serialized.size(), f); + std::fclose(f); + } + + // Move per-rank artifacts from node-local staging to shared + // staging so rank 0 can ingest them from a path visible on + // every node. No-op when the two roots are the same. 
+ if (ctx.shared_staging_root != ctx.staging_root) { + const std::string rank_shared = + (fs::path(ctx.shared_staging_root) / + ("rank_" + std::to_string(ctx.rank))) + .string(); + std::error_code mec; + fs::create_directories(rank_shared, mec); + try { + for (std::size_t i = 0; i < ctx.main_artifacts.size(); + ++i) { + const std::string sub = (fs::path(rank_shared) / + ("main_" + std::to_string(i))) + .string(); + ctx.main_artifacts[i] = + std::move(ctx.main_artifacts[i]).move_to(sub); + } + for (std::size_t i = 0; i < ctx.agg_artifacts.size(); ++i) { + const std::string sub = (fs::path(rank_shared) / + ("agg_" + std::to_string(i))) + .string(); + ctx.agg_artifacts[i] = + std::move(ctx.agg_artifacts[i]).move_to(sub); + } + } catch (const std::exception& e) { + std::fprintf(stderr, + "[rank %d] failed to relocate SSTs to shared " + "staging: %s\n", + ctx.rank, e.what()); + ok = false; + } + if (ok && !serialized.empty()) { + const std::string tracker_shared = + (fs::path(rank_shared) / "tracker.bin").string(); + std::error_code tec; + fs::rename(tracker_local, tracker_shared, tec); + if (tec) { + fs::copy_file(tracker_local, tracker_shared, + fs::copy_options::overwrite_existing, + tec); + if (!tec) fs::remove(tracker_local, tec); + } + } + } + } + } + + int ok_int = ok ? 
1 : 0, global = 0; + MPI_Allreduce(&ok_int, &global, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); + if (!global) { + if (ctx.rank == 0) + std::fprintf(stderr, "Phase A failed on some rank\n"); + ctx.failed = true; + co_return; + } + + ctx.phase_a_ms = std::chrono::duration( + std::chrono::steady_clock::now() - t0) + .count(); + std::printf( + "[rank %d/%d] Phase A done in %.2f ms: main_artifacts=%zu " + "agg_flushes=%zu\n", + ctx.rank, ctx.size, ctx.phase_a_ms, ctx.main_artifacts.size(), + ctx.agg_artifacts.size()); + std::fflush(stdout); + co_return; +} + +// ---- phase_b task: Gatherv + rank 0 bulk_ingest + tracker merge ---- +coro::CoroTask task_phase_b(RunCtx& ctx) { + if (ctx.failed) co_return; + + const auto t0 = std::chrono::steady_clock::now(); + const std::vector packed = + pack_artifact_list(ctx.main_artifacts, ctx.agg_artifacts); + const int my_bytes = static_cast(packed.size()); + + std::vector rank_bytes(ctx.size, 0); + MPI_Gather(&my_bytes, 1, MPI_INT, rank_bytes.data(), 1, MPI_INT, 0, + MPI_COMM_WORLD); + + std::vector displs(ctx.size, 0); + std::vector gathered; + if (ctx.rank == 0) { + int total = 0; + for (int r = 0; r < ctx.size; ++r) { + displs[r] = total; + total += rank_bytes[r]; + } + gathered.resize(total); + } + MPI_Gatherv(packed.data(), my_bytes, MPI_CHAR, + ctx.rank == 0 ? 
gathered.data() : nullptr, rank_bytes.data(), + displs.data(), MPI_CHAR, 0, MPI_COMM_WORLD); + + int ok = 1; + if (ctx.rank == 0) { + try { + SstArtifactRegistry registry; + for (int r = 0; r < ctx.size; ++r) { + if (rank_bytes[r] == 0) continue; + const char* p = gathered.data() + displs[r]; + if (!append_artifact_list(p, p + rank_bytes[r], registry)) { + std::fprintf( + stderr, + "[rank 0] failed to parse artifacts from rank %d\n", r); + ok = 0; + break; + } + } + if (ok) { + IndexDatabase db(ctx.index_dir); + db.bulk_ingest(registry, {}); + db.rebuild_root_summaries(); + + db.write_agg_global_config(static_cast( + ctx.cli->time_interval * 1000.0)); + std::vector all_file_ids; + all_file_ids.reserve(ctx.nfiles); + for (std::uint64_t i = 1; i <= ctx.nfiles; ++i) + all_file_ids.push_back(static_cast(i)); + db.write_agg_file_markers(all_file_ids); + + AssociationTracker unified; + for (int r = 0; r < ctx.size; ++r) { + char suffix[32]; + std::snprintf(suffix, sizeof(suffix), + "/rank_%d/tracker.bin", r); + std::ifstream f(ctx.shared_staging_root + suffix, + std::ios::binary); + if (!f) continue; + std::string bytes((std::istreambuf_iterator(f)), {}); + if (!bytes.empty()) + unified.merge(AssociationTracker::deserialize(bytes)); + } + unified.finalize(); + constexpr std::string_view TRACKER_KEY = "__tracker__"; + db.db()->put(TRACKER_KEY, unified.serialize(), + dftracer::utils::rocksdb::cf::AGGREGATION); + + // Diagnostic: count aggregation CF keys right after ingest, + // split by shard-prefixed data vs special 0xFF-prefixed + // keys (global config / file markers / tracker). If the + // first bucket is 0 after ingest, the aggregation SSTs + // never actually landed in the CF. 
+ { + std::size_t shard_keys = 0, special_keys = 0; + auto it = db.db()->new_iterator( + dftracer::utils::rocksdb::cf::AGGREGATION); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + auto k = it->key(); + if (k.size() >= 2 && + static_cast(k[0]) < 0xFF) { + shard_keys++; + } else { + special_keys++; + } + } + std::printf( + "[rank 0] AGGREGATION CF after ingest: shard_keys=%zu " + "special_keys=%zu\n", + shard_keys, special_keys); + std::fflush(stdout); + } + } + } catch (const std::exception& e) { + std::fprintf(stderr, "[rank 0] bulk_ingest failed: %s\n", e.what()); + ok = 0; + } + } + MPI_Bcast(&ok, 1, MPI_INT, 0, MPI_COMM_WORLD); + + if (ctx.rank == 0 && !ctx.cli->keep_staging) { + std::error_code ec; + fs::remove_all(ctx.shared_staging_root, ec); + } + // Each rank drops its own node-local staging dir; rank 0's shared + // cleanup above only covers the shared-FS side. + if (!ctx.cli->keep_staging && ctx.shared_staging_root != ctx.staging_root) { + const std::string rank_local = + (fs::path(ctx.staging_root) / ("rank_" + std::to_string(ctx.rank))) + .string(); + std::error_code ec; + fs::remove_all(rank_local, ec); + } + + if (!ok) { + ctx.failed = true; + co_return; + } + ctx.phase_b_ms = std::chrono::duration( + std::chrono::steady_clock::now() - t0) + .count(); + if (ctx.rank == 0) { + std::printf("[rank 0] Phase B done in %.2f ms (ingest %d ranks)\n", + ctx.phase_b_ms, ctx.size); + std::fflush(stdout); + } + co_return; +} + +// ---- phase_c task: per-rank shard-prefix perfetto write ---- +coro::CoroTask task_phase_c(RunCtx& ctx, CoroScope& scope) { + if (ctx.failed) co_return; + + const auto t0 = std::chrono::steady_clock::now(); + const std::string actual_index_path = + (fs::path(ctx.index_dir) / ".dftindex").string(); + const std::uint16_t shards_total = AGG_KEY_NUM_SHARDS; + const std::uint16_t my_shard_begin = + static_cast(static_cast(shards_total) * + static_cast(ctx.rank) / + static_cast(ctx.size)); + const std::uint16_t my_shard_end = + 
(ctx.rank + 1 == ctx.size) + ? shards_total + : static_cast( + static_cast(shards_total) * + static_cast(ctx.rank + 1) / + static_cast(ctx.size)); + + if (ctx.rank == 0) { + std::error_code ec; + fs::create_directories(ctx.perfetto_shards_dir, ec); + } + MPI_Barrier(MPI_COMM_WORLD); + char suffix[32]; + std::snprintf(suffix, sizeof(suffix), "/rank_%05d.json.gz", ctx.rank); + ctx.my_shard_output = ctx.perfetto_shards_dir + suffix; + + AggregationConfig phase_c_config; + phase_c_config.time_interval_us = + static_cast(ctx.cli->time_interval * 1000.0); + phase_c_config.compute_statistics = true; + phase_c_config.track_process_parents = true; + phase_c_config.track_default_args = true; + + auto agg_db = + EventAggregator::open_read_only_with_merge_operator(actual_index_path); + composites::dft::aggregators::load_intern_dictionary(*agg_db); + + EventAggregator aggregator(agg_db, 0); + + PerfettoTraceWriterInput input; + input.output_path = ctx.my_shard_output; + input.aggregator = &aggregator; + input.agg_config = &phase_c_config; + auto tracker = aggregator.build_global_tracker(); + input.tracker = tracker.get(); + input.root_pids = tracker->get_root_pids(); + input.owned_tracker = std::move(tracker); + input.compute_statistics = true; + input.compute_percentiles = false; + input.compress = true; + input.compression_level = 6; + input.format = PerfettoEventFormat::COUNTER; + input.merge_on_sharded = true; + input.shard_begin = my_shard_begin; + input.shard_end = my_shard_end; + input.emit_header = (ctx.rank == 0); + input.emit_footer = (ctx.rank == ctx.size - 1); + + PerfettoTraceWriterUtility writer; + const bool ok = co_await scope.spawn(writer, std::move(input)); + + int ok_int = ok ? 
1 : 0, global = 0; + MPI_Allreduce(&ok_int, &global, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); + if (!global) { + if (ctx.rank == 0) + std::fprintf(stderr, "Phase C failed on some rank\n"); + ctx.failed = true; + co_return; + } + + ctx.phase_c_ms = std::chrono::duration( + std::chrono::steady_clock::now() - t0) + .count(); + std::printf("[rank %d/%d] Phase C scan+write done in %.2f ms\n", ctx.rank, + ctx.size, ctx.phase_c_ms); + std::fflush(stdout); + co_return; +} + +// ---- merge task: striped parallel pwrite (Lustre/SSD) or sharded-serial ---- +coro::CoroTask task_merge(RunCtx& ctx) { + if (ctx.failed) co_return; + const auto t0 = std::chrono::steady_clock::now(); + + auto layout = fileio::parallel::detect_layout(ctx.final_output); + const bool striped = layout.layout == fileio::parallel::FileLayout::STRIPED; + + std::uint64_t my_sz = 0; + { + std::error_code ec; + my_sz = fs::file_size(ctx.my_shard_output, ec); + if (ec) my_sz = 0; + } + std::vector all_sizes(ctx.size, 0); + MPI_Allgather(&my_sz, 1, MPI_UINT64_T, all_sizes.data(), 1, MPI_UINT64_T, + MPI_COMM_WORLD); + std::uint64_t my_offset = 0, total_bytes = 0; + for (int r = 0; r < ctx.size; ++r) { + if (r < ctx.rank) my_offset += all_sizes[r]; + total_bytes += all_sizes[r]; + } + + int ok = 1; + if (striped) { + if (ctx.rank == 0) { + int fd = ::open(ctx.final_output.c_str(), + O_CREAT | O_WRONLY | O_TRUNC, 0644); + if (fd < 0 || + ::ftruncate(fd, static_cast(total_bytes)) != 0) { + std::fprintf(stderr, "[rank 0] failed to create %s\n", + ctx.final_output.c_str()); + ok = 0; + } + if (fd >= 0) ::close(fd); + } + MPI_Bcast(&ok, 1, MPI_INT, 0, MPI_COMM_WORLD); + if (ok) { + int out_fd = ::open(ctx.final_output.c_str(), O_WRONLY); + int in_fd = ::open(ctx.my_shard_output.c_str(), O_RDONLY); + if (out_fd < 0 || in_fd < 0) { + ok = 0; + } else { + std::vector buf(1 << 20); + off_t out_pos = static_cast(my_offset); + while (true) { + ssize_t n = ::read(in_fd, buf.data(), buf.size()); + if (n == 0) break; + if (n < 0) 
{ + ok = 0; + break; + } + ssize_t w = ::pwrite(out_fd, buf.data(), + static_cast(n), out_pos); + if (w != n) { + ok = 0; + break; + } + out_pos += n; + } + } + if (in_fd >= 0) ::close(in_fd); + if (out_fd >= 0) ::close(out_fd); + } + int global = 1; + MPI_Allreduce(&ok, &global, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); + ok = global; + } else if (ctx.rank == 0) { + std::vector shards; + shards.reserve(ctx.size); + for (int r = 0; r < ctx.size; ++r) { + char rs[32]; + std::snprintf(rs, sizeof(rs), "/rank_%05d.json.gz", r); + shards.emplace_back(ctx.perfetto_shards_dir + rs); + } + const int rc = + co_await fileio::parallel::merge_shards(ctx.final_output, shards); + if (rc != 0) ok = 0; + } + + if (!ok) { + if (ctx.rank == 0) std::fprintf(stderr, "merge step failed\n"); + ctx.failed = true; + co_return; + } + + if (ctx.rank == 0) { + const double merge_ms = std::chrono::duration( + std::chrono::steady_clock::now() - t0) + .count(); + std::printf( + "[rank 0] merge (%s, %llu bytes from %d ranks) -> %s (%.2f ms)\n", + striped ? "parallel-pwrite" : "sharded-serial", + static_cast(total_bytes), ctx.size, + ctx.final_output.c_str(), merge_ms); + std::fflush(stdout); + if (!ctx.cli->keep_staging) { + std::error_code ec; + fs::remove_all(ctx.perfetto_shards_dir, ec); + } + } + co_return; +} + +int run(int argc, char** argv) { + DFTRACER_UTILS_LOGGER_INIT(); + + argparse::ArgumentParser program("dftracer_aggregator_mpi", + DFTRACER_UTILS_PACKAGE_VERSION); + program.add_description( + "MPI driver for the distributed-SST aggregator. Each rank produces " + "per-rank aggregation SSTs; rank 0 bulk-ingests and the ranks jointly " + "write the final gzip JSON output."); + + AggregatorMpiArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; + + RunCtx ctx; + ctx.cli = &cli; + MPI_Comm_rank(MPI_COMM_WORLD, &ctx.rank); + MPI_Comm_size(MPI_COMM_WORLD, &ctx.size); + + // Per-node rank count via a node-local sub-communicator. 
Used to + // divide executor/io threads so N ranks on one node don't each try + // to spin up hardware_concurrency() compute threads (total cores) + // and oversubscribe by N. + MPI_Comm node_comm = MPI_COMM_NULL; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, ctx.rank, + MPI_INFO_NULL, &node_comm); + int ppn = 1; + if (node_comm != MPI_COMM_NULL) { + MPI_Comm_size(node_comm, &ppn); + MPI_Comm_free(&node_comm); + } + if (ppn > 1) { + const auto hw = dftracer_utils_hardware_concurrency(); + const auto scaled = std::max( + 1, static_cast(hw) / static_cast(ppn)); + // Heuristic: the argparse default for these flags is + // hardware_concurrency(). If the user didn't pass the flag, the + // parsed value equals the node-wide default -- scale it down. + // If the user set an explicit value we leave it alone. + if (cli.pipeline.executor_threads == static_cast(hw)) { + cli.pipeline.executor_threads = scaled; + } + if (cli.pipeline.io_threads == static_cast(hw)) { + cli.pipeline.io_threads = scaled; + } + if (ctx.rank == 0) { + std::printf( + "[rank 0] detected ppn=%d, executor_threads=%zu " + "io_threads=%zu (hw=%zu)\n", + ppn, cli.pipeline.executor_threads, cli.pipeline.io_threads, + static_cast(hw)); + std::fflush(stdout); + } + } + + // Deterministic hash-based intern ids so the same string maps to the + // same id on every rank, keeping cross-rank aggregation keys identical. + composites::dft::aggregators::aggregation_intern() + .enable_deterministic_ids(); + + ctx.index_dir = cli.indexing.index_dir; + if (ctx.index_dir.empty()) + ctx.index_dir = fs::absolute(cli.directory.value).string(); + ctx.staging_root = cli.staging_dir.empty() + ? (fs::path(ctx.index_dir) / "_staging").string() + : cli.staging_dir; + ctx.shared_staging_root = (cli.shared_staging_dir.empty() || + cli.shared_staging_dir == ctx.staging_root) + ? 
ctx.staging_root + : cli.shared_staging_dir; + ctx.final_output = fs::absolute(cli.output).string(); + if (ctx.final_output.size() < 3 || + ctx.final_output.substr(ctx.final_output.size() - 3) != ".gz") { + ctx.final_output += ".gz"; + } + ctx.perfetto_shards_dir = + (fs::path(ctx.index_dir) / "_perfetto_shards").string(); + + std::vector packed_files; + if (ctx.rank == 0) { + const std::string dir = fs::absolute(cli.directory.value).string(); + ctx.all_files = enumerate_inputs(dir); + if (ctx.all_files.empty()) { + std::fprintf(stderr, + "[rank 0] no .pfw/.pfw.gz files in %s, aborting\n", + dir.c_str()); + } + packed_files = pack_paths(ctx.all_files); + std::error_code ec; + fs::create_directories(ctx.staging_root, ec); + if (ctx.shared_staging_root != ctx.staging_root) + fs::create_directories(ctx.shared_staging_root, ec); + } + std::uint64_t packed_size = packed_files.size(); + MPI_Bcast(&packed_size, 1, MPI_UINT64_T, 0, MPI_COMM_WORLD); + if (packed_size == 0) return 1; + if (ctx.rank != 0) packed_files.resize(packed_size); + MPI_Bcast(packed_files.data(), static_cast(packed_size), MPI_CHAR, 0, + MPI_COMM_WORLD); + if (ctx.rank != 0) ctx.all_files = unpack_paths(packed_files); + ctx.nfiles = ctx.all_files.size(); + + // Build the DAG: scan -> phase_a -> phase_b -> phase_c -> merge. + // Each task is scheduled independently; a downstream task only + // starts once its parent finishes. 
+ auto pipeline_config = + cli::build_pipeline_config("DFTracer MPI", cli.pipeline); + Pipeline pipeline(pipeline_config); + + auto scan = make_task( + [&ctx](CoroScope& scope) -> coro::CoroTask { + co_await task_scan(ctx, scope); + }, + "scan"); + auto phase_a = make_task( + [&ctx](CoroScope& scope) -> coro::CoroTask { + co_await task_phase_a(ctx, scope); + }, + "phase_a"); + auto phase_b = make_task( + [&ctx](CoroScope&) -> coro::CoroTask { + co_await task_phase_b(ctx); + }, + "phase_b"); + auto phase_c = make_task( + [&ctx](CoroScope& scope) -> coro::CoroTask { + co_await task_phase_c(ctx, scope); + }, + "phase_c"); + auto merge = make_task( + [&ctx](CoroScope&) -> coro::CoroTask { + co_await task_merge(ctx); + }, + "merge"); + + phase_a->depends_on(scan); + phase_b->depends_on(phase_a); + phase_c->depends_on(phase_b); + merge->depends_on(phase_c); + + pipeline.set_source(scan); + pipeline.set_destination(merge); + pipeline.execute(); + + MPI_Barrier(MPI_COMM_WORLD); + return ctx.failed ? 1 : 0; +} + +} // namespace + +int main(int argc, char** argv) { + int provided = 0; + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); + if (provided < MPI_THREAD_FUNNELED) { + std::fprintf(stderr, + "MPI does not support MPI_THREAD_FUNNELED (got %d), " + "aborting\n", + provided); + MPI_Abort(MPI_COMM_WORLD, 1); + } + const int rc = run(argc, argv); + MPI_Finalize(); + return rc; +} diff --git a/src/dftracer/utils/binaries/dftracer_call_tree.cpp b/src/dftracer/utils/binaries/dftracer_call_tree.cpp index 5c595d20..278e89d0 100644 --- a/src/dftracer/utils/binaries/dftracer_call_tree.cpp +++ b/src/dftracer/utils/binaries/dftracer_call_tree.cpp @@ -1,427 +1,538 @@ -/** - * DFTracer Call Tree Utility - * Standalone binary for building and analyzing call trees from DFTracer trace - * files - */ - -#include +// Pipeline-driven call_tree binary. 
+// +// DAG: +// scan -> build -> merge -> hierarchy -> write_json +// +// scan : enumerate inputs +// build : per-file CoroScope fan-out; each file ingests into its own +// local CallTree fragment (no shared mutation) +// merge : concatenate fragments into ctx.merged +// hierarchy : per-process CoroScope fan-out; each ProcessCallTree is +// independent so parent-child build runs in parallel +// write_json: per-worker serialization of process slices, ParallelWriter +// feeds io_backend for the actual writes + +#include +#include +#include +#include +#include +#include #include #include #include +#include +#include +#include +#include +#include +#include +#include #include -#include +#include #include #include -#include -#include +#include +#include #include #include +#include "common_cli.h" + +using namespace dftracer::utils; +using namespace dftracer::utils::utilities; using namespace dftracer::utils::call_tree; -/** - * Collect trace files from directory or file list - */ -static std::vector collect_trace_files( - const std::vector& inputs, bool recursive) { - std::vector trace_files; +namespace { - for (const auto& input : inputs) { - if (fs::is_directory(input)) { - if (recursive) { - for (const auto& entry : - fs::recursive_directory_iterator(input)) { - if (entry.is_regular_file()) { - std::string path = entry.path().string(); - if ((path.size() >= 4 && - path.substr(path.size() - 4) == ".pfw") || - (path.size() >= 7 && - path.substr(path.size() - 7) == ".pfw.gz")) { - trace_files.push_back(path); - } - } - } - } else { - for (const auto& entry : fs::directory_iterator(input)) { - if (entry.is_regular_file()) { - std::string path = entry.path().string(); - if ((path.size() >= 4 && - path.substr(path.size() - 4) == ".pfw") || - (path.size() >= 7 && - path.substr(path.size() - 7) == ".pfw.gz")) { - trace_files.push_back(path); - } - } - } - } - } else if (fs::is_regular_file(input)) { - trace_files.push_back(input); - } else { - DFTRACER_UTILS_LOG_ERROR("Input 
not found or not accessible: %s", - input.c_str()); - } - } +class CallTreeArgParse : public cli::ArgParse { + public: + cli::PipelineArgs pipeline; - return trace_files; -} + std::vector inputs; + bool recursive = false; + std::string output; + bool verbose = false; + bool no_save = false; + bool gzip = false; -/** - * Analyze call patterns in the tree - */ -static void analyze_call_patterns(const std::vector& nodes) { - printf("\n--- Call Pattern Analysis ---\n"); - - if (nodes.empty()) { - printf("No nodes to analyze\n"); - return; + explicit CallTreeArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + schema(pipeline); } - // Find most frequently called functions - std::map call_counts; - for (const auto& node : nodes) { - call_counts[node.name]++; + protected: + void register_args() override { + parser() + .add_argument("inputs") + .help("Trace files (.pfw, .pfw.gz) or directories") + .nargs(argparse::nargs_pattern::at_least_one); + parser().add_argument("-r", "--recursive").flag(); + parser() + .add_argument("-o", "--output") + .help("Output JSON path (Chrome Tracing)") + .default_value(""); + parser().add_argument("-v", "--verbose").flag(); + parser().add_argument("--no-save").flag(); + parser() + .add_argument("--gzip") + .help("gzip the output (.gz appended if needed)") + .flag(); } - // Sort by frequency - std::vector> sorted_calls( - call_counts.begin(), call_counts.end()); - std::sort(sorted_calls.begin(), sorted_calls.end(), - [](const auto& a, const auto& b) { return a.second > b.second; }); - - printf("Top 10 most frequently called functions:\n"); - for (size_t i = 0; i < std::min(sorted_calls.size(), size_t(10)); i++) { - printf(" %2zu. 
%-30s : %zu calls\n", i + 1, - sorted_calls[i].first.c_str(), sorted_calls[i].second); + void post_parse() override { + inputs = parser().get>("inputs"); + recursive = parser().get("--recursive"); + output = parser().get("--output"); + verbose = parser().get("--verbose"); + no_save = parser().get("--no-save"); + gzip = parser().get("--gzip"); } +}; + +bool is_trace_file(const std::string& path) { + return (path.size() >= 4 && + path.compare(path.size() - 4, 4, ".pfw") == 0) || + (path.size() >= 7 && + path.compare(path.size() - 7, 7, ".pfw.gz") == 0); } -/** - * Analyze timing statistics - */ -static void analyze_timing(const std::vector& nodes) { - printf("\n--- Timing Analysis ---\n"); +struct RunCtx { + const CallTreeArgParse* cli = nullptr; - if (nodes.empty()) { - printf("No nodes to analyze\n"); - return; + std::vector trace_files; + std::vector> per_file; + internal::CallTree merged; + std::vector process_keys; + + std::string output_path; + bool failed = false; + + double scan_ms = 0; + double build_ms = 0; + double merge_ms = 0; + double hier_ms = 0; + double write_ms = 0; +}; + +coro::CoroTask task_scan(RunCtx* ctx) { + const auto t0 = std::chrono::steady_clock::now(); + for (const auto& in : ctx->cli->inputs) { + std::error_code ec; + if (fs::is_directory(in, ec)) { + if (ctx->cli->recursive) { + for (const auto& e : fs::recursive_directory_iterator(in, ec)) { + if (e.is_regular_file(ec) && + is_trace_file(e.path().string())) + ctx->trace_files.push_back(e.path().string()); + } + } else { + for (const auto& e : fs::directory_iterator(in, ec)) { + if (e.is_regular_file(ec) && + is_trace_file(e.path().string())) + ctx->trace_files.push_back(e.path().string()); + } + } + } else if (fs::is_regular_file(in, ec)) { + ctx->trace_files.push_back(in); + } } - - // Calculate timing statistics - std::vector durations; - durations.reserve(nodes.size()); - - for (const auto& node : nodes) { - durations.push_back(node.duration_us); + 
std::sort(ctx->trace_files.begin(), ctx->trace_files.end()); + if (ctx->trace_files.empty()) { + DFTRACER_UTILS_LOG_ERROR("%s", "no trace files found"); + ctx->failed = true; } - - std::sort(durations.begin(), durations.end()); - - std::uint64_t total = 0; - for (auto d : durations) { - total += d; + ctx->scan_ms = std::chrono::duration( + std::chrono::steady_clock::now() - t0) + .count(); + if (ctx->cli->verbose && !ctx->failed) { + std::printf("[scan] %.2f ms: %zu files\n", ctx->scan_ms, + ctx->trace_files.size()); + std::fflush(stdout); } - double avg = - static_cast(total) / static_cast(durations.size()); - - std::uint64_t min_time = durations.front(); - std::uint64_t max_time = durations.back(); - std::uint64_t median = durations[durations.size() / 2]; - std::uint64_t p95 = durations[static_cast( - static_cast(durations.size()) * 0.95)]; - std::uint64_t p99 = durations[static_cast( - static_cast(durations.size()) * 0.99)]; - - printf("Duration statistics (milliseconds):\n"); - printf(" Min: %.3f ms\n", static_cast(min_time) / 1000.0); - printf(" Max: %.3f ms\n", static_cast(max_time) / 1000.0); - printf(" Mean: %.3f ms\n", avg / 1000.0); - printf(" Median: %.3f ms\n", static_cast(median) / 1000.0); - printf(" 95th: %.3f ms\n", static_cast(p95) / 1000.0); - printf(" 99th: %.3f ms\n", static_cast(p99) / 1000.0); + co_return; } -/** - * Find critical path (longest duration calls) - */ -static void find_critical_path(const std::vector& nodes) { - printf("\n--- Critical Path (Longest Duration Calls) ---\n"); - - if (nodes.empty()) { - printf("No nodes to analyze\n"); - return; - } +coro::CoroTask ingest_one_file(std::string path, internal::CallTree* tree, + std::atomic* total) { + auto counts = co_await internal::read_trace_file_async(std::move(path), + tree, nullptr); + total->fetch_add(counts.processed, std::memory_order_relaxed); +} - // Find top 10 longest running calls - std::vector sorted_nodes = nodes; - std::sort(sorted_nodes.begin(), sorted_nodes.end(), - 
[](const auto& a, const auto& b) { - return a.duration_us > b.duration_us; - }); - - printf("Top 10 longest running calls:\n"); - for (size_t i = 0; i < std::min(sorted_nodes.size(), size_t(10)); i++) { - const auto& node = sorted_nodes[i]; - printf(" %2zu. %-30s [%-15s] - %10.3f ms (level %d)\n", i + 1, - node.name.c_str(), node.category.c_str(), - static_cast(node.duration_us) / 1000.0, node.level); +coro::CoroTask ingest_all_files( + CoroScope* child, const std::vector* paths, + const std::vector>* per_file, + std::atomic* total) { + for (std::size_t i = 0; i < paths->size(); ++i) { + std::string path = (*paths)[i]; + internal::CallTree* tree = (*per_file)[i].get(); + child->spawn([path = std::move(path), tree, + total](CoroScope&) mutable -> coro::CoroTask { + co_await ingest_one_file(std::move(path), tree, total); + }); } + co_return; } -/** - * Analyze by category - */ -static void analyze_by_category(const std::vector& nodes) { - printf("\n--- Analysis by Category ---\n"); +coro::CoroTask task_build(RunCtx* ctx, CoroScope* scope) { + if (ctx->failed) co_return; + const auto t0 = std::chrono::steady_clock::now(); - if (nodes.empty()) { - printf("No nodes to analyze\n"); - return; + const std::size_t n = ctx->trace_files.size(); + ctx->per_file.clear(); + ctx->per_file.reserve(n); + for (std::size_t i = 0; i < n; ++i) { + ctx->per_file.push_back(std::make_unique()); + ctx->per_file.back()->initialize(); } - std::map category_counts; - std::map category_durations; - - for (const auto& node : nodes) { - category_counts[node.category]++; - category_durations[node.category] += node.duration_us; + std::atomic total_events{0}; + std::atomic* total_ptr = &total_events; + + const std::vector* paths_ptr = &ctx->trace_files; + const std::vector>* per_file_ptr = + &ctx->per_file; + + co_await scope->scope( + [paths_ptr, per_file_ptr, + total_ptr](CoroScope& child) mutable -> coro::CoroTask { + co_await ingest_all_files(&child, paths_ptr, per_file_ptr, + total_ptr); + }); 
+ + ctx->build_ms = std::chrono::duration( + std::chrono::steady_clock::now() - t0) + .count(); + if (ctx->cli->verbose) { + std::printf("[build] %.2f ms: %zu events across %zu files\n", + ctx->build_ms, total_events.load(), n); + std::fflush(stdout); } + co_return; +} - printf("Nodes by category:\n"); - for (const auto& [category, count] : category_counts) { - double avg_duration = - static_cast(category_durations[category]) / - static_cast(count) / 1000.0; - printf(" %-20s: %6zu nodes, avg duration: %.3f ms\n", category.c_str(), - count, avg_duration); +coro::CoroTask task_merge(RunCtx* ctx) { + if (ctx->failed) co_return; + const auto t0 = std::chrono::steady_clock::now(); + ctx->merged.initialize(); + for (auto& t : ctx->per_file) { + if (t) ctx->merged.merge_from(std::move(*t)); + } + ctx->per_file.clear(); + ctx->process_keys = ctx->merged.keys(); + ctx->merge_ms = std::chrono::duration( + std::chrono::steady_clock::now() - t0) + .count(); + if (ctx->cli->verbose) { + std::printf("[merge] %.2f ms: %zu processes\n", ctx->merge_ms, + ctx->process_keys.size()); + std::fflush(stdout); } + co_return; } -int main(int argc, char** argv) { - DFTRACER_UTILS_LOGGER_INIT(); +coro::CoroTask hier_one_process(internal::CallTree* tree, + internal::ProcessKey key) { + tree->build_hierarchy_for_process(key); + co_return; +} - argparse::ArgumentParser program("dftracer_call_tree", - DFTRACER_UTILS_PACKAGE_VERSION); - program.add_description( - "DFTracer Call Tree utility - builds and analyzes call trees from " - "DFTracer trace files"); - - // Input files/directories - program.add_argument("inputs") - .help( - "Trace files (.pfw, .pfw.gz) or directories containing trace files") - .nargs(argparse::nargs_pattern::at_least_one); - - // Processing options - program.add_argument("-r", "--recursive") - .help("Recursively search directories for trace files") - .flag(); - - program.add_argument("--pattern") - .help("File pattern for trace files (default: *.pfw.gz)") - 
.default_value(std::string("*.pfw.gz")); - - // Output options - program.add_argument("-o", "--output") - .help( - "Output file path for serialized call tree (default: " - "auto-generated from input)") - .default_value(std::string("")); - - program.add_argument("--json") - .help("Also save call tree in JSON (Chrome Tracing) format") - .flag(); - - program.add_argument("--text") - .help("Export call tree to text file") - .default_value(std::string("")); - - // Analysis options - program.add_argument("--max-depth") - .help("Maximum depth for tree printing (0=unlimited)") - .default_value(0) - .scan<'i', int>(); - - program.add_argument("--analyze") - .help( - "Perform detailed analysis (call patterns, timing, critical path)") - .flag(); - - program.add_argument("-v", "--verbose") - .help("Enable verbose output") - .flag(); - - program.add_argument("--stats-only") - .help("Only print statistics, skip tree traversal") - .flag(); - - program.add_argument("--no-save") - .help("Don't save output files, only print analysis") - .flag(); - - // Parse arguments - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - std::cerr << err.what() << std::endl; - std::cerr << program; - return 1; +coro::CoroTask hier_all_processes( + CoroScope* child, internal::CallTree* tree, + const std::vector* keys) { + for (auto k : *keys) { + child->spawn([tree, k](CoroScope&) mutable -> coro::CoroTask { + co_await hier_one_process(tree, k); + }); } + co_return; +} - // Get arguments - auto inputs = program.get>("inputs"); - bool recursive = program.get("--recursive"); - std::string pattern = program.get("--pattern"); - std::string output_path = program.get("--output"); - bool save_json = program.get("--json"); - std::string text_file = program.get("--text"); - int max_depth = program.get("--max-depth"); - bool analyze = program.get("--analyze"); - bool verbose = program.get("--verbose"); - bool stats_only = program.get("--stats-only"); - bool no_save = 
program.get("--no-save"); - - // Collect trace files - printf("=== DFTracer Call Tree Builder ===\n\n"); - - auto start_time = std::chrono::high_resolution_clock::now(); - - // For single directory input, use load_from_directory - // For multiple inputs or files, collect manually - CallTree tree; - bool loaded = false; - - if (inputs.size() == 1 && fs::is_directory(inputs[0])) { - printf("Loading traces from directory: %s\n", inputs[0].c_str()); - if (verbose) { - printf(" Pattern: %s\n", pattern.c_str()); - printf(" Recursive: %s\n", recursive ? "yes" : "no"); - } - - loaded = tree.load_from_directory(inputs[0], pattern); - if (!loaded) { - fprintf(stderr, "Failed to load traces from directory: %s\n", - inputs[0].c_str()); - return 1; - } - } else { - auto trace_files = collect_trace_files(inputs, recursive); - if (trace_files.empty()) { - fprintf(stderr, "No trace files found in the specified inputs.\n"); - return 1; - } +coro::CoroTask task_hierarchy(RunCtx* ctx, CoroScope* scope) { + if (ctx->failed) co_return; + const auto t0 = std::chrono::steady_clock::now(); + + internal::CallTree* tree = &ctx->merged; + const std::vector* keys_ptr = &ctx->process_keys; + co_await scope->scope( + [tree, keys_ptr](CoroScope& child) mutable -> coro::CoroTask { + co_await hier_all_processes(&child, tree, keys_ptr); + }); + + ctx->hier_ms = std::chrono::duration( + std::chrono::steady_clock::now() - t0) + .count(); + if (ctx->cli->verbose) { + std::printf("[hierarchy] %.2f ms\n", ctx->hier_ms); + std::fflush(stdout); + } + co_return; +} - printf("Found %zu trace file(s) to process:\n", trace_files.size()); - if (verbose) { - for (const auto& file : trace_files) { - printf(" %s\n", file.c_str()); +// Serialize all events for a single process slice into `out`. Each event is +// followed by ",\n"; trim the final separator at concatenation time. 
+void serialize_process_slice(const internal::ProcessCallTree& pgraph, + const internal::ProcessKey& key, + internal::JsonSerializer& serializer, + std::size_t starting_index, std::string& out) { + static constexpr std::size_t EVT_BUF = 16384; + char buffer[EVT_BUF]; + std::size_t event_idx = starting_index; + for (std::uint64_t root_id : pgraph.root_calls) { + std::vector stack; + stack.push_back(root_id); + while (!stack.empty()) { + std::uint64_t node_id = stack.back(); + stack.pop_back(); + auto it = pgraph.calls.find(node_id); + if (it == pgraph.calls.end()) continue; + const auto& node = it->second; + std::size_t written = serializer.serialize_node( + buffer, static_cast(event_idx++), *node, key.pid, key.tid); + // serialize_node returns size including trailing newline; strip it + // and add ",\n" so concatenation produces valid + // JSON-array-of-lines. + if (written > 0) { + out.append(buffer, written - 1); + out.append(",\n", 2); + } + const auto& children = node->get_children(); + for (auto cit = children.rbegin(); cit != children.rend(); ++cit) { + stack.push_back(*cit); } } - - // Load first directory for now (CallTree API expects directory) - // This is a limitation of the current API - fprintf(stderr, - "Note: Multi-file input not yet supported. 
Use directory input " - "instead.\n"); - return 1; } +} - printf("Loaded %zu trace files\n", tree.get_num_trace_files()); - printf("\n"); - - // Generate call tree - printf("Generating call tree structure...\n"); - if (!tree.generate()) { - fprintf(stderr, "Failed to generate call tree\n"); - return 1; +coro::CoroTask serialize_slice(const internal::CallTree* merged, + internal::ProcessKey key, + const std::string* hostname_hash, + std::vector* slice_buffers, + std::size_t index, + std::uint64_t starting_index) { + auto* pgraph = const_cast(merged)->get(key); + if (pgraph) { + internal::JsonSerializer serializer; + char init[8]; + serializer.initialize(init, *hostname_hash); + (void)init; + serialize_process_slice(*pgraph, key, serializer, starting_index, + (*slice_buffers)[index]); } - printf("Call tree generation complete\n\n"); - - auto gen_time = std::chrono::high_resolution_clock::now(); - auto gen_duration = std::chrono::duration_cast( - gen_time - start_time); - if (verbose) { - printf("Generation time: %lld ms\n\n", - static_cast(gen_duration.count())); + co_return; +} + +coro::CoroTask serialize_all_slices( + CoroScope* child, const internal::CallTree* merged, + const std::vector* keys, + const std::string* hostname_hash, std::vector* slice_buffers, + std::uint64_t stride) { + for (std::size_t i = 0; i < keys->size(); ++i) { + internal::ProcessKey k = (*keys)[i]; + std::uint64_t start_idx = i * stride; + child->spawn([merged, k, start_idx, i, hostname_hash, slice_buffers]( + CoroScope&) mutable -> coro::CoroTask { + co_await serialize_slice(merged, k, hostname_hash, slice_buffers, i, + start_idx); + }); } + co_return; +} - // Print statistics - printf("=== Call Tree Statistics ===\n"); - tree.print_statistics(); +coro::CoroTask task_write_json(RunCtx* ctx, CoroScope* scope) { + if (ctx->failed || ctx->cli->no_save) co_return; + const auto t0 = std::chrono::steady_clock::now(); + + const std::size_t n = ctx->process_keys.size(); + std::vector 
slice_buffers(n); + static constexpr std::uint64_t IDX_STRIDE = 1ull << 20; + + char hostname[256] = {}; + gethostname(hostname, sizeof(hostname) - 1); + std::string hostname_hash(hostname); + + std::vector* slice_buffers_ptr = &slice_buffers; + const std::string* hostname_hash_ptr = &hostname_hash; + const internal::CallTree* merged = &ctx->merged; + const std::vector* keys_ptr = &ctx->process_keys; + + co_await scope->scope( + [merged, keys_ptr, hostname_hash_ptr, + slice_buffers_ptr](CoroScope& child) mutable -> coro::CoroTask { + co_await serialize_all_slices(&child, merged, keys_ptr, + hostname_hash_ptr, slice_buffers_ptr, + IDX_STRIDE); + }); + + std::string header; + header.append("[\n", 2); + { + internal::JsonSerializer serializer; + char init_buf[8]; + serializer.initialize(init_buf, hostname_hash); + (void)init_buf; + char buf[8192]; + std::time_t now = std::time(nullptr); + char ts[64]; + std::strftime(ts, sizeof(ts), "%Y-%m-%d %H:%M:%S", + std::localtime(&now)); + std::size_t w = serializer.serialize_metadata(buf, "timestamp", ts, "M", + 0, 0, true); + if (w > 0) header.append(buf, w - 1); + header.append(",\n", 2); + w = serializer.serialize_metadata(buf, "format", "call_tree", "M", 0, 0, + true); + if (w > 0) header.append(buf, w - 1); + header.append(",\n", 2); + } - // Print tree structure - if (!stats_only) { - printf("\n=== Call Tree Structure ===\n"); - tree.print_depth_first(max_depth); + fileio::parallel::WriterConfig wc; + wc.layout = fileio::parallel::FileLayout::SHARDED; + wc.gzip = ctx->cli->gzip; + auto writer = fileio::parallel::make_writer(wc); + + const std::size_t total_workers = n + 1; + if (co_await writer->open(ctx->output_path, total_workers, ctx->cli->gzip, + scope) != 0) { + DFTRACER_UTILS_LOG_ERROR("failed to open writer: %s", + ctx->output_path.c_str()); + ctx->failed = true; + co_return; } - // Perform detailed analysis if requested - if (analyze) { - printf("\n=== Detailed Analysis ===\n"); - auto nodes = 
tree.get_nodes_depth_first(); - printf("Retrieved %zu nodes for analysis\n", nodes.size()); + if (co_await writer->write_chunk( + 0, ByteView(header.data(), header.size())) != 0) { + ctx->failed = true; + } - analyze_call_patterns(nodes); - analyze_timing(nodes); - find_critical_path(nodes); - analyze_by_category(nodes); + for (std::size_t i = 0; i < n && !ctx->failed; ++i) { + std::string& b = slice_buffers[i]; + if (i + 1 == n) { + if (b.size() >= 2 && b[b.size() - 2] == ',' && + b[b.size() - 1] == '\n') { + b.resize(b.size() - 2); + b.append("\n]\n", 3); + } else { + b.append("]\n", 2); + } + } + if (co_await writer->write_chunk(i + 1, ByteView(b.data(), b.size())) != + 0) { + ctx->failed = true; + break; + } } - // Save outputs - if (!no_save) { - printf("\n=== Saving Outputs ===\n"); + if (co_await writer->close() != 0) ctx->failed = true; - // Set custom output path if specified - if (!output_path.empty()) { - tree.set_output_path(output_path); + if (!ctx->failed) { + auto shards = writer->output_paths(); + if (co_await fileio::parallel::merge_shards(ctx->output_path, shards) != + 0) { + DFTRACER_UTILS_LOG_ERROR("merge_shards failed for %s", + ctx->output_path.c_str()); + ctx->failed = true; } + } - // Save binary format - std::string bin_file = tree.get_output_path(); - printf("Saving binary call tree to: %s\n", bin_file.c_str()); - if (tree.save_to_file()) { - printf(" Successfully saved!\n"); - } else { - fprintf(stderr, " Failed to save binary file\n"); - } + ctx->write_ms = std::chrono::duration( + std::chrono::steady_clock::now() - t0) + .count(); + if (ctx->cli->verbose) { + std::printf("[write] %.2f ms -> %s\n", ctx->write_ms, + ctx->output_path.c_str()); + std::fflush(stdout); + } + co_return; +} - // Save JSON format if requested - if (save_json) { - std::string json_file = bin_file; - // Replace .calltree extension with .pfw - if (json_file.size() >= 9 && - json_file.substr(json_file.size() - 9) == ".calltree") { - json_file = json_file.substr(0, 
json_file.size() - 9) + ".pfw"; - } else { - json_file += ".pfw"; - } +int run(int argc, char** argv) { + DFTRACER_UTILS_LOGGER_INIT(); - printf("Saving JSON call tree to: %s\n", json_file.c_str()); - if (tree.save_to_json(json_file)) { - printf(" Successfully saved! (Chrome Tracing compatible)\n"); - } else { - fprintf(stderr, " Failed to save JSON file\n"); - } + argparse::ArgumentParser program("dftracer_call_tree", + DFTRACER_UTILS_PACKAGE_VERSION); + program.add_description( + "Build a call tree from DFTracer trace files and emit Chrome Tracing " + "JSON."); + + CallTreeArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; + + RunCtx ctx; + ctx.cli = &cli; + + if (cli.output.empty()) { + std::string base = "call_tree"; + if (!cli.inputs.empty()) { + fs::path p(cli.inputs.front()); + if (fs::is_directory(p)) + base = p.filename().string(); + else + base = p.stem().string(); + if (base.empty()) base = "call_tree"; } + ctx.output_path = base + ".pfw"; + } else { + ctx.output_path = cli.output; + } + if (cli.gzip && + (ctx.output_path.size() < 3 || + ctx.output_path.compare(ctx.output_path.size() - 3, 3, ".gz") != 0)) { + ctx.output_path += ".gz"; + } - // Save text format if requested - if (!text_file.empty()) { - printf("Exporting call tree to text file: %s\n", text_file.c_str()); - if (tree.print_depth_first_to_file(text_file, max_depth)) { - printf(" Successfully exported!\n"); - } else { - fprintf(stderr, " Failed to export text file\n"); - } - } + auto pipeline_config = + cli::build_pipeline_config("DFTracer CallTree", cli.pipeline); + Pipeline pipeline(pipeline_config); + + RunCtx* ctx_ptr = &ctx; + auto scan = make_task( + [ctx_ptr](CoroScope&) -> coro::CoroTask { + co_await task_scan(ctx_ptr); + }, + "scan"); + auto build = make_task( + [ctx_ptr](CoroScope& scope) -> coro::CoroTask { + co_await task_build(ctx_ptr, &scope); + }, + "build"); + auto merge = make_task( + [ctx_ptr](CoroScope&) -> coro::CoroTask { + co_await 
task_merge(ctx_ptr); + }, + "merge"); + auto hierarchy = make_task( + [ctx_ptr](CoroScope& scope) -> coro::CoroTask { + co_await task_hierarchy(ctx_ptr, &scope); + }, + "hierarchy"); + auto write = make_task( + [ctx_ptr](CoroScope& scope) -> coro::CoroTask { + co_await task_write_json(ctx_ptr, &scope); + }, + "write_json"); + + build->depends_on(scan); + merge->depends_on(build); + hierarchy->depends_on(merge); + write->depends_on(hierarchy); + + pipeline.set_source(scan); + pipeline.set_destination(write); + pipeline.execute(); + + if (cli.verbose && !ctx.failed) { + std::printf( + "[done] scan=%.1fms build=%.1fms merge=%.1fms hierarchy=%.1fms " + "write=%.1fms\n", + ctx.scan_ms, ctx.build_ms, ctx.merge_ms, ctx.hier_ms, ctx.write_ms); } - auto end_time = std::chrono::high_resolution_clock::now(); - auto total_duration = std::chrono::duration_cast( - end_time - start_time); + return ctx.failed ? 1 : 0; +} - printf("\n=== Completed ===\n"); - printf("Total execution time: %lld ms\n", - static_cast(total_duration.count())); +} // namespace - return 0; -} +int main(int argc, char** argv) { return run(argc, argv); } diff --git a/src/dftracer/utils/binaries/dftracer_call_tree_mpi.cpp b/src/dftracer/utils/binaries/dftracer_call_tree_mpi.cpp new file mode 100644 index 00000000..07e926d2 --- /dev/null +++ b/src/dftracer/utils/binaries/dftracer_call_tree_mpi.cpp @@ -0,0 +1,208 @@ +// MPI driver for parallel call-tree construction. Thin DAG over +// MPICallTreeBuilder; all phase logic lives in the engine. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "common_cli.h" + +using namespace dftracer::utils; +using namespace dftracer::utils::call_tree; + +namespace { + +class CallTreeMpiArgParse : public cli::ArgParse { + public: + cli::PipelineArgs pipeline; + + std::string input_dir; + std::string output; + std::string staging_dir; + bool verbose = false; + bool gzip = false; + bool keep_staging = false; + + explicit CallTreeMpiArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + schema(pipeline); + } + + protected: + void register_args() override { + parser().add_argument("input").help( + "Input directory containing trace files"); + parser() + .add_argument("-o", "--output") + .help("Output JSON path") + .default_value("call_tree.pfw"); + parser() + .add_argument("--staging-dir") + .help( + "Shared FS staging root for per-rank shards (default " + ".shards/)") + .default_value(""); + parser().add_argument("--gzip").flag(); + parser().add_argument("-v", "--verbose").flag(); + parser().add_argument("--keep-staging").flag(); + } + + void post_parse() override { + input_dir = parser().get("input"); + output = parser().get("--output"); + staging_dir = parser().get("--staging-dir"); + gzip = parser().get("--gzip"); + verbose = parser().get("--verbose"); + keep_staging = parser().get("--keep-staging"); + } +}; + +struct RunCtx { + const CallTreeMpiArgParse* cli = nullptr; + std::unique_ptr builder; + std::string final_output; + std::string staging_dir; + bool failed = false; +}; + +int run(int argc, char** argv) { + DFTRACER_UTILS_LOGGER_INIT(); + + argparse::ArgumentParser program("dftracer_call_tree_mpi", + DFTRACER_UTILS_PACKAGE_VERSION); + program.add_description( + "MPI driver for parallel call-tree construction. 
Each rank owns a " + "slice of PIDs and emits a Chrome Tracing JSON shard; rank 0 merges."); + + CallTreeMpiArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; + + int rank = 0, size = 1; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // Scale per-rank threads down when multiple ranks share a node. + MPI_Comm node_comm = MPI_COMM_NULL; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, + MPI_INFO_NULL, &node_comm); + int ppn = 1; + if (node_comm != MPI_COMM_NULL) { + MPI_Comm_size(node_comm, &ppn); + MPI_Comm_free(&node_comm); + } + if (ppn > 1) { + const auto hw = dftracer_utils_hardware_concurrency(); + const auto scaled = std::max( + 1, static_cast(hw) / static_cast(ppn)); + if (cli.pipeline.executor_threads == static_cast(hw)) + cli.pipeline.executor_threads = scaled; + if (cli.pipeline.io_threads == static_cast(hw)) + cli.pipeline.io_threads = scaled; + } + + RunCtx ctx; + ctx.cli = &cli; + + MPICallTreeConfig builder_cfg; + builder_cfg.verbose = cli.verbose; + ctx.builder = std::make_unique(builder_cfg); + + ctx.final_output = fs::absolute(cli.output).string(); + if (cli.gzip && (ctx.final_output.size() < 3 || + ctx.final_output.compare(ctx.final_output.size() - 3, 3, + ".gz") != 0)) { + ctx.final_output += ".gz"; + } + ctx.staging_dir = cli.staging_dir.empty() + ? 
(ctx.final_output + ".shards") + : fs::absolute(cli.staging_dir).string(); + + auto pipeline_config = + cli::build_pipeline_config("DFTracer CallTree MPI", cli.pipeline); + Pipeline pipeline(pipeline_config); + + RunCtx* p = &ctx; + auto discover = make_task( + [p](CoroScope& scope) -> coro::CoroTask { + if (p->failed) co_return; + p->builder->add_trace_directory(p->cli->input_dir); + if (p->builder->trace_files().empty()) { + if (p->builder->rank() == 0) + std::fprintf(stderr, "no .pfw/.pfw.gz files in %s\n", + p->cli->input_dir.c_str()); + p->failed = true; + co_return; + } + if (!co_await p->builder->discover_pids(&scope)) p->failed = true; + }, + "discover"); + auto build = make_task( + [p](CoroScope& scope) -> coro::CoroTask { + if (p->failed) co_return; + if (!co_await p->builder->build(&scope)) p->failed = true; + }, + "build"); + auto hierarchy = make_task( + [p](CoroScope& scope) -> coro::CoroTask { + if (p->failed) co_return; + if (!co_await p->builder->hierarchy(&scope)) p->failed = true; + }, + "hierarchy"); + auto write = make_task( + [p](CoroScope& scope) -> coro::CoroTask { + if (p->failed) co_return; + if (!co_await p->builder->write(&scope, p->final_output, + p->staging_dir, p->cli->gzip)) + p->failed = true; + }, + "write"); + auto merge = make_task( + [p](CoroScope&) -> coro::CoroTask { + if (p->failed) co_return; + if (!co_await p->builder->merge(p->final_output, p->staging_dir, + p->cli->gzip, p->cli->keep_staging)) + p->failed = true; + }, + "merge"); + + build->depends_on(discover); + hierarchy->depends_on(build); + write->depends_on(hierarchy); + merge->depends_on(write); + + pipeline.set_source(discover); + pipeline.set_destination(merge); + pipeline.execute(); + + MPI_Barrier(MPI_COMM_WORLD); + return ctx.failed ? 
1 : 0; +} + +} // namespace + +int main(int argc, char** argv) { + int provided = 0; + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); + if (provided < MPI_THREAD_FUNNELED) { + std::fprintf(stderr, + "MPI does not support MPI_THREAD_FUNNELED (got %d), " + "aborting\n", + provided); + MPI_Abort(MPI_COMM_WORLD, 1); + } + const int rc = run(argc, argv); + MPI_Finalize(); + return rc; +} diff --git a/src/dftracer/utils/binaries/dftracer_comparator.cpp b/src/dftracer/utils/binaries/dftracer_comparator.cpp index a9cd9750..ebee6bb2 100644 --- a/src/dftracer/utils/binaries/dftracer_comparator.cpp +++ b/src/dftracer/utils/binaries/dftracer_comparator.cpp @@ -1,11 +1,7 @@ #include -#include -#include #include #include -#include #include -#include #include #include #include @@ -14,25 +10,121 @@ #include #include #include +#include #include #include -#include #include #include #include -#include #include #include #include -#include -#include -#include + +#include "common_cli.h" using namespace dftracer::utils; using namespace dftracer::utils::utilities; using namespace dftracer::utils::utilities::composites::dft::aggregators; using namespace dftracer::utils::utilities::composites::dft::comparator; +using dftracer::utils::utilities::composites::dft::indexing:: + IndexResolverUtility; +using dftracer::utils::utilities::composites::dft::indexing::ResolverInput; +using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility; +using dftracer::utils::utilities::indexer::IndexBuildBatchConfig; + +class ComparatorArgParse : public cli::ArgParse { + public: + cli::PipelineArgs pipeline; + cli::IndexingArgs indexing; + cli::QueryArgs query_args{"Query filter (default: all events)"}; + + std::string config_path; + std::string baseline; + std::string variant; + std::string baseline_index_dir; + std::string variant_index_dir; + std::string group_by; + std::string format = "table"; + double time_interval = 5000.0; + double threshold = 0.0; + bool no_color = false; + + 
explicit ComparatorArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + indexing.with_index_dir = false; + indexing.force_help = "Force index rebuild"; + schema(pipeline, indexing, query_args); + } + + protected: + void register_args() override { + parser() + .add_argument("--config") + .help("JSON config file for hierarchical comparison") + .default_value(""); + + parser() + .add_argument("--baseline") + .help("Baseline trace file or directory") + .default_value(""); + + parser() + .add_argument("--variant") + .help("Variant trace file or directory") + .default_value(""); + + parser() + .add_argument("--baseline-index-dir") + .help( + "Index directory for baseline (default: co-located with data)") + .default_value(""); + + parser() + .add_argument("--variant-index-dir") + .help("Index directory for variant (default: co-located with data)") + .default_value(""); + + parser() + .add_argument("--group-by") + .help("Comma-separated group keys (default: cat,name)") + .default_value(""); + + parser() + .add_argument("--format") + .help("Output format: table (default) or json") + .default_value("table"); + + parser() + .add_argument("-t", "--time-interval") + .help("Time interval in milliseconds for bucketing (default: 5000)") + .scan<'g', double>() + .default_value(5000.0); + + parser() + .add_argument("--threshold") + .help("Hide changes below this percentage") + .scan<'g', double>() + .default_value(0.0); + + parser() + .add_argument("--no-color") + .help("Disable ANSI color output") + .flag(); + } + + void post_parse() override { + config_path = parser().get("--config"); + baseline = parser().get("--baseline"); + variant = parser().get("--variant"); + baseline_index_dir = parser().get("--baseline-index-dir"); + variant_index_dir = parser().get("--variant-index-dir"); + group_by = parser().get("--group-by"); + format = parser().get("--format"); + time_interval = parser().get("--time-interval"); + threshold = parser().get("--threshold"); + no_color = 
parser().get("--no-color"); + } +}; namespace { @@ -44,156 +136,166 @@ void flatten_nodes(const ComparisonNode& node, } } -// Run one complete aggregation pipeline for a set of files. -// Returns EventAggregatorUtilityOutput after the pipeline completes. -static coro::CoroTask run_aggregation( - const std::vector& input_files, - const AggregationConfig& agg_config, - const std::optional& query, - const std::string& index_dir, std::size_t checkpoint_size, - bool force_rebuild, std::size_t executor_threads) { +static coro::CoroTask process_file_task( + std::string file_path, coro::ChannelProducer ch, + std::string index_dir, std::size_t checkpoint_size, bool force_rebuild, + AggregationConfig agg_config, std::optional query, + std::atomic* global_chunk_idx_ptr) { constexpr std::size_t CHUNK_SIZE_MB = 4; constexpr std::size_t BATCH_SIZE_MB = 4; - auto pipeline_config = PipelineConfig() - .with_name("DFTracer Comparator Aggregation") - .with_compute_threads(executor_threads) - .with_watchdog(false); - Pipeline pipeline(pipeline_config); + [[maybe_unused]] auto producer_guard = ch.guard(); + + std::string index_path = + composites::dft::internal::determine_index_path(file_path, index_dir); + + auto meta_input = + composites::dft::MetadataCollectorUtilityInput::from_file(file_path) + .with_checkpoint_size(checkpoint_size) + .with_force_rebuild(force_rebuild) + .with_index(index_path); + auto metadata = + co_await composites::dft::MetadataCollectorUtility{}.process( + meta_input); + + if (!metadata.success) { + DFTRACER_UTILS_LOG_WARN("Skipping file: %s", file_path.c_str()); + co_return; + } + + FileChunkMapperUtility file_mapper; + auto mapper_input = FileChunkMapperInput::from_metadata(metadata) + .with_config(agg_config) + .with_checkpoint_size(checkpoint_size) + .with_target_chunk_size(CHUNK_SIZE_MB) + .with_batch_size(BATCH_SIZE_MB * 1024 * 1024); + mapper_input.query = query; + auto file_chunks = co_await file_mapper.process(mapper_input); + + int start_idx = + 
global_chunk_idx_ptr->fetch_add(static_cast(file_chunks.size())); + for (int i = 0; i < static_cast(file_chunks.size()); ++i) { + file_chunks[i].chunk_index = start_idx + i; + } - EventAggregatorUtility merger; + for (auto& chunk : file_chunks) { + if (!co_await ch.send(std::move(chunk))) { + co_return; + } + } + co_return; +} + +static coro::CoroTask chunk_worker_task( + std::shared_ptr> chunk_chan, + coro::ChannelProducer rp, + std::shared_ptr> result_chan, + CoroScope* wctx_ptr) { + [[maybe_unused]] auto producer_guard = rp.guard(); + while (auto input = co_await wctx_ptr->receive(chunk_chan)) { + ChunkAggregatorUtility agg; + auto output = co_await agg.process(*input); + if (!co_await result_chan->send(std::move(output))) { + co_return; + } + } + co_return; +} + +static coro::CoroTask run_aggregation( + CoroScope& ctx, const std::vector& input_files, + const AggregationConfig& agg_config, + const std::optional& query, + const std::string& index_dir, std::size_t checkpoint_size, + bool force_rebuild, std::size_t executor_threads) { + EventAggregator merger; std::atomic global_chunk_idx{0}; - auto streaming_task = make_task( - [&](CoroScope& ctx) -> coro::CoroTask { - auto chunk_chan = coro::make_channel(0); - auto result_chan = coro::make_channel(2); - - co_await ctx.scope([&](CoroScope& scope) -> coro::CoroTask { - for (const auto& file_path : input_files) { - auto* global_chunk_idx_ptr = &global_chunk_idx; - scope.spawn([file_path, ch = chunk_chan->producer(), - index_dir, checkpoint_size, force_rebuild, - agg_config, query, global_chunk_idx_ptr]( - CoroScope& /*fctx*/) mutable - -> coro::CoroTask { - [[maybe_unused]] auto producer_guard = ch.guard(); - - std::string index_path = - composites::dft::internal::determine_index_path( - file_path, index_dir); - - auto meta_input = - composites::dft::MetadataCollectorUtilityInput:: - from_file(file_path) - .with_checkpoint_size(checkpoint_size) - .with_force_rebuild(force_rebuild) - .with_index(index_path); - auto 
metadata = - co_await composites::dft::MetadataCollectorUtility{} - .process(meta_input); - - if (!metadata.success) { - DFTRACER_UTILS_LOG_WARN("Skipping file: %s", - file_path.c_str()); - co_return; - } - - FileChunkMapperUtility file_mapper; - auto mapper_input = - FileChunkMapperInput::from_metadata(metadata) - .with_config(agg_config) - .with_checkpoint_size(checkpoint_size) - .with_target_chunk_size(CHUNK_SIZE_MB) - .with_batch_size(BATCH_SIZE_MB * 1024 * 1024); - mapper_input.query = query; - auto file_chunks = - co_await file_mapper.process(mapper_input); - - int start_idx = global_chunk_idx_ptr->fetch_add( - static_cast(file_chunks.size())); - for (int i = 0; - i < static_cast(file_chunks.size()); ++i) { - file_chunks[i].chunk_index = start_idx + i; - } - - for (auto& chunk : file_chunks) { - if (!co_await ch.send(std::move(chunk))) { - co_return; - } - } - co_return; - }); - } + co_await ctx.scope([&](CoroScope& scope) -> coro::CoroTask { + auto chunk_chan = coro::make_channel(0); + auto result_chan = coro::make_channel(2); + + for (const auto& file_path : input_files) { + auto* global_chunk_idx_ptr = &global_chunk_idx; + scope.spawn([file_path, ch = chunk_chan->producer(), index_dir, + checkpoint_size, force_rebuild, agg_config, query, + global_chunk_idx_ptr](CoroScope& /*fctx*/) mutable + -> coro::CoroTask { + co_await process_file_task( + std::move(file_path), std::move(ch), std::move(index_dir), + checkpoint_size, force_rebuild, std::move(agg_config), + std::move(query), global_chunk_idx_ptr); + }); + } - for (std::size_t w = 0; w < executor_threads; ++w) { - (void)w; - scope.spawn( - [chunk_chan, rp = result_chan->producer(), result_chan]( + for (std::size_t w = 0; w < executor_threads; ++w) { + (void)w; + scope.spawn([chunk_chan, rp = result_chan->producer(), result_chan]( CoroScope& wctx) mutable -> coro::CoroTask { - [[maybe_unused]] auto producer_guard = rp.guard(); - while (auto input = - co_await wctx.receive(chunk_chan)) { - 
ChunkAggregatorUtility agg; - auto output = co_await agg.process(*input); - if (!co_await result_chan->send( - std::move(output))) { - co_return; - } - } - co_return; - }); - } - - auto* merger_ptr = &merger; - scope.spawn([result_chan, merger_ptr]( - CoroScope& mctx) -> coro::CoroTask { - while (auto output = co_await mctx.receive(result_chan)) { - merger_ptr->merge_chunk(std::move(*output)); - } - co_return; - }); + co_await chunk_worker_task(chunk_chan, std::move(rp), + result_chan, &wctx); + }); + } + auto* merger_ptr = &merger; + scope.spawn( + [result_chan, merger_ptr](CoroScope& mctx) -> coro::CoroTask { + while (auto output = co_await mctx.receive(result_chan)) { + merger_ptr->merge_chunk(std::move(*output)); + } co_return; }); - co_return; - }, - "StreamingAggregate"); - - EventAggregatorUtilityOutput result; - auto post_task = make_task( - [&](CoroScope& /*ctx*/) -> coro::CoroTask { - result = merger.finalize(); - co_return result.success; - }, - "Finalize"); + co_return; + }); - post_task->depends_on(streaming_task); - pipeline.set_source(streaming_task); - pipeline.set_destination(post_task); - pipeline.execute(); + co_return merger.finalize(); +} - co_return result; +struct AggSpec { + AggregationConfig agg_cfg; + std::optional query; + const ComparisonNode* visitor; +}; + +struct NodeAggPlan { + ComparisonNode root; + std::vector specs; +}; + +static coro::CoroTask run_all_aggregations( + CoroScope& ctx, const std::vector& files, + const std::vector& plans, + std::vector>& results, + const std::string& index_dir, const ComparisonConfig& config) { + results.resize(plans.size()); + for (std::size_t ni = 0; ni < plans.size(); ++ni) { + const auto& plan = plans[ni]; + results[ni].resize(plan.specs.size()); + for (std::size_t vi = 0; vi < plan.specs.size(); ++vi) { + const auto& spec = plan.specs[vi]; + results[ni][vi] = co_await run_aggregation( + ctx, files, spec.agg_cfg, spec.query, index_dir, + config.checkpoint_size, config.force_rebuild, + 
config.executor_threads); + } + } } } // namespace -static coro::CoroTask run_comparator(argparse::ArgumentParser& program) { - std::string config_path = program.get("--config"); - std::string baseline_path = program.get("--baseline"); - std::string variant_path = program.get("--variant"); - std::string query_str = program.get("--query"); - std::string group_by_str = program.get("--group-by"); - std::string format = program.get("--format"); - bool no_color = program.get("--no-color"); - std::size_t executor_threads = - program.get("--executor-threads"); - std::string index_dir = program.get("--index-dir"); - bool force_rebuild = program.get("--force"); - std::size_t checkpoint_size = program.get("--checkpoint-size"); - double threshold = program.get("--threshold"); - double time_interval_ms = program.get("--time-interval"); +static int run_comparator(const ComparatorArgParse* cli) { + const auto& config_path = cli->config_path; + const auto& baseline_path = cli->baseline; + const auto& variant_path = cli->variant; + const auto& query_str = cli->query_args.query; + const auto& group_by_str = cli->group_by; + auto format = cli->format; + auto no_color = cli->no_color; + auto force_rebuild = cli->indexing.force; + auto checkpoint_size = cli->indexing.checkpoint_size; + auto threshold = cli->threshold; + auto time_interval_ms = cli->time_interval; ComparisonConfig config; if (!config_path.empty()) { @@ -201,7 +303,7 @@ static coro::CoroTask run_comparator(argparse::ArgumentParser& program) { auto parsed = ComparisonConfig::from_json_file(config_path, error); if (!parsed) { DFTRACER_UTILS_LOG_ERROR("Config error: %s", error.c_str()); - co_return 1; + return 1; } config = std::move(*parsed); } else if (!baseline_path.empty() && !variant_path.empty()) { @@ -210,15 +312,18 @@ static coro::CoroTask run_comparator(argparse::ArgumentParser& program) { } else { DFTRACER_UTILS_LOG_ERROR( "Must specify --config or both --baseline and --variant"); - co_return 1; + return 1; } - 
// CLI overrides if (!format.empty()) config.format = format; config.no_color = no_color; - if (executor_threads > 0) config.executor_threads = executor_threads; + if (cli->pipeline.executor_threads > 0) + config.executor_threads = cli->pipeline.executor_threads; if (checkpoint_size > 0) config.checkpoint_size = checkpoint_size; - if (!index_dir.empty()) config.index_dir = index_dir; + if (!cli->baseline_index_dir.empty()) + config.baseline_index_dir = cli->baseline_index_dir; + if (!cli->variant_index_dir.empty()) + config.variant_index_dir = cli->variant_index_dir; if (force_rebuild) config.force_rebuild = force_rebuild; if (threshold > 0.0) config.defaults.threshold_pct = threshold; if (time_interval_ms > 0.0) @@ -229,207 +334,304 @@ static coro::CoroTask run_comparator(argparse::ArgumentParser& program) { if (config.executor_threads == 0) { config.executor_threads = dftracer_utils_hardware_concurrency(); } + if (config.checkpoint_size == 0) { config.checkpoint_size = indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE; } - std::string temp_index_dir; - if (config.index_dir.empty()) { - try { - auto temp_path = fs::temp_directory_path(); - temp_path /= "dftracer_cmp_" + std::to_string(std::time(nullptr)) + - "_" + std::to_string(getpid()); - temp_index_dir = temp_path.string(); - fs::create_directories(temp_index_dir); - config.index_dir = temp_index_dir; - } catch (const fs::filesystem_error& e) { - temp_index_dir = "/tmp/dftracer_cmp_" + - std::to_string(std::time(nullptr)) + "_" + - std::to_string(getpid()); - fs::create_directories(temp_index_dir); - config.index_dir = temp_index_dir; - DFTRACER_UTILS_LOG_WARN( - "Failed to get system temp directory, using /tmp: %s", - e.what()); - } - } + // Precompute aggregation plans from config (needed by both Agg tasks) + std::vector agg_plans; + for (auto& node : config.nodes) { + NodeAggPlan plan; + plan.root = node; - // Enumerate files for both sides - auto enumerate_files = [](const std::string& path) - -> 
coro::CoroTask> { - std::vector files; - if (fs::is_regular_file(path)) { - files.push_back(path); - co_return files; - } - filesystem::PatternDirectoryScannerUtility scanner; - filesystem::PatternDirectoryScannerUtilityInput scan_input{ - path, {".pfw", ".pfw.gz"}, false}; - auto entries = co_await scanner.process(scan_input); - files.reserve(entries.size()); - for (const auto& e : entries) { - files.push_back(e.path.string()); - } - co_return files; - }; + std::vector visitors; + flatten_nodes(node, visitors); - auto baseline_files = co_await enumerate_files(config.baseline); - auto variant_files = co_await enumerate_files(config.variant); + for (const auto* visitor : visitors) { + AggSpec spec; + if (!visitor->composed_query.empty()) { + auto result = + common::query::Query::from_string(visitor->composed_query); + if (!result) { + DFTRACER_UTILS_LOG_ERROR("Invalid query for node '%s': %s", + visitor->name.c_str(), + result.error().format().c_str()); + return 1; + } + spec.query = std::move(*result); + } - if (baseline_files.empty()) { - DFTRACER_UTILS_LOG_ERROR("No trace files found in baseline: %s", - config.baseline.c_str()); - co_return 1; - } - if (variant_files.empty()) { - DFTRACER_UTILS_LOG_ERROR("No trace files found in variant: %s", - config.variant.c_str()); - co_return 1; + spec.agg_cfg.time_interval_us = static_cast( + config.defaults.time_interval_ms * 1000.0); + spec.agg_cfg.extra_group_keys = {}; + spec.agg_cfg.compute_statistics = true; + spec.agg_cfg.compute_percentiles = true; + spec.agg_cfg.percentiles = visitor->resolved_percentiles; + spec.agg_cfg.sketch_accuracy = 0.01; + spec.agg_cfg.track_process_parents = false; + spec.visitor = visitor; + + plan.specs.push_back(std::move(spec)); + } + agg_plans.push_back(std::move(plan)); } - // Build indexes upfront so parallel aggregation doesn't race on - // `.dftindex`. 
- { - if (config.force_rebuild && !baseline_files.empty()) { - const std::string shared_index_path = - composites::dft::internal::determine_index_path( - baseline_files.front(), config.index_dir); - if (fs::exists(shared_index_path)) { - DFTRACER_UTILS_LOG_INFO("Clearing shared index store: %s", - shared_index_path.c_str()); - fs::remove_all(shared_index_path); - } + auto pipeline_config = + cli::build_pipeline_config("DFTracer Comparator", cli->pipeline); + Pipeline pipeline(pipeline_config); + + auto resolve_and_build = + [&config](CoroScope& scope, const std::string& path, + const std::string& index_dir, + std::vector& out_files) -> coro::CoroTask { + if (!fs::exists(path)) { + DFTRACER_UTILS_LOG_ERROR("Path does not exist: %s", path.c_str()); + co_return; } - std::unordered_set seen; - std::vector all_files; - for (const auto& f : baseline_files) { - if (seen.insert(f).second) all_files.push_back(f); + + IndexResolverUtility resolver; + ResolverInput resolve_input; + resolve_input.index_dir = index_dir; + resolve_input.require_checkpoints = !config.force_rebuild; + if (fs::is_regular_file(path)) { + resolve_input.files = {path}; + } else { + resolve_input.directory = path; } - for (const auto& f : variant_files) { - if (seen.insert(f).second) all_files.push_back(f); + + auto result = co_await resolver.process(resolve_input); + out_files = std::move(result.all_files); + + if (out_files.empty()) { + DFTRACER_UTILS_LOG_ERROR("No trace files found in: %s", + path.c_str()); + co_return; } - DFTRACER_UTILS_LOG_INFO("Building indexes for %zu unique files...", - all_files.size()); - std::vector idx_configs; - idx_configs.reserve(all_files.size()); - for (const auto& file_path : all_files) { - idx_configs.push_back( - indexer::IndexBuildConfig::for_file(file_path) - .with_checkpoint_size(config.checkpoint_size) - .with_force_rebuild(false) - .with_index_dir(config.index_dir)); + + if (result.needs_checkpoint.empty()) { + DFTRACER_UTILS_LOG_INFO("All %zu files already 
indexed", + out_files.size()); + co_return; } - std::vector> idx_tasks; - idx_tasks.reserve(idx_configs.size()); - for (const auto& cfg : idx_configs) { - idx_tasks.push_back(indexer::IndexBuilderUtility{}.process(cfg)); + + auto batch_cfg = std::make_shared(); + batch_cfg->file_paths.reserve(result.needs_checkpoint.size()); + for (const auto& item : result.needs_checkpoint) { + batch_cfg->file_paths.push_back(item.file_path); } - co_await coro::when_all(std::move(idx_tasks)); + batch_cfg->index_dir = index_dir; + batch_cfg->checkpoint_size = config.checkpoint_size; + batch_cfg->parallelism = config.executor_threads; + batch_cfg->force_rebuild = config.force_rebuild; + batch_cfg->use_batch_write = true; + batch_cfg->rebuild_root_summaries = true; + + DFTRACER_UTILS_LOG_INFO("Indexing %zu of %zu files...", + result.needs_checkpoint.size(), + out_files.size()); + co_await IndexBatchBuilderUtility::process(&scope, + std::move(batch_cfg)); + }; + + // Shared state between tasks + std::vector baseline_files; + std::vector variant_files; + std::vector> baseline_results; + std::vector> variant_results; + + auto baseline_index_path = composites::dft::internal::determine_index_path( + config.baseline, config.baseline_index_dir); + auto variant_index_path = composites::dft::internal::determine_index_path( + config.variant, config.variant_index_dir); + bool shared_index = baseline_index_path == variant_index_path; + + std::shared_ptr enum_index_base; + std::shared_ptr enum_index_var; + + if (shared_index) { + auto enum_index_shared = make_task( + [&config, &baseline_files, &variant_files, + &resolve_and_build](CoroScope& scope) -> coro::CoroTask { + co_await resolve_and_build(scope, config.baseline, + config.baseline_index_dir, + baseline_files); + if (config.baseline == config.variant) { + variant_files = baseline_files; + } else { + co_await resolve_and_build(scope, config.variant, + config.variant_index_dir, + variant_files); + } + }, + "EnumIndex"); + enum_index_base = 
enum_index_shared; + enum_index_var = enum_index_shared; + } else { + enum_index_base = make_task( + [&config, &baseline_files, + &resolve_and_build](CoroScope& scope) -> coro::CoroTask { + co_await resolve_and_build(scope, config.baseline, + config.baseline_index_dir, + baseline_files); + }, + "EnumIndexBaseline"); + + enum_index_var = make_task( + [&config, &variant_files, + &resolve_and_build](CoroScope& scope) -> coro::CoroTask { + co_await resolve_and_build(scope, config.variant, + config.variant_index_dir, + variant_files); + }, + "EnumIndexVariant"); } + std::shared_ptr agg_base; + std::shared_ptr agg_var; + + bool same_files = shared_index && config.baseline == config.variant; + if (same_files) { + auto agg_shared = make_task( + [&baseline_files, &baseline_results, &variant_results, &agg_plans, + &config](CoroScope& ctx) -> coro::CoroTask { + if (baseline_files.empty()) co_return; + co_await run_all_aggregations( + ctx, baseline_files, agg_plans, baseline_results, + config.baseline_index_dir, config); + variant_results = baseline_results; + }, + "Aggregate"); + agg_shared->depends_on(enum_index_base); + agg_base = agg_shared; + agg_var = agg_shared; + } else { + agg_base = make_task( + [&baseline_files, &baseline_results, &agg_plans, + &config](CoroScope& ctx) -> coro::CoroTask { + if (baseline_files.empty()) co_return; + co_await run_all_aggregations( + ctx, baseline_files, agg_plans, baseline_results, + config.baseline_index_dir, config); + }, + "AggBaseline"); + agg_base->depends_on(enum_index_base); + + agg_var = make_task( + [&variant_files, &variant_results, &agg_plans, + &config](CoroScope& ctx) -> coro::CoroTask { + if (variant_files.empty()) co_return; + co_await run_all_aggregations(ctx, variant_files, agg_plans, + variant_results, + config.variant_index_dir, config); + }, + "AggVariant"); + agg_var->depends_on(enum_index_var); + } + + // Compare (depends on both Agg tasks) ComparisonOutput output; output.baseline_path = config.baseline; 
output.variant_path = config.variant; - output.baseline_file_count = baseline_files.size(); - output.variant_file_count = variant_files.size(); + int result_code = 0; + + auto compare_task = make_task( + [&config, &baseline_files, &variant_files, &baseline_results, + &variant_results, &agg_plans, &output, &result_code]( + [[maybe_unused]] CoroScope& ctx) -> coro::CoroTask { + if (baseline_files.empty() || variant_files.empty()) { + result_code = 1; + co_return; + } - auto start_time = std::chrono::high_resolution_clock::now(); + output.baseline_file_count = baseline_files.size(); + output.variant_file_count = variant_files.size(); - for (auto& node : config.nodes) { - std::vector visitors; - flatten_nodes(node, visitors); + auto start_time = std::chrono::high_resolution_clock::now(); - std::vector pairs; - pairs.reserve(visitors.size()); + for (std::size_t ni = 0; ni < agg_plans.size(); ++ni) { + const auto& plan = agg_plans[ni]; + std::vector pairs; + pairs.reserve(plan.specs.size()); - for (const auto* visitor : visitors) { - using common::query::Query; - std::optional query; - if (!visitor->composed_query.empty()) { - auto result = Query::from_string(visitor->composed_query); - if (!result) { - DFTRACER_UTILS_LOG_ERROR("Invalid query for node '%s': %s", - visitor->name.c_str(), - result.error().format().c_str()); - co_return 1; + for (std::size_t vi = 0; vi < plan.specs.size(); ++vi) { + if (pairs.empty()) { + output.baseline_meta = extract_metadata( + baseline_results[ni][vi].aggregations, + baseline_files.size()); + output.variant_meta = extract_metadata( + variant_results[ni][vi].aggregations, + variant_files.size()); + } + + ComparisonVisitorPair pair; + pair.baseline = std::move(baseline_results[ni][vi]); + pair.variant = std::move(variant_results[ni][vi]); + pair.node = *plan.specs[vi].visitor; + pairs.push_back(std::move(pair)); } - query = std::move(*result); - } - AggregationConfig agg_cfg; - agg_cfg.time_interval_us = static_cast( - 
config.defaults.time_interval_ms * 1000.0); - agg_cfg.extra_group_keys = {}; - agg_cfg.compute_statistics = true; - agg_cfg.compute_percentiles = true; - agg_cfg.percentiles = visitor->resolved_percentiles; - agg_cfg.sketch_accuracy = 0.01; - agg_cfg.track_process_parents = false; - - auto [base_result, var_result] = co_await coro::when_all( - run_aggregation(baseline_files, agg_cfg, query, - config.index_dir, config.checkpoint_size, - config.force_rebuild, config.executor_threads), - run_aggregation(variant_files, agg_cfg, query, config.index_dir, - config.checkpoint_size, config.force_rebuild, - config.executor_threads)); - - // Extract metadata from first visitor (broadest query) - if (pairs.empty()) { - output.baseline_meta = extract_metadata( - base_result.aggregations, baseline_files.size()); - output.variant_meta = extract_metadata(var_result.aggregations, - variant_files.size()); + ComparisonUtilityInput cmp_input; + cmp_input.visitors = std::move(pairs); + cmp_input.root_node = plan.root; + cmp_input.baseline_file_count = baseline_files.size(); + cmp_input.variant_file_count = variant_files.size(); + + ComparisonUtility cmp; + auto cmp_output = co_await cmp.process(cmp_input); + output.nodes.push_back(std::move(cmp_output.result)); } - ComparisonVisitorPair pair; - pair.baseline = std::move(base_result); - pair.variant = std::move(var_result); - pair.node = *visitor; - pairs.push_back(std::move(pair)); - } + auto meta_rows = build_metadata_metrics(output.baseline_meta, + output.variant_meta); + for (auto& n : output.nodes) { + n.summary.metrics.insert(n.summary.metrics.begin(), + meta_rows.begin(), meta_rows.end()); + } - ComparisonUtilityInput cmp_input; - cmp_input.visitors = std::move(pairs); - cmp_input.root_node = node; - cmp_input.baseline_file_count = baseline_files.size(); - cmp_input.variant_file_count = variant_files.size(); + auto end_time = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration = + end_time - start_time; + 
output.execution_time_ms = duration.count(); + + if (config.format == "json") { + TreeTableFormatter formatter; + std::printf("%s\n", formatter.render_json(output).c_str()); + } else { + bool is_tty = isatty(fileno(stdout)); + FormatterOptions fmt_opts; + fmt_opts.use_color = is_tty && !config.no_color; + fmt_opts.use_unicode = is_tty; + TreeTableFormatter formatter(fmt_opts); + formatter.render(stdout, output); + } - ComparisonUtility cmp; - auto cmp_output = co_await cmp.process(cmp_input); - output.nodes.push_back(std::move(cmp_output.result)); - } + co_return; + }, + "Compare"); - // Inject metadata rows into root SUMMARY. - auto meta_rows = - build_metadata_metrics(output.baseline_meta, output.variant_meta); - for (auto& node : output.nodes) { - node.summary.metrics.insert(node.summary.metrics.begin(), - meta_rows.begin(), meta_rows.end()); + if (same_files) { + compare_task->depends_on(agg_base); + } else { + compare_task->depends_on({agg_base, agg_var}); } - auto end_time = std::chrono::high_resolution_clock::now(); - std::chrono::duration duration = end_time - start_time; - output.execution_time_ms = duration.count(); - - if (config.format == "json") { - TreeTableFormatter formatter; - std::printf("%s\n", formatter.render_json(output).c_str()); + if (shared_index) { + pipeline.set_source(enum_index_base); } else { - bool is_tty = isatty(fileno(stdout)); - FormatterOptions fmt_opts; - fmt_opts.use_color = is_tty && !config.no_color; - fmt_opts.use_unicode = is_tty; - TreeTableFormatter formatter(fmt_opts); - formatter.render(stdout, output); + pipeline.set_source({enum_index_base, enum_index_var}); } + pipeline.set_destination(compare_task); - if (!temp_index_dir.empty() && fs::exists(temp_index_dir)) { - fs::remove_all(temp_index_dir); + try { + pipeline.execute(); + } catch (const PipelineError& e) { + DFTRACER_UTILS_LOG_ERROR("Pipeline failed: %s", e.what()); + result_code = 1; } - co_return 0; + return result_code; } int main(int argc, char** argv) { @@ 
-440,67 +642,9 @@ int main(int argc, char** argv) { program.add_description( "Compare DFTracer trace metrics between baseline and variant"); - program.add_argument("--config") - .help("JSON config file for hierarchical comparison") - .default_value(""); - - program.add_argument("--baseline") - .help("Baseline trace file or directory") - .default_value(""); - - program.add_argument("--variant") - .help("Variant trace file or directory") - .default_value(""); - - program.add_argument("--query") - .help("Query filter (default: all events)") - .default_value(""); - - program.add_argument("--group-by") - .help("Comma-separated group keys (default: cat,name)") - .default_value(""); - - program.add_argument("--format") - .help("Output format: table (default) or json") - .default_value("table"); - - program.add_argument("-t", "--time-interval") - .help("Time interval in milliseconds for bucketing (default: 5000)") - .scan<'g', double>() - .default_value(5000.0); - - program.add_argument("--threshold") - .help("Hide changes below this percentage") - .scan<'g', double>() - .default_value(0.0); - - program.add_argument("--no-color").help("Disable ANSI color output").flag(); - - program.add_argument("--executor-threads") - .help("Number of parallel threads (default: auto)") - .scan<'d', std::size_t>() - .default_value( - static_cast(dftracer_utils_hardware_concurrency())); - - program.add_argument("--index-dir") - .help("Directory for index files (default: temp)") - .default_value(""); - - program.add_argument("--force").help("Force index rebuild").flag(); - - program.add_argument("--checkpoint-size") - .help("Checkpoint size for indexing in bytes") - .scan<'d', std::size_t>() - .default_value(static_cast( - indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE)); - - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - DFTRACER_UTILS_LOG_ERROR("Error: %s", err.what()); - std::fprintf(stderr, "%s\n", program.help().str().c_str()); - return 1; - } + 
ComparatorArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; - return run_comparator(program).get(); + return run_comparator(&cli); } diff --git a/src/dftracer/utils/binaries/dftracer_event_count.cpp b/src/dftracer/utils/binaries/dftracer_event_count.cpp index c5e91c21..cecbb19c 100644 --- a/src/dftracer/utils/binaries/dftracer_event_count.cpp +++ b/src/dftracer/utils/binaries/dftracer_event_count.cpp @@ -1,206 +1,254 @@ #include -#include #include -#include #include #include #include -#include #include #include +#include +#include +#include #include #include #include #include -#include -#include #include -#include +#include #include +#include +#include +#include +#include + +#include "common_cli.h" using namespace dftracer::utils; -using namespace dftracer::utils::utilities::indexer::internal; +using namespace dftracer::utils::utilities::composites::dft::indexing; +using dftracer::utils::utilities::fileio::lines::sources:: + async_streaming_gz_lines; +using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility; +using dftracer::utils::utilities::indexer::IndexBuildBatchConfig; + +class EventCountArgParse : public cli::ArgParse { + public: + cli::DirectoryArgs directory; + cli::PipelineArgs pipeline; + cli::IndexingArgs indexing; + + explicit EventCountArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + indexing.index_dir_help = + "Directory to store index files (default: system temp directory)"; + schema(directory, pipeline, indexing); + } +}; + +static int run_event_count(const EventCountArgParse* cli); + +struct EventCountBatchResult { + std::size_t total_events = 0; + std::size_t files_processed = 0; + bool is_approximate = false; +}; + +static EventCountBatchResult process_index_group_event_counts_sync( + std::string index_path, std::vector entries) { + std::vector file_ids; + file_ids.reserve(entries.size()); + for (const auto& entry : entries) { + file_ids.push_back(entry.file_id); + } + + EventCountBatchResult 
batch_result; + + utilities::indexer::IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + + auto metadata_rows = db.query_file_metadata_batch(file_ids); + auto merged_stats = db.query_merged_statistics_batch(file_ids); + + for (const auto file_id : file_ids) { + auto merged_it = merged_stats.find(file_id); + if (merged_it != merged_stats.end() && + merged_it->second.num_chunks > 0) { + batch_result.total_events += + static_cast(merged_it->second.stats.total_events); + batch_result.files_processed++; + continue; + } + + auto metadata_it = metadata_rows.find(file_id); + if (metadata_it != metadata_rows.end()) { + batch_result.total_events += + static_cast(metadata_it->second.num_lines); + batch_result.files_processed++; + batch_result.is_approximate = true; + } + } -static coro::CoroTask run_event_count(argparse::ArgumentParser& program); + return batch_result; +} + +static coro::CoroTask process_index_group_event_counts( + std::shared_ptr index_path, + std::shared_ptr> entries) { + co_return process_index_group_event_counts_sync(std::move(*index_path), + std::move(*entries)); +} int main(int argc, char** argv) { DFTRACER_UTILS_LOGGER_INIT(); - auto default_checkpoint_size_str = - std::to_string(Indexer::DEFAULT_CHECKPOINT_SIZE) + " B (" + - std::to_string(Indexer::DEFAULT_CHECKPOINT_SIZE / (1024 * 1024)) + - " MB)"; - argparse::ArgumentParser program("dftracer_event_count", DFTRACER_UTILS_PACKAGE_VERSION); program.add_description( "Count valid events in DFTracer .pfw or .pfw.gz files using composable " "utilities and pipeline processing"); - program.add_argument("-d", "--directory") - .help("Directory containing .pfw or .pfw.gz files") - .default_value("."); - - program.add_argument("-f", "--force").help("Force index recreation").flag(); - - program.add_argument("-c", "--checkpoint-size") - .help("Checkpoint size for indexing in bytes (default: " + - default_checkpoint_size_str + ")") - .scan<'d', std::size_t>() - 
.default_value( - static_cast(Indexer::DEFAULT_CHECKPOINT_SIZE)); - - program.add_argument("--executor-threads") - .help( - "Number of executor threads for parallel processing (default: " - "number of CPU cores)") - .scan<'d', std::size_t>() - .default_value( - static_cast(dftracer_utils_hardware_concurrency())); - - program.add_argument("--index-dir") - .help("Directory to store index files (default: system temp directory)") - .default_value(""); - - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - DFTRACER_UTILS_LOG_ERROR("Error occurred: %s", err.what()); - std::cerr << program; - return 1; - } + EventCountArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; - return run_event_count(program).get(); + return run_event_count(&cli); } -static coro::CoroTask run_event_count(argparse::ArgumentParser& program) { - // Parse arguments - std::string log_dir = program.get("--directory"); - bool force_rebuild = program.get("--force"); - std::size_t checkpoint_size = program.get("--checkpoint-size"); - std::size_t executor_threads = - program.get("--executor-threads"); - std::string index_dir = program.get("--index-dir"); - - // If no index dir specified, indices are stored next to trace files - // (default IndexBuilderUtility behavior). This allows reuse of - // indices built by dftracer_index. 
- - log_dir = fs::absolute(log_dir).string(); - - // Discover input files - utilities::filesystem::PatternDirectoryScannerUtility scanner; - utilities::filesystem::PatternDirectoryScannerUtilityInput scan_input{ - log_dir, {".pfw", ".pfw.gz"}}; - auto matched_entries = scanner.process(scan_input).get(); - - std::vector input_files; - input_files.reserve(matched_entries.size()); - for (const auto& entry : matched_entries) { - input_files.push_back(entry.path.string()); - } +static int run_event_count(const EventCountArgParse* cli) { + const auto log_dir = fs::absolute(cli->directory.value).string(); + const auto index_dir = cli->indexing.index_dir; + const auto checkpoint_size = cli->indexing.checkpoint_size; + const auto force_rebuild = cli->indexing.force; + const auto executor_threads = cli->pipeline.executor_threads; + + IndexResolverUtility resolver; + ResolverInput resolve_input; + resolve_input.directory = log_dir; + resolve_input.index_dir = index_dir; + resolve_input.require_checkpoints = !force_rebuild; - if (input_files.empty()) { + auto resolve_result = resolver.process(resolve_input).get(); + + if (resolve_result.all_files.empty()) { DFTRACER_UTILS_LOG_ERROR("No .pfw or .pfw.gz files found in: %s", log_dir.c_str()); - co_return 1; + return 1; } - auto pipeline_config = PipelineConfig() - .with_name("DFTracer Event Count") - .with_compute_threads(executor_threads) - .with_watchdog(false); - - Pipeline pipeline(pipeline_config); - auto start_time = std::chrono::high_resolution_clock::now(); std::atomic total_events{0}; std::atomic files_processed{0}; std::atomic is_approximate{false}; - auto count_task = make_task( - [&](CoroScope& ctx) -> coro::CoroTask { - co_await ctx.scope([&](CoroScope& scope) -> coro::CoroTask { - auto* files_ptr = &input_files; - auto* total_events_ptr = &total_events; - auto* files_processed_ptr = &files_processed; - auto* is_approximate_ptr = &is_approximate; - auto file_chan = - coro::make_channel(executor_threads * 2); - - // 
Producer - scope.spawn([ch = file_chan->producer(), - num_files = input_files.size()]( - CoroScope&) mutable -> coro::CoroTask { - auto guard = ch.guard(); - for (std::size_t i = 0; i < num_files; ++i) { - if (!co_await ch.send(i)) co_return; - } - co_return; - }); + std::vector direct_scan_items; + std::vector indexed_entries = + std::move(resolve_result.cached); + std::string index_path = resolve_result.index_path; - // Workers: build index if needed, then read count from DB - for (std::size_t w = 0; w < executor_threads; ++w) { - scope.spawn([file_chan, files_ptr, checkpoint_size, - force_rebuild, &index_dir, total_events_ptr, - files_processed_ptr, is_approximate_ptr]( - CoroScope&) -> coro::CoroTask { - while (auto fi_opt = co_await file_chan->receive()) { - const auto& fp = (*files_ptr)[*fi_opt]; - - // Build index if needed - utilities::indexer::IndexBuilderUtility builder; - auto config = - utilities::indexer::IndexBuildConfig::for_file( - fp) - .with_checkpoint_size(checkpoint_size) - .with_force_rebuild(force_rebuild) - .with_index_dir(index_dir); - co_await builder.process(config); - - // Read event count from index - std::string index_path = - fp + constants::indexer::EXTENSION; - if (!index_dir.empty()) { - auto fname = fs::path(fp).filename(); - index_path = - (fs::path(index_dir) / fname).string() + - constants::indexer::EXTENSION; - } + std::vector needs_checkpoint = + std::move(resolve_result.needs_checkpoint); + + auto pipeline_config = + cli::build_pipeline_config("DFTracer Event Count", cli->pipeline); + Pipeline pipeline(pipeline_config); - if (fs::exists(index_path)) { - try { - utilities::indexer::IndexDatabase db( - index_path); - int fid = db.find_file(fp); - if (fid >= 0) { - if (!db.has_bloom_data(fid)) { - is_approximate_ptr->store( - true, - std::memory_order_relaxed); - } - total_events_ptr->fetch_add( - db.get_total_events(fid), - std::memory_order_relaxed); - files_processed_ptr->fetch_add( - 1, std::memory_order_relaxed); - 
continue; - } - } catch (...) { + auto build_task = make_task( + [&needs_checkpoint, index_dir, checkpoint_size, executor_threads, + force_rebuild](CoroScope& scope) -> coro::CoroTask { + if (needs_checkpoint.empty()) { + co_return; + } + auto batch_config = std::make_shared(); + batch_config->file_paths.reserve(needs_checkpoint.size()); + for (const auto& item : needs_checkpoint) { + batch_config->file_paths.push_back(item.file_path); + } + batch_config->index_dir = index_dir; + batch_config->checkpoint_size = checkpoint_size; + batch_config->parallelism = executor_threads; + batch_config->force_rebuild = force_rebuild; + batch_config->use_batch_write = true; + batch_config->rebuild_root_summaries = true; + co_await IndexBatchBuilderUtility::process(&scope, + std::move(batch_config)); + }, + "BatchIndex"); + + auto count_task = make_task( + [&needs_checkpoint, &indexed_entries, &direct_scan_items, &total_events, + &files_processed, &is_approximate, index_dir, index_path, + executor_threads](CoroScope& ctx) -> coro::CoroTask { + if (!needs_checkpoint.empty()) { + IndexResolverUtility re_resolver; + ResolverInput refresh_input; + std::vector newly_indexed; + newly_indexed.reserve(needs_checkpoint.size()); + for (const auto& item : needs_checkpoint) { + newly_indexed.push_back(item.file_path); + } + refresh_input.files = std::move(newly_indexed); + refresh_input.index_dir = index_dir; + refresh_input.require_checkpoints = true; + + auto refresh_result = + co_await re_resolver.process(refresh_input); + for (auto& entry : refresh_result.cached) { + indexed_entries.push_back(std::move(entry)); + } + for (auto& item : refresh_result.needs_checkpoint) { + direct_scan_items.push_back(std::move(item)); + } + } + + if (!indexed_entries.empty()) { + auto idx_path_ptr = std::make_shared(index_path); + auto entries_ptr = std::make_shared>( + std::move(indexed_entries)); + try { + auto batch_result = + co_await process_index_group_event_counts( + std::move(idx_path_ptr), 
std::move(entries_ptr)); + total_events.fetch_add(batch_result.total_events, + std::memory_order_relaxed); + files_processed.fetch_add(batch_result.files_processed, + std::memory_order_relaxed); + if (batch_result.is_approximate) { + is_approximate.store(true, std::memory_order_relaxed); + } + } catch (...) { + is_approximate.store(true, std::memory_order_relaxed); + } + } + + if (!direct_scan_items.empty()) { + is_approximate.store(true, std::memory_order_relaxed); + co_await ctx.scope([&](CoroScope& scope) + -> coro::CoroTask { + auto file_chan = + coro::make_channel(executor_threads * 2); + + scope.spawn( + [ch = file_chan->producer(), + items_ptr = &direct_scan_items]( + CoroScope&) mutable -> coro::CoroTask { + auto guard = ch.guard(); + for (const auto& item : *items_ptr) { + if (!co_await ch.send(item)) { + co_return; } } - - // Fallback for small/unindexed files: - // stream decompress and count lines (approximate) - is_approximate_ptr->store( - true, std::memory_order_relaxed); - { - using utilities::fileio::lines::sources:: - async_streaming_gz_lines; + co_return; + }); + + for (std::size_t w = 0; w < executor_threads; ++w) { + scope.spawn([ch = file_chan->consumer(), + total_events_ptr = &total_events, + files_processed_ptr = &files_processed]( + CoroScope&) -> coro::CoroTask { + while (auto item_opt = co_await ch.receive()) { std::size_t count = 0; - auto gen = async_streaming_gz_lines(fp); + auto gen = async_streaming_gz_lines( + item_opt->file_path); while (co_await gen.next()) { ++count; } @@ -209,17 +257,18 @@ static coro::CoroTask run_event_count(argparse::ArgumentParser& program) { files_processed_ptr->fetch_add( 1, std::memory_order_relaxed); } - } - co_return; - }); - } - co_return; - }); + co_return; + }); + } + co_return; + }); + } co_return; }, - "EventCount"); + "Count"); - pipeline.set_source(count_task); + count_task->depends_on(build_task); + pipeline.set_source(build_task); pipeline.set_destination(count_task); pipeline.execute(); @@ 
-235,5 +284,5 @@ static coro::CoroTask run_event_count(argparse::ArgumentParser& program) { DFTRACER_UTILS_LOG_DEBUG("Completed in %.2f ms", duration.count()); DFTRACER_UTILS_LOG_DEBUG("Files processed: %zu", files_processed.load()); - co_return 0; + return 0; } diff --git a/src/dftracer/utils/binaries/dftracer_gen_fake_trace.cpp b/src/dftracer/utils/binaries/dftracer_gen_fake_trace.cpp index eb3f4c45..4c496127 100644 --- a/src/dftracer/utils/binaries/dftracer_gen_fake_trace.cpp +++ b/src/dftracer/utils/binaries/dftracer_gen_fake_trace.cpp @@ -1,9 +1,9 @@ #include #include -#include #include #include -#include +#include +#include #include #include #include @@ -14,35 +14,41 @@ #include #include #include +#include #include #include -#include #include #include #include #include #include +#include "common_cli.h" + using namespace dftracer::utils; using namespace dftracer::utils::utilities; using namespace dftracer::utils::utilities::composites::dft; using namespace dftracer::utils::utilities::composites::dft::indexing; namespace compression = dftracer::utils::utilities::compression; namespace util_io = dftracer::utils::utilities::fileio; -using dftracer::utils::utilities::indexer::IndexBuildConfig; -using dftracer::utils::utilities::indexer::IndexBuilderUtility; +using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility; +using dftracer::utils::utilities::indexer::IndexBuildBatchConfig; using dftracer::utils::utilities::indexer::IndexDatabase; using dftracer::utils::utilities::indexer::internal::get_logical_path; // --------------------------------------------------------------------------- -// TraceWriter – compresses via ManualStreamingCompressorUtility and writes +// TraceWriter - compresses via ManualStreamingCompressorUtility and writes // via StreamingFileWriterUtility. Natural deflate blocks // provide block boundaries for the gzip indexer. 
// --------------------------------------------------------------------------- class TraceWriter { public: - explicit TraceWriter(const std::string& path) : writer_(path) {} + explicit TraceWriter(const std::string& path, + std::size_t flush_threshold = 4 * 1024 * 1024) + : writer_(path), flush_threshold_(flush_threshold) { + buf_.reserve(flush_threshold * 2); + } ~TraceWriter() { close(); } @@ -50,16 +56,26 @@ class TraceWriter { TraceWriter& operator=(const TraceWriter&) = delete; void write(const std::string& s) { - [this, &s]() -> coro::CoroTask { - auto gen = compressor_.compress(ByteView(s)); + buf_ += s; + if (buf_.size() >= flush_threshold_) { + flush(); + } + } + + void flush() { + if (buf_.empty()) return; + [this]() -> coro::CoroTask { + auto gen = compressor_.compress(ByteView(buf_)); while (auto chunk = co_await gen.next()) { co_await writer_.process(*chunk); } }() - .get(); + .get(); + buf_.clear(); } void close() { + flush(); [this]() -> coro::CoroTask { auto gen = compressor_.finalize_stream(); while (auto chunk = co_await gen.next()) { @@ -73,6 +89,8 @@ class TraceWriter { private: compression::zlib::ManualStreamingCompressorUtility compressor_; util_io::StreamingFileWriterUtility writer_; + std::string buf_; + std::size_t flush_threshold_; }; // --------------------------------------------------------------------------- @@ -216,7 +234,7 @@ struct QuerySpec { }; static coro::CoroTask run_verify( - const std::vector& file_paths, + CoroScope& scope, const std::vector& file_paths, const std::vector& queries, std::size_t ckpt_size) { // Extra dimensions: arbitrary dot-paths into args std::vector extra_dims = {"ret", "count", "offset", "epoch", @@ -237,17 +255,33 @@ static coro::CoroTask run_verify( std::printf("Verify: building bloom indices\n"); std::printf("==========================================\n"); + // Batch-build gzip indexes for all files + { + std::vector abs_paths; + abs_paths.reserve(file_paths.size()); + for (const auto& fp : file_paths) 
{ + abs_paths.push_back(fs::absolute(fp).string()); + } + + auto index_path = internal::determine_index_path(abs_paths.front(), ""); + dftracer::utils::rocksdb::RocksDBManager::instance().reset(index_path); + + auto batch_config = std::make_shared(); + batch_config->file_paths = std::move(abs_paths); + batch_config->checkpoint_size = ckpt_size; + batch_config->parallelism = file_paths.size(); + batch_config->use_batch_write = true; + batch_config->rebuild_root_summaries = true; + + co_await IndexBatchBuilderUtility::process(&scope, + std::move(batch_config)); + } + for (const auto& file_path : file_paths) { std::string abs_path = fs::absolute(file_path).string(); - - // 1. Build gzip index std::string index_path = internal::determine_index_path(abs_path, ""); - auto idx_input = IndexBuildConfig::for_file(abs_path) - .with_checkpoint_size(ckpt_size) - .with_force_rebuild(true); - co_await IndexBuilderUtility{}.process(idx_input); - // 2. Collect metadata + // Collect metadata auto meta_input = MetadataCollectorUtilityInput::from_file(abs_path) .with_checkpoint_size(ckpt_size) .with_force_rebuild(false) @@ -265,16 +299,16 @@ static coro::CoroTask run_verify( std::string idx_path_bidx = internal::determine_index_path(abs_path, ""); IndexDatabase idx_db(idx_path_bidx); - idx_db.init_base_schema(); - idx_db.init_bloom_schema(); + auto writer = idx_db.begin_write(); + writer->init_schema(); std::uint64_t file_hash_val = 0; if (fs::exists(abs_path)) { file_hash_val = static_cast(fs::file_size(abs_path)); } - int fid = idx_db.get_or_create_file_info(get_logical_path(abs_path), - file_hash_val); + int fid = writer->get_or_create_file_info( + get_logical_path(abs_path), file_hash_val); std::size_t file_size = metadata.uncompressed_size; std::size_t num_ckpts = metadata.num_checkpoints; @@ -299,7 +333,6 @@ static coro::CoroTask run_verify( } } - idx_db.begin_transaction(); std::unordered_map file_blooms; HashResolutions all_hr; std::size_t total_events = 0; @@ -320,7 +353,7 
@@ static coro::CoroTask run_verify( for (auto& [dim, bloom] : output.bloom_filters) { auto blob = bloom.serialize(); - idx_db.insert_chunk_bloom_filter( + writer->insert_chunk_bloom_filter( fid, output.checkpoint_idx, dim, blob.data(), static_cast(blob.size()), bloom.num_entries()); @@ -332,8 +365,8 @@ static coro::CoroTask run_verify( } } - idx_db.insert_chunk_statistics(fid, output.checkpoint_idx, - output.statistics); + writer->insert_chunk_statistics(fid, output.checkpoint_idx, + output.statistics); for (auto& [dim, resolutions] : output.hash_resolutions) { for (auto& [h, resolved] : resolutions) { @@ -344,20 +377,14 @@ static coro::CoroTask run_verify( for (auto& [dim, bloom] : file_blooms) { auto blob = bloom.serialize(); - idx_db.insert_file_bloom_filter(fid, dim, blob.data(), - static_cast(blob.size()), - bloom.num_entries()); - } - for (const auto& [dim, resolutions] : all_hr) { - for (const auto& [h, resolved] : resolutions) { - idx_db.insert_hash_resolution(fid, dim, h, resolved); - } + writer->insert_file_bloom_filter(fid, dim, blob.data(), + static_cast(blob.size()), + bloom.num_entries()); } for (const auto& dim : all_dimensions) { - idx_db.insert_index_dimension(fid, dim); + writer->insert_index_dimension(fid, dim); } - - idx_db.commit_transaction(); + writer->commit(); std::string basename = fs::path(abs_path).filename().string(); std::printf(" %s: indexed (%zu events, %zu chunks)\n", @@ -435,6 +462,117 @@ static coro::CoroTask run_verify( co_return 0; } +class GenFakeTraceArgParse : public cli::ArgParse { + public: + cli::PipelineArgs pipeline; + + std::string output_dir; + int num_ranks = 8; + int num_hosts = 4; + int num_epochs = 500; + int steps_per_epoch = 1000; + int checkpoint_every = 5; + int validation_every = 2; + int num_train_files = 8; + int num_val_files = 2; + int step_dur_ms = 100; + std::uint64_t base_seed = 42; + bool verify = false; + std::size_t checkpoint_size = 2 * 1024 * 1024; + + explicit 
GenFakeTraceArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + schema(pipeline); + } + + protected: + void register_args() override { + parser() + .add_argument("-o", "--output-dir") + .help("Output directory for trace files") + .required(); + parser() + .add_argument("-p", "--num-processes") + .help("Number of ranks") + .scan<'d', int>() + .default_value(8); + parser() + .add_argument("-H", "--num-hosts") + .help("Number of hosts") + .scan<'d', int>() + .default_value(4); + parser() + .add_argument("-e", "--num-epochs") + .help("Training epochs") + .scan<'d', int>() + .default_value(500); + parser() + .add_argument("-s", "--steps-per-epoch") + .help("Steps per epoch") + .scan<'d', int>() + .default_value(1000); + parser() + .add_argument("--checkpoint-every") + .help("Checkpoint every N epochs") + .scan<'d', int>() + .default_value(5); + parser() + .add_argument("--validation-every") + .help("Validate every N epochs") + .scan<'d', int>() + .default_value(2); + parser() + .add_argument("--num-train-files") + .help("Training data shards") + .scan<'d', int>() + .default_value(8); + parser() + .add_argument("--num-val-files") + .help("Validation data shards") + .scan<'d', int>() + .default_value(2); + parser() + .add_argument("--step-duration-ms") + .help("Base step duration in milliseconds") + .scan<'d', int>() + .default_value(100); + parser() + .add_argument("--seed") + .help("Random seed for duration jitter") + .scan<'d', std::uint64_t>() + .default_value(static_cast(42)); + parser() + .add_argument("--verify") + .help( + "After generation, build bloom indices and run queries to " + "verify chunk-skipping works") + .flag(); + parser() + .add_argument("--checkpoint-size") + .help( + "Gzip checkpoint size in bytes for indexing (default: 2 MB). 
" + "Smaller values produce more chunks and better demonstrate " + "chunk-level bloom filter skipping.") + .scan<'d', std::size_t>() + .default_value(static_cast(2 * 1024 * 1024)); + } + + void post_parse() override { + output_dir = parser().get("--output-dir"); + num_ranks = parser().get("--num-processes"); + num_hosts = parser().get("--num-hosts"); + num_epochs = parser().get("--num-epochs"); + steps_per_epoch = parser().get("--steps-per-epoch"); + checkpoint_every = parser().get("--checkpoint-every"); + validation_every = parser().get("--validation-every"); + num_train_files = parser().get("--num-train-files"); + num_val_files = parser().get("--num-val-files"); + step_dur_ms = parser().get("--step-duration-ms"); + base_seed = parser().get("--seed"); + verify = parser().get("--verify"); + checkpoint_size = parser().get("--checkpoint-size"); + } +}; + // --------------------------------------------------------------------------- // main // --------------------------------------------------------------------------- @@ -448,96 +586,23 @@ int main(int argc, char** argv) { "Produces per-rank .pfw.gz files with known patterns " "suitable for testing bloom-filter indexing."); - program.add_argument("-o", "--output-dir") - .help("Output directory for trace files") - .required(); - - program.add_argument("-p", "--num-processes") - .help("Number of ranks") - .scan<'d', int>() - .default_value(8); - - program.add_argument("-H", "--num-hosts") - .help("Number of hosts") - .scan<'d', int>() - .default_value(4); - - program.add_argument("-e", "--num-epochs") - .help("Training epochs") - .scan<'d', int>() - .default_value(500); - - program.add_argument("-s", "--steps-per-epoch") - .help("Steps per epoch") - .scan<'d', int>() - .default_value(1000); - - program.add_argument("--checkpoint-every") - .help("Checkpoint every N epochs") - .scan<'d', int>() - .default_value(5); - - program.add_argument("--validation-every") - .help("Validate every N epochs") - .scan<'d', int>() - 
.default_value(2); - - program.add_argument("--num-train-files") - .help("Training data shards") - .scan<'d', int>() - .default_value(8); - - program.add_argument("--num-val-files") - .help("Validation data shards") - .scan<'d', int>() - .default_value(2); - - program.add_argument("--step-duration-ms") - .help("Base step duration in milliseconds") - .scan<'d', int>() - .default_value(100); - - program.add_argument("--seed") - .help("Random seed for duration jitter") - .scan<'d', std::uint64_t>() - .default_value(static_cast(42)); - - program.add_argument("--verify") - .help( - "After generation, build bloom indices and run queries to " - "verify chunk-skipping works") - .flag(); - - program.add_argument("--checkpoint-size") - .help( - "Gzip checkpoint size in bytes for indexing (default: 2 MB). " - "Smaller values produce more chunks and better demonstrate " - "chunk-level bloom filter skipping.") - .scan<'d', std::size_t>() - .default_value(static_cast(2 * 1024 * 1024)); - - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - std::fprintf(stderr, "Error: %s\n", err.what()); - std::fprintf(stderr, "%s\n", program.help().str().c_str()); - return 1; - } - - const std::string output_dir = program.get("--output-dir"); - const int num_ranks = program.get("--num-processes"); - const int num_hosts = program.get("--num-hosts"); - const int num_epochs = program.get("--num-epochs"); - const int steps_per_epoch = program.get("--steps-per-epoch"); - const int checkpoint_every = program.get("--checkpoint-every"); - const int validation_every = program.get("--validation-every"); - const int num_train_files = program.get("--num-train-files"); - const int num_val_files = program.get("--num-val-files"); - const int step_dur_ms = program.get("--step-duration-ms"); - const std::uint64_t base_seed = program.get("--seed"); - const bool verify = program.get("--verify"); - const std::size_t checkpoint_size = - program.get("--checkpoint-size"); + 
GenFakeTraceArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; + + const auto& output_dir = cli.output_dir; + const int num_ranks = cli.num_ranks; + const int num_hosts = cli.num_hosts; + const int num_epochs = cli.num_epochs; + const int steps_per_epoch = cli.steps_per_epoch; + const int checkpoint_every = cli.checkpoint_every; + const int validation_every = cli.validation_every; + const int num_train_files = cli.num_train_files; + const int num_val_files = cli.num_val_files; + const int step_dur_ms = cli.step_dur_ms; + const std::uint64_t base_seed = cli.base_seed; + const bool verify = cli.verify; + const std::size_t checkpoint_size = cli.checkpoint_size; // Convert base step duration to microseconds const std::uint64_t step_dur_us = @@ -607,8 +672,8 @@ int main(int argc, char** argv) { // ----------------------------------------------------------------------- // Generate one file per rank (parallel via pipeline) // ----------------------------------------------------------------------- - auto pipeline_config = PipelineConfig::default_config().with_name( - "DFTracer Fake Trace Generator"); + auto pipeline_config = cli::build_pipeline_config( + "DFTracer Fake Trace Generator", cli.pipeline); Pipeline pipeline(pipeline_config); auto* generated_files_ptr = &generated_files; @@ -632,10 +697,11 @@ int main(int argc, char** argv) { host_hashes_ptr, host_names_ptr, train_file_names_ptr, train_file_hashes_ptr, val_file_names_ptr, val_file_hashes_ptr, ckpt_file_name_ptr, ckref_ptr, script_name_ptr, sref_ptr, - rank_event_counts_ptr]([[maybe_unused]] CoroScope& ctx) - -> coro::CoroTask { + rank_event_counts_ptr]( + [[maybe_unused]] CoroScope& ctx) -> coro::CoroTask { const std::string& path = (*generated_files_ptr)[rank]; TraceWriter writer(path); + writer.write("[\n"); const std::string& sref = *sref_ptr; const std::string& ckref = *ckref_ptr; @@ -1058,60 +1124,70 @@ int main(int argc, char** argv) { } } + writer.write("]\n"); writer.close(); 
(*rank_event_counts_ptr)[rank] = rank_events; - co_return rank_events; + co_return; }, "Rank-" + std::to_string(rank)); rank_tasks.push_back(task); } - pipeline.set_source(rank_tasks); - pipeline.execute(); + // Summary task (prints results after generation) + auto* rank_event_counts_for_summary = &rank_event_counts; + auto* generated_files_for_summary = &generated_files; + auto summary_task = make_task( + [num_ranks, checkpoint_every, validation_every, + rank_event_counts_for_summary, generated_files_for_summary, + val_file_hashes, train_file_hashes, host_hashes, + host_names]([[maybe_unused]] CoroScope& ctx) -> coro::CoroTask { + std::size_t total_events = 0; + for (int rank = 0; rank < num_ranks; ++rank) { + std::printf(" rank %d: %zu events -> %s\n", rank, + (*rank_event_counts_for_summary)[rank], + (*generated_files_for_summary)[rank].c_str()); + total_events += (*rank_event_counts_for_summary)[rank]; + } - std::size_t total_events = 0; - for (int rank = 0; rank < num_ranks; ++rank) { - std::printf(" rank %d: %zu events -> %s\n", rank, - rank_event_counts[rank], generated_files[rank].c_str()); - total_events += rank_event_counts[rank]; + std::printf("\n==========================================\n"); + std::printf("Generation complete\n"); + std::printf("==========================================\n"); + std::printf(" Total events: %zu\n", total_events); + std::printf(" Total files: %d\n", num_ranks); + std::printf("\nInteresting queries for bloom filter testing:\n"); + std::printf( + " 1. name=pwrite (checkpoint I/O, ~%d%% of " + "epochs)\n", + checkpoint_every > 0 ? 100 / checkpoint_every : 0); + std::printf(" 2. fhash=%s (validation data, ~%d%% of epochs)\n", + val_file_hashes[0].c_str(), + validation_every > 0 ? 100 / validation_every : 0); + if (!train_file_hashes.empty()) { + std::printf(" 3. fhash=%s (rank-specific train shard)\n", + train_file_hashes[0].c_str()); + } + std::printf(" 4. 
hhash=%s (host-specific, %s)\n", + host_hashes[0].c_str(), host_names[0].c_str()); + std::printf( + " 5. name=allreduce (every step, dense)\n"); + std::printf( + " 6. name=fsync (checkpoint only, " + "sparse)\n"); + std::printf("==========================================\n"); + co_return; + }, + "Summary"); + + for (const auto& rt : rank_tasks) { + summary_task->depends_on(rt); } - // ----------------------------------------------------------------------- - // Summary banner - // ----------------------------------------------------------------------- - std::printf("\n==========================================\n"); - std::printf("Generation complete\n"); - std::printf("==========================================\n"); - std::printf(" Total events: %zu\n", total_events); - std::printf(" Total files: %d\n", num_ranks); - std::printf("\nInteresting queries for bloom filter testing:\n"); - std::printf( - " 1. name=pwrite (checkpoint I/O, ~%d%% of " - "epochs)\n", - checkpoint_every > 0 ? 100 / checkpoint_every : 0); - std::printf(" 2. fhash=%s (validation data, ~%d%% of epochs)\n", - val_file_hashes[0].c_str(), - validation_every > 0 ? 100 / validation_every : 0); - if (!train_file_hashes.empty()) { - std::printf(" 3. fhash=%s (rank-specific train shard)\n", - train_file_hashes[0].c_str()); - } - std::printf(" 4. hhash=%s (host-specific, %s)\n", host_hashes[0].c_str(), - host_names[0].c_str()); - std::printf(" 5. name=allreduce (every step, dense)\n"); - std::printf( - " 6. 
name=fsync (checkpoint only, sparse)\n"); - std::printf("==========================================\n"); + std::shared_ptr final_task = summary_task; + std::shared_ptr verify_task; - // ----------------------------------------------------------------------- - // Verify mode: build bloom indices and run queries - // ----------------------------------------------------------------------- if (verify) { std::vector test_queries; - // --- Single-dimension queries --- - - // name dimension test_queries.push_back( {"name=pwrite (sparse, ckpt only)", {{"name", {"pwrite"}}}}); test_queries.push_back( @@ -1120,24 +1196,19 @@ int main(int argc, char** argv) { {"name=fsync (sparse, ckpt only)", {{"name", {"fsync"}}}}); test_queries.push_back( {"name=val_forward (periodic)", {{"name", {"val_forward"}}}}); - - // cat dimension test_queries.push_back( {"cat=POSIX (all I/O events)", {{"cat", {"POSIX"}}}}); test_queries.push_back( {"cat=APP (all compute events)", {{"cat", {"APP"}}}}); - // pid dimension (rank-specific) std::string pid0 = std::to_string(1000); test_queries.push_back( {"pid=" + pid0 + " (rank 0 only)", {{"pid", {pid0}}}}); - // tid dimension (io thread vs main thread) std::string tid_io_0 = std::to_string(10001); test_queries.push_back( {"tid=" + tid_io_0 + " (rank 0 io thread)", {{"tid", {tid_io_0}}}}); - // fhash dimension (resolved file names) test_queries.push_back({"fhash=" + val_file_names[0] + " (resolved)", {{"fhash", {val_file_hashes[0]}}}}); if (!train_file_hashes.empty()) { @@ -1147,36 +1218,22 @@ int main(int argc, char** argv) { } test_queries.push_back( {"fhash=ckpt (resolved)", {{"fhash", {ckpt_file_hash}}}}); - - // hhash dimension (host-specific) test_queries.push_back({"hhash=" + host_names[0] + " (resolved)", {{"hhash", {host_hashes[0]}}}}); - - // sref dimension (script hash) test_queries.push_back( {"shash=train_unet3d (resolved)", {{"shash", {script_hash}}}}); - // --- Multi-dimension AND queries --- - - // name AND cat (checkpoint writes that 
are POSIX I/O) test_queries.push_back({"name=pwrite AND cat=POSIX", {{"name", {"pwrite"}}, {"cat", {"POSIX"}}}}); - - // name AND fhash (fsync on checkpoint file only) test_queries.push_back( {"name=fsync AND fhash=ckpt", {{"name", {"fsync"}}, {"fhash", {ckpt_file_hash}}}}); - - // cat AND hhash (POSIX I/O on node-0) test_queries.push_back( {"cat=POSIX AND hhash=" + host_names[0], {{"cat", {"POSIX"}}, {"hhash", {host_hashes[0]}}}}); - - // cat AND pid (APP events for rank 0) test_queries.push_back( {"cat=APP AND pid=" + pid0, {{"cat", {"APP"}}, {"pid", {pid0}}}}); - // name AND hhash AND fhash (read on node-0 for train shard 0) if (!train_file_hashes.empty()) { test_queries.push_back( {"name=read AND hhash=" + host_names[0] + " AND fhash=shard_0", @@ -1185,23 +1242,15 @@ int main(int argc, char** argv) { {"fhash", {train_file_hashes[0]}}}}); } - // --- OR-within dimension queries --- - - // name = pwrite OR write (all checkpoint write ops) test_queries.push_back({"name=pwrite|write (ckpt writes)", {{"name", {"pwrite", "write"}}}}); - - // name = open OR close (all open/close ops) test_queries.push_back({"name=open|close (all open/close)", {{"name", {"open", "close"}}}}); - - // fhash = any val file (all validation I/O) test_queries.push_back( {"fhash=any val file (OR)", {{"fhash", std::vector(val_file_hashes.begin(), val_file_hashes.end())}}}); - // --- Negative tests --- test_queries.push_back( {"name=NONEXISTENT (expect 0)", {{"name", {"NONEXISTENT"}}}}); test_queries.push_back( @@ -1209,7 +1258,25 @@ int main(int argc, char** argv) { test_queries.push_back({"name=pwrite AND cat=APP (impossible)", {{"name", {"pwrite"}}, {"cat", {"APP"}}}}); - return run_verify(generated_files, test_queries, checkpoint_size).get(); + auto* gf_ptr = &generated_files; + verify_task = make_task( + [gf_ptr, test_queries = std::move(test_queries), + checkpoint_size](CoroScope& ctx) -> coro::CoroTask { + co_return co_await run_verify(ctx, *gf_ptr, test_queries, + checkpoint_size); + }, 
+ "Verify"); + + verify_task->depends_on(summary_task); + final_task = verify_task; + } + + pipeline.set_source(rank_tasks); + pipeline.set_destination(final_task); + pipeline.execute(); + + if (verify && verify_task) { + return verify_task->get(); } return 0; diff --git a/src/dftracer/utils/binaries/dftracer_index.cpp b/src/dftracer/utils/binaries/dftracer_index.cpp index 249f262a..bd02f51e 100644 --- a/src/dftracer/utils/binaries/dftracer_index.cpp +++ b/src/dftracer/utils/binaries/dftracer_index.cpp @@ -1,40 +1,122 @@ +#include #include #include -#include -#include #include #include -#include #include #include #include #include #include -#include +#include +#include -#include +#include #include #include +#include #include -#include + +#include "common_cli.h" using namespace dftracer::utils; using namespace dftracer::utils::utilities; using namespace dftracer::utils::utilities::composites::dft::indexing; using namespace dftracer::utils::utilities::indexer; -static coro::CoroTask run_index(argparse::ArgumentParser& program) { - std::string log_dir = program.get("--directory"); - std::string dimensions_str = program.get("--dimensions"); - bool force_rebuild = program.get("--force"); - std::size_t checkpoint_size = program.get("--checkpoint-size"); - std::size_t executor_threads = - program.get("--executor-threads"); - std::string index_dir = program.get("--index-dir"); - std::size_t expected_entries = - program.get("--expected-entries"); - double false_positive_rate = program.get("--false-positive-rate"); - bool build_manifest = program.get("--manifest"); +class IndexArgParse : public cli::ArgParse { + public: + cli::DirectoryArgs directory; + cli::PipelineArgs pipeline; + cli::IndexingArgs indexing; + + std::string dimensions; + bool manifest = false; + bool rebuild_summaries = false; + std::size_t read_batch_size = 4; + std::size_t expected_entries = 1024; + double false_positive_rate = 0.01; + + explicit IndexArgParse(argparse::ArgumentParser& p) : ArgParse(p) { 
+ indexing.index_dir_help = + "Directory where .dftindex stores are created"; + indexing.force_help = "Force index recreation even if already built"; + schema(directory, pipeline, indexing); + } + + protected: + void register_args() override { + parser() + .add_argument("--dimensions") + .help( + "Comma-separated extra dimensions to index from args " + "(e.g., args.level,args.mode,args.io.size)") + .default_value(""); + + parser() + .add_argument("--manifest") + .help( + "Also build manifest data in the .dftindex store " + "(per-checkpoint event line routing)") + .flag(); + + parser() + .add_argument("--rebuild-summaries") + .help( + "Rebuild ROOT_* aggregated summaries after ingest. Off by " + "default; ROOT_* CFs are only used by summary tools like " + "dftracer_info. Bloom-filter chunk-skipping queries do not " + "need them.") + .flag(); + + parser() + .add_argument("--read-batch-size") + .help("Batch read size in MB for stream processing (default: 4)") + .scan<'d', std::size_t>() + .default_value(static_cast(4)); + + parser() + .add_argument("--expected-entries") + .help( + "Expected entries per chunk for bloom filter sizing (default: " + "1024)") + .scan<'d', std::size_t>() + .default_value(static_cast(1024)); + + parser() + .add_argument("--false-positive-rate") + .help("Bloom filter false positive rate (default: 0.01)") + .scan<'g', double>() + .default_value(0.01); + } + + void post_parse() override { + dimensions = parser().get("--dimensions"); + manifest = parser().get("--manifest"); + rebuild_summaries = parser().get("--rebuild-summaries"); + read_batch_size = parser().get("--read-batch-size"); + expected_entries = parser().get("--expected-entries"); + false_positive_rate = parser().get("--false-positive-rate"); + } +}; + +static coro::CoroTask run_index(const IndexArgParse* cli) { + const auto log_dir = fs::absolute(cli->directory.value).string(); + const auto& dimensions_str = cli->dimensions; + const auto force_rebuild = cli->indexing.force; + const 
auto checkpoint_size = cli->indexing.checkpoint_size; + const auto executor_threads = cli->pipeline.executor_threads; + // When --index-dir is not provided, place coord/staging/ingest DBs next to + // the input data so they line up with each file's per-file index_path + // (which determine_index_path(file, "") resolves to + // /.dftindex). The top-level scanner is non-recursive, so all + // input files share log_dir. + const auto index_dir = + cli->indexing.index_dir.empty() ? log_dir : cli->indexing.index_dir; + const auto expected_entries = cli->expected_entries; + const auto false_positive_rate = cli->false_positive_rate; + const auto build_manifest = cli->manifest; + const auto rebuild_summaries = cli->rebuild_summaries; auto split_string = [](const std::string& str) { std::vector result; @@ -49,22 +131,30 @@ static coro::CoroTask run_index(argparse::ArgumentParser& program) { return result; }; - std::vector extra_dimensions = split_string(dimensions_str); + std::vector user_dimensions = split_string(dimensions_str); + + std::vector extra_dimensions( + dftracer::utils::utilities::indexer::DEFAULT_EXTRA_DIMENSIONS.begin(), + dftracer::utils::utilities::indexer::DEFAULT_EXTRA_DIMENSIONS.end()); + for (const auto& dim : user_dimensions) { + if (std::find(extra_dimensions.begin(), extra_dimensions.end(), dim) == + extra_dimensions.end()) { + extra_dimensions.push_back(dim); + } + } ChunkIndexerConfig indexer_config; indexer_config.extra_dimensions = extra_dimensions; indexer_config.expected_entries_per_chunk = expected_entries; indexer_config.false_positive_rate = false_positive_rate; - // Default bloom dimensions + any user-supplied extras. 
- std::vector all_dimensions = - dftracer::utils::utilities::indexer::default_bloom_dimensions(); + std::vector all_dimensions( + dftracer::utils::utilities::indexer::DEFAULT_BLOOM_DIMENSIONS.begin(), + dftracer::utils::utilities::indexer::DEFAULT_BLOOM_DIMENSIONS.end()); for (const auto& dim : extra_dimensions) { all_dimensions.push_back(dim); } - log_dir = fs::absolute(log_dir).string(); - std::printf("==========================================\n"); std::printf("DFTracer Bloom Indexer\n"); std::printf("==========================================\n"); @@ -106,98 +196,60 @@ static coro::CoroTask run_index(argparse::ArgumentParser& program) { DFTRACER_UTILS_LOG_INFO("Found %zu input files", input_files.size()); - auto pipeline_config = PipelineConfig() - .with_name("DFTracer Bloom Indexer") - .with_compute_threads(executor_threads) - .with_watchdog(false); + auto pipeline_config = + cli::build_pipeline_config("DFTracer Bloom Indexer", cli->pipeline); Pipeline pipeline(pipeline_config); auto start_time = std::chrono::high_resolution_clock::now(); - std::atomic total_events{0}; - std::atomic total_checkpoints_processed{0}; - std::atomic total_files_processed{0}; - std::atomic total_files_skipped{0}; + std::vector preassigned_file_ids; + { + IndexDatabase coord_db(index_dir); + coord_db.init_schema(); + preassigned_file_ids = + coord_db.register_files(input_files, build_manifest); + } + + const std::string staging_root = + (fs::path(index_dir) / ".dftindex_staging").string(); + fs::create_directories(staging_root); + + auto artifacts_queue = std::make_shared>(); + auto batch_counter = std::make_shared>(0); + + auto batch_config = std::make_shared(); + batch_config->file_paths = std::move(input_files); + batch_config->preassigned_file_ids = std::move(preassigned_file_ids); + batch_config->index_dir = index_dir; + batch_config->checkpoint_size = checkpoint_size; + batch_config->parallelism = executor_threads; + batch_config->force_rebuild = force_rebuild; + 
batch_config->build_manifest = build_manifest; + batch_config->bloom_config = indexer_config; + batch_config->bloom_dimensions = all_dimensions; + batch_config->rebuild_root_summaries = false; + + batch_config->sink_factory = + [staging_root, batch_counter]() -> std::unique_ptr { + const std::size_t idx = + batch_counter->fetch_add(1, std::memory_order_relaxed); + return std::make_unique( + staging_root, "batch_" + std::to_string(idx)); + }; + batch_config->sink_commit = [artifacts_queue](IndexBatchSink& sink) { + auto& sst = static_cast(sink); + auto a = sst.commit(); + if (!a.empty()) artifacts_queue->enqueue(std::move(a)); + }; + IndexBuildBatchResult batch_result; auto streaming_task = make_task( - [&](CoroScope& ctx) -> coro::CoroTask { - co_await ctx.scope([&](CoroScope& scope) -> coro::CoroTask { - auto* total_events_ptr = &total_events; - auto* total_checkpoints_ptr = &total_checkpoints_processed; - auto* total_processed_ptr = &total_files_processed; - auto* total_skipped_ptr = &total_files_skipped; - auto* all_dims_ptr = &all_dimensions; - auto* files_ptr = &input_files; - auto* index_dir_ptr = &index_dir; - // Bounded fan-out: channel limits concurrent file processing - // to avoid memory pressure from unbounded coroutine spawning. 
- auto file_chan = - coro::make_channel(executor_threads * 2); - - // Producer: push file indices into channel - scope.spawn([ch = file_chan->producer(), - num_files = input_files.size()]( - CoroScope&) mutable -> coro::CoroTask { - auto guard = ch.guard(); - for (std::size_t i = 0; i < num_files; ++i) { - if (!co_await ch.send(i)) { - co_return; - } - } - co_return; - }); - - // Workers: consume from channel, process one file at a time - for (std::size_t w = 0; w < executor_threads; ++w) { - scope.spawn([file_chan, files_ptr, indexer_config, - build_manifest, index_dir_ptr, checkpoint_size, - force_rebuild, all_dims_ptr, total_events_ptr, - total_checkpoints_ptr, total_processed_ptr, - total_skipped_ptr]( - CoroScope&) -> coro::CoroTask { - while (auto fi_opt = co_await file_chan->receive()) { - std::size_t fi = *fi_opt; - const auto& file_path = (*files_ptr)[fi]; - - IndexBuilderUtility builder; - auto config = - IndexBuildConfig::for_file(file_path) - .with_index_dir(*index_dir_ptr) - .with_checkpoint_size(checkpoint_size) - .with_force_rebuild(force_rebuild) - .with_bloom(true) - .with_manifest(build_manifest) - .with_index_threshold(0) - .with_bloom_config(indexer_config) - .with_bloom_dimensions(*all_dims_ptr); - - auto result = co_await builder.process(config); - - if (result.was_skipped) { - (*total_skipped_ptr)++; - } else if (result.success) { - (*total_processed_ptr)++; - (*total_events_ptr) += result.events_processed; - (*total_checkpoints_ptr) += - result.chunks_processed; - } else { - (*total_skipped_ptr)++; - if (!result.error_message.empty()) { - DFTRACER_UTILS_LOG_ERROR( - "Index failed for %s: %s", - file_path.c_str(), - result.error_message.c_str()); - } - } - } - co_return; - }); - } - co_return; - }); - - co_return; + [&batch_result, + batch_config](CoroScope& scope) -> coro::CoroTask { + batch_result = co_await IndexBatchBuilderUtility::process( + &scope, std::move(batch_config)); }, "StreamingIndex"); @@ -205,6 +257,38 @@ static 
coro::CoroTask run_index(argparse::ArgumentParser& program) { pipeline.set_destination(streaming_task); pipeline.execute(); + SstArtifactRegistry registry; + { + IndexDatabaseSstWriterContext::Artifacts a; + while (artifacts_queue->try_dequeue(a)) { + registry.append(std::move(a)); + } + } + DFTRACER_UTILS_LOG_INFO( + "dftracer_index: %zu SSTs in registry (chunk_bloom=%zu file_bloom=%zu)", + registry.chunk_bloom().size() + registry.file_bloom().size() + + registry.chunk_stats().size(), + registry.chunk_bloom().size(), registry.file_bloom().size()); + { + IndexDatabase ingest_db(index_dir); + auto t0 = std::chrono::high_resolution_clock::now(); + ingest_db.bulk_ingest(registry, {}); + auto t1 = std::chrono::high_resolution_clock::now(); + if (rebuild_summaries) { + ingest_db.rebuild_root_summaries(); + } + auto t2 = std::chrono::high_resolution_clock::now(); + DFTRACER_UTILS_LOG_INFO( + "dftracer_index: bulk_ingest=%.2fms " + "rebuild_root_summaries=%.2fms%s", + std::chrono::duration(t1 - t0).count(), + std::chrono::duration(t2 - t1).count(), + rebuild_summaries ? 
"" : " (skipped)"); + } + + std::error_code ec; + fs::remove_all(staging_root, ec); + auto end_time = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end_time - start_time; @@ -213,11 +297,11 @@ static coro::CoroTask run_index(argparse::ArgumentParser& program) { std::printf("Bloom Index Results\n"); std::printf("==========================================\n"); std::printf(" Execution time: %.2f seconds\n", duration.count() / 1000.0); - std::printf(" Files processed: %zu\n", total_files_processed.load()); - std::printf(" Files skipped: %zu\n", total_files_skipped.load()); - std::printf(" Checkpoints indexed: %zu\n", - total_checkpoints_processed.load()); - std::printf(" Events processed: %zu\n", total_events.load()); + std::printf(" Files processed: %zu\n", batch_result.indexed); + std::printf(" Files skipped: %zu\n", batch_result.skipped); + std::printf(" Files failed: %zu\n", batch_result.failed); + std::printf(" Events processed: %zu\n", + static_cast(batch_result.total_events)); std::printf(" Dimensions indexed: %zu\n", all_dimensions.size()); std::printf(" Dimensions: "); for (std::size_t i = 0; i < all_dimensions.size(); ++i) { @@ -235,13 +319,6 @@ static coro::CoroTask run_index(argparse::ArgumentParser& program) { int main(int argc, char** argv) { DFTRACER_UTILS_LOGGER_INIT(); - auto default_checkpoint_size_str = - std::to_string(indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE) + - " B (" + - std::to_string(indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE / - (1024 * 1024)) + - " MB)"; - argparse::ArgumentParser program("dftracer_index", DFTRACER_UTILS_PACKAGE_VERSION); program.add_description( @@ -249,67 +326,9 @@ int main(int argc, char** argv) { "Creates root-local .dftindex databases enabling fast chunk-skipping " "queries."); - program.add_argument("-d", "--directory") - .help("Input directory containing .pfw or .pfw.gz files") - .default_value("."); - - program.add_argument("--dimensions") - .help( - "Comma-separated 
extra dimensions to index from args " - "(e.g., args.level,args.mode,args.io.size)") - .default_value(""); - - program.add_argument("-f", "--force") - .help("Force index recreation even if already built") - .flag(); - - program.add_argument("--checkpoint-size") - .help("Checkpoint size for gzip indexing in bytes (default: " + - default_checkpoint_size_str + ")") - .scan<'d', std::size_t>() - .default_value(static_cast( - indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE)); - - program.add_argument("--executor-threads") - .help("Number of worker threads for parallel processing") - .scan<'d', std::size_t>() - .default_value( - static_cast(dftracer_utils_hardware_concurrency())); - - program.add_argument("--index-dir") - .help("Directory where .dftindex stores are created") - .default_value(""); - - program.add_argument("--expected-entries") - .help( - "Expected entries per chunk for bloom filter sizing (default: " - "1024)") - .scan<'d', std::size_t>() - .default_value(static_cast(1024)); - - program.add_argument("--false-positive-rate") - .help("Bloom filter false positive rate (default: 0.01)") - .scan<'g', double>() - .default_value(0.01); - - program.add_argument("--read-batch-size") - .help("Batch read size in MB for stream processing (default: 4)") - .scan<'d', std::size_t>() - .default_value(static_cast(4)); - - program.add_argument("--manifest") - .help( - "Also build manifest data in the .dftindex store " - "(per-checkpoint event line routing)") - .flag(); - - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - DFTRACER_UTILS_LOG_ERROR("Error occurred: %s", err.what()); - std::fprintf(stderr, "%s\n", program.help().str().c_str()); - return 1; - } + IndexArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; - return run_index(program).get(); + return run_index(&cli).get(); } diff --git a/src/dftracer/utils/binaries/dftracer_info.cpp b/src/dftracer/utils/binaries/dftracer_info.cpp index 7c4a7191..221098ed 
100644 --- a/src/dftracer/utils/binaries/dftracer_info.cpp +++ b/src/dftracer/utils/binaries/dftracer_info.cpp @@ -1,156 +1,226 @@ #include #include -#include #include -#include #include #include -#include +#include #include #include #include +#include +#include #include #include -#include +#include #include #include +#include +#include #include #include -#include -#include -#include #include -#include +#include #include -#include + +#include "common_cli.h" using namespace dftracer::utils; using namespace dftracer::utils::utilities::composites::dft; -using dftracer::utils::utilities::indexer::IndexBuildConfig; -using dftracer::utils::utilities::indexer::IndexBuilderUtility; +using namespace dftracer::utils::utilities::composites::dft::indexing; +using dftracer::utils::utilities::indexer::FileRegistryEntry; +using dftracer::utils::utilities::indexer::has_capability; +using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility; +using dftracer::utils::utilities::indexer::IndexBuildBatchConfig; using dftracer::utils::utilities::indexer::IndexDatabase; -using dftracer::utils::utilities::indexer::internal::Indexer; +using dftracer::utils::utilities::indexer::IndexFileEntryCapability; using dftracer::utils::utilities::indexer::internal::IndexerFactory; +class InfoArgParse : public cli::ArgParse { + public: + cli::DirectoryArgs directory{cli::DirMode::DEFAULT_EMPTY}; + cli::FilesArgs files_args{"Compressed files to inspect (GZIP, TAR.GZ)"}; + cli::PipelineArgs pipeline; + cli::IndexingArgs indexing; + + std::string query_type = "summary"; + bool verbose = false; + bool force_rebuild = false; + + explicit InfoArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + indexing.with_force = false; + indexing.index_dir_help = + "Directory to store index files (default: system temp directory)"; + schema(directory, files_args, pipeline, indexing); + } + + protected: + void register_args() override { + parser() + .add_argument("--query") + .help( + "Query type: summary 
(aggregate all files, default) or " + "detailed (per-file output)") + .default_value("summary"); + + parser() + .add_argument("-v", "--verbose") + .help("Show detailed information including index details") + .flag(); + + parser() + .add_argument("-f", "--force-rebuild") + .help("Force rebuild index files") + .flag(); + } + + void post_parse() override { + query_type = parser().get("--query"); + verbose = parser().get("--verbose"); + force_rebuild = parser().get("--force-rebuild"); + } +}; + static std::string format_size(std::uint64_t bytes) { const char* units[] = {"B", "KB", "MB", "GB", "TB"}; int unit_index = 0; double size = static_cast(bytes); - while (size >= 1024.0 && unit_index < 4) { size /= 1024.0; unit_index++; } - std::ostringstream oss; oss << std::fixed << std::setprecision(2) << size << " " << units[unit_index]; return oss.str(); } -/// Fast path: read metadata from the `.dftindex` database. -/// Returns success=false if index doesn't exist, letting the caller -/// fall back to direct_scan_info for small/unindexed files. 
-static MetadataCollectorUtilityOutput index_based_info( - const std::string& file_path) { - using utilities::indexer::IndexDatabase; +using FileRegistry = std::unordered_map; - MetadataCollectorUtilityOutput meta; - meta.file_path = file_path; +struct RootInfoSummary { + std::size_t file_count = 0; + std::uint64_t total_events = 0; + std::uint64_t total_lines = 0; + std::uint64_t total_uncompressed = 0; +}; - try { - std::string index_path = file_path + constants::indexer::EXTENSION; - if (!fs::exists(index_path)) { - meta.success = false; - return meta; - } +static coro::CoroTask> load_root_info_summary( + std::string index_path) { + auto result = std::make_shared(); + std::optional + root; + { + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + root = db.query_root_scalar_stats(); + } + + if (!root) { + DFTRACER_UTILS_LOG_INFO( + "Root scalar stats missing for %s; rebuilding from file " + "registry", + index_path.c_str()); IndexDatabase db(index_path); - int fid = db.find_file(file_path); - if (fid < 0) { - meta.success = false; - return meta; - } + auto writer = db.begin_write(); + writer->rebuild_root_summaries(); + writer->commit(); + root = db.query_root_scalar_stats(); + } - meta.format = IndexerFactory::detect_format(file_path); - meta.compressed_size = fs::file_size(file_path); - meta.num_lines = db.get_num_lines(fid); - meta.uncompressed_size = db.get_max_bytes(fid); - meta.valid_events = db.get_total_events(fid); + if (root) { + result->file_count = root->num_files; + result->total_events = root->stats.total_events; + result->total_lines = root->total_lines; + result->total_uncompressed = root->total_uncompressed_bytes; + } + co_return result; +} + +static coro::CoroTask> load_file_registry( + std::string index_path) { + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + co_return std::make_shared(db.query_all_file_registry()); +} + +static std::vector 
+process_index_group_info_sync(std::string index_path, + std::vector entries) { + std::vector file_ids; + file_ids.reserve(entries.size()); + for (const auto& entry : entries) { + file_ids.push_back(entry.file_id); + } + + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + auto metadata_rows = db.query_file_metadata_batch(file_ids); + auto merged_stats = db.query_merged_statistics_batch(file_ids); + + std::vector results; + results.reserve(entries.size()); + + for (const auto& entry : entries) { + MetadataCollectorUtilityOutput meta; + meta.file_path = entry.file_path; meta.index_path = index_path; meta.has_index = true; meta.index_valid = true; + + auto metadata_it = metadata_rows.find(entry.file_id); + if (metadata_it == metadata_rows.end()) { + meta.success = false; + meta.error_message = "No file metadata found in shared index"; + results.push_back(std::move(meta)); + continue; + } + + meta.format = IndexerFactory::detect_format(entry.file_path); + meta.compressed_size = 0; + meta.checkpoint_size = metadata_it->second.checkpoint_size; + meta.num_lines = metadata_it->second.num_lines; + meta.uncompressed_size = metadata_it->second.max_bytes; meta.size_mb = - static_cast(meta.compressed_size) / (1024.0 * 1024.0); + static_cast(meta.uncompressed_size) / (1024.0 * 1024.0); meta.start_line = 1; meta.end_line = meta.num_lines; - meta.size_per_line = - (meta.valid_events > 0) - ? meta.size_mb / static_cast(meta.valid_events) - : 0; - meta.success = true; - } catch (...) { - meta.success = false; - } - return meta; -} + if (meta.checkpoint_size > 0 && meta.uncompressed_size > 0) { + meta.num_checkpoints = + (meta.uncompressed_size + meta.checkpoint_size - 1) / + meta.checkpoint_size; + } -/// One streaming decompress pass, count lines with JSON validation, -/// without creating a `.dftindex` store. 
-static coro::CoroTask direct_scan_info( - std::string file_path) { - using dftracer::utils::utilities::fileio::lines::sources:: - async_streaming_gz_lines; - - MetadataCollectorUtilityOutput meta; - meta.file_path = file_path; - meta.has_index = false; - meta.index_valid = false; - - try { - meta.format = dftracer::utils::utilities::indexer::internal:: - IndexerFactory::detect_format(file_path); - meta.compressed_size = fs::file_size(file_path); - - std::size_t total_lines = 0; - std::size_t valid_events = 0; - std::uint64_t total_bytes = 0; - - auto gen = async_streaming_gz_lines(file_path); - while (auto line_opt = co_await gen.next()) { - total_lines++; - const auto& line = *line_opt; - total_bytes += line.content.length(); - const char* trimmed; - std::size_t trimmed_length; - if (json_trim_and_validate(line.content.data(), - line.content.length(), trimmed, - trimmed_length) && - trimmed_length > 8) { - valid_events++; + auto stats_it = merged_stats.find(entry.file_id); + if (stats_it != merged_stats.end()) { + meta.valid_events = stats_it->second.stats.total_events; + if (stats_it->second.num_chunks > 0) { + meta.num_checkpoints = stats_it->second.num_chunks; } + } else { + meta.valid_events = meta.num_lines; } - meta.num_lines = total_lines; - meta.valid_events = valid_events; - meta.uncompressed_size = total_bytes; - meta.size_mb = - static_cast(meta.compressed_size) / (1024.0 * 1024.0); - meta.start_line = 1; - meta.end_line = total_lines; meta.size_per_line = - (valid_events > 0) - ? meta.size_mb / static_cast(valid_events) - : 0; + (meta.valid_events > 0) + ? 
meta.size_mb / static_cast(meta.valid_events) + : 0.0; meta.success = true; - } catch (const std::exception& e) { - meta.error_message = e.what(); - meta.success = false; + results.push_back(std::move(meta)); } - co_return meta; + return results; +} + +static coro::CoroTask> +process_index_group_info(std::shared_ptr index_path, + std::shared_ptr> entries) { + co_return process_index_group_info_sync(std::move(*index_path), + std::move(*entries)); } static void print_file_info(const MetadataCollectorUtilityOutput& info, @@ -165,16 +235,16 @@ static void print_file_info(const MetadataCollectorUtilityOutput& info, return; } - // Basic Information std::printf("Basic Information:\n"); std::printf(" Format: %s\n", get_format_name(info.format)); std::printf(" Status: %s\n", "OK"); - // File Size Information std::printf("\nFile Size:\n"); - std::printf(" Compressed: %12s (%llu bytes)\n", - format_size(info.compressed_size).c_str(), - (unsigned long long)info.compressed_size); + if (info.compressed_size > 0) { + std::printf(" Compressed: %12s (%llu bytes)\n", + format_size(info.compressed_size).c_str(), + (unsigned long long)info.compressed_size); + } std::printf(" Uncompressed: %12s (%llu bytes)\n", format_size(info.uncompressed_size).c_str(), (unsigned long long)info.uncompressed_size); @@ -184,78 +254,21 @@ static void print_file_info(const MetadataCollectorUtilityOutput& info, double ratio = 100.0 * (1.0 - static_cast(info.compressed_size) / static_cast(info.uncompressed_size)); - double compression_factor = - static_cast(info.uncompressed_size) / - static_cast(info.compressed_size); - std::printf( - " Savings: %12s (%.2f%% reduction)\n", - format_size(info.uncompressed_size - info.compressed_size).c_str(), - ratio); - std::printf(" Ratio: %.2fx compression\n", compression_factor); + std::printf(" Savings: %.2f%% reduction\n", ratio); } - // Content Information std::printf("\nContent:\n"); std::printf(" Total Lines: %llu\n", (unsigned long long)info.num_lines); 
std::printf(" Valid Events: %zu\n", info.valid_events); - if (info.num_lines > 0) { - std::printf(" Avg Bytes/Line: %.2f bytes\n", - static_cast(info.uncompressed_size) / - static_cast(info.num_lines)); - } - - if (info.valid_events > 0) { - std::printf(" Avg Bytes/Event: %.2f bytes\n", - static_cast(info.uncompressed_size) / - static_cast(info.valid_events)); - } - - // Index Information (always show if index-capable format) - if (info.format == ArchiveFormat::GZIP || - info.format == ArchiveFormat::TAR_GZ) { + if (info.has_index && info.index_valid) { std::printf("\nIndex Information:\n"); - std::printf(" Index Store: %s\n", info.index_path.empty() - ? "(auto-generated)" - : info.index_path.c_str()); - std::printf(" Index Status: %s\n", - info.has_index ? (info.index_valid ? "Valid" : "Invalid") - : "Not Created"); - - if (info.has_index && info.index_valid) { - std::printf(" Checkpoint Size: %s (%llu bytes)\n", - format_size(info.checkpoint_size).c_str(), - (unsigned long long)info.checkpoint_size); - std::printf(" Number of Checkpoints: %zu\n", info.num_checkpoints); - - if (info.num_checkpoints > 0) { - std::uint64_t avg_chunk = - info.uncompressed_size / info.num_checkpoints; - std::uint64_t lines_per_checkpoint = - info.num_lines / info.num_checkpoints; - std::printf(" Avg Chunk Size: %s (%llu bytes)\n", - format_size(avg_chunk).c_str(), - (unsigned long long)avg_chunk); - std::printf(" Avg Lines/Checkpoint: %llu\n", - (unsigned long long)lines_per_checkpoint); - - // Calculate index overhead - if (fs::exists(info.index_path)) { - std::uint64_t index_size = fs::file_size(info.index_path); - double index_overhead = - 100.0 * static_cast(index_size) / - static_cast(info.compressed_size); - std::printf(" Index File Size: %s (%llu bytes)\n", - format_size(index_size).c_str(), - (unsigned long long)index_size); - std::printf(" Index Overhead: %.2f%% of compressed size\n", - index_overhead); - } - } - } + std::printf(" Index Store: %s\n", info.index_path.c_str()); 
+ std::printf(" Checkpoint Size: %s\n", + format_size(info.checkpoint_size).c_str()); + std::printf(" Checkpoints: %zu\n", info.num_checkpoints); } - // Detailed Statistics (verbose mode) if (verbose) { std::printf("\nDetailed Statistics:\n"); std::printf(" Start Line: %zu\n", info.start_line); @@ -263,283 +276,279 @@ static void print_file_info(const MetadataCollectorUtilityOutput& info, std::printf(" Size (MB): %.6f\n", info.size_mb); std::printf(" MB per Event: %.8f\n", info.size_per_line); - // Performance estimates if (info.num_checkpoints > 0 && info.num_lines > 0) { - std::uint64_t lines_per_checkpoint = - info.num_lines / info.num_checkpoints; + auto lines_per_ckpt = info.num_lines / info.num_checkpoints; std::printf("\nRandom Access Performance:\n"); std::printf(" Worst-case lines to scan: %llu (1 checkpoint)\n", - (unsigned long long)lines_per_checkpoint); - std::printf( - " Best-case lines to scan: 1 (exact checkpoint hit)\n"); + (unsigned long long)lines_per_ckpt); std::printf(" Avg lines to scan: %llu (0.5 checkpoint)\n", - (unsigned long long)(lines_per_checkpoint / 2)); - } - - // Memory estimates - if (info.checkpoint_size > 0) { - std::printf("\nMemory Estimates:\n"); - std::printf(" Memory for 1 checkpoint: ~%s\n", - format_size(info.checkpoint_size).c_str()); - if (info.num_checkpoints > 0) { - std::uint64_t total_memory_for_all = - info.checkpoint_size * info.num_checkpoints; - std::printf(" Memory for all checkpoints: ~%s\n", - format_size(total_memory_for_all).c_str()); - } + (unsigned long long)(lines_per_ckpt / 2)); } } std::printf("\n"); } -int main(int argc, char** argv) { - DFTRACER_UTILS_LOGGER_INIT(); +static coro::CoroTask auto_index_and_resolve( + CoroScope& ctx, std::vector& files_needing_index, + const std::string& index_dir, std::size_t checkpoint_size, + std::size_t executor_threads, + std::unordered_map>& + indexed_groups) { + auto index_path = internal::determine_index_path( + files_needing_index.front().file_path, index_dir); + 
dftracer::utils::rocksdb::RocksDBManager::instance().reset(index_path); + + const bool all_gzip = + std::all_of(files_needing_index.begin(), files_needing_index.end(), + [](const FileWorkItem& item) { + return item.file_path.ends_with(".gz"); + }); - auto default_checkpoint_size_str = - std::to_string(Indexer::DEFAULT_CHECKPOINT_SIZE) + " B (" + - std::to_string(Indexer::DEFAULT_CHECKPOINT_SIZE / (1024 * 1024)) + - " MB)"; + { + auto batch_config = std::make_shared(); + batch_config->file_paths.reserve(files_needing_index.size()); + for (const auto& item : files_needing_index) { + batch_config->file_paths.push_back(item.file_path); + } + batch_config->index_dir = index_dir; + batch_config->checkpoint_size = checkpoint_size; + batch_config->parallelism = executor_threads; + batch_config->use_batch_write = all_gzip; + batch_config->rebuild_root_summaries = all_gzip; + + auto batch_result = co_await IndexBatchBuilderUtility::process( + &ctx, std::move(batch_config)); + + for (const auto& result : batch_result.results) { + if (!result.success && !result.error_message.empty()) { + DFTRACER_UTILS_LOG_ERROR("Auto-indexing failed for %s: %s", + result.file_path.c_str(), + result.error_message.c_str()); + } + } + } - argparse::ArgumentParser program("dftracer_info", - DFTRACER_UTILS_PACKAGE_VERSION); - program.add_description( - "Display metadata and index information for DFTracer compressed files " - "using composable utilities and pipeline processing"); + std::vector newly_indexed; + newly_indexed.reserve(files_needing_index.size()); + for (const auto& item : files_needing_index) { + newly_indexed.push_back(item.file_path); + } - program.add_argument("--files") - .help("Compressed files to inspect (GZIP, TAR.GZ)") - .nargs(argparse::nargs_pattern::any) - .default_value>({}); - - program.add_argument("-d", "--directory") - .help("Directory containing files to inspect") - .default_value(""); - - program.add_argument("--query") - .help( - "Query type: summary (aggregate all 
files, default) or " - "detailed (per-file output)") - .default_value("summary"); - - program.add_argument("-v", "--verbose") - .help("Show detailed information including index details") - .flag(); - - program.add_argument("-f", "--force-rebuild") - .help("Force rebuild index files") - .flag(); - - program.add_argument("-c", "--checkpoint-size") - .help("Checkpoint size for indexing in bytes (default: " + - default_checkpoint_size_str + ")") - .scan<'d', std::size_t>() - .default_value( - static_cast(Indexer::DEFAULT_CHECKPOINT_SIZE)); - - program.add_argument("--index-dir") - .help("Directory to store index files (default: system temp directory)") - .default_value(""); - - program.add_argument("--executor-threads") - .help( - "Number of executor threads for parallel processing (default: " - "number of CPU cores)") - .scan<'d', std::size_t>() - .default_value( - static_cast(dftracer_utils_hardware_concurrency())); - - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - DFTRACER_UTILS_LOG_ERROR("Error occurred: %s", err.what()); - std::cerr << program; - return 1; + IndexResolverUtility resolver; + ResolverInput refresh_input; + refresh_input.files = std::move(newly_indexed); + refresh_input.index_dir = index_dir; + refresh_input.require_checkpoints = true; + + auto refresh_result = co_await resolver.process(refresh_input); + + if (!refresh_result.cached.empty()) { + indexed_groups[refresh_result.index_path] = + std::move(refresh_result.cached); } +} - // Parse arguments - std::string directory = program.get("--directory"); - std::string query_type = program.get("--query"); - bool verbose = program.get("--verbose"); - bool force_rebuild = program.get("--force-rebuild"); - std::size_t checkpoint_size = program.get("--checkpoint-size"); - std::string index_dir = program.get("--index-dir"); - std::size_t executor_threads = - program.get("--executor-threads"); +static coro::CoroTask run_info(CoroScope& ctx, const InfoArgParse* cli) { + 
const auto& directory = cli->directory.value; + const auto& query_type = cli->query_type; + const auto verbose = cli->verbose; + const auto force_rebuild = cli->force_rebuild; + const auto checkpoint_size = cli->indexing.checkpoint_size; + const auto& index_dir = cli->indexing.index_dir; + const auto executor_threads = cli->pipeline.executor_threads; + const bool summary_mode = (query_type != "detailed"); - bool summary_mode = (query_type != "detailed"); + Timer stages_storage("dftracer_info"); + Timer* stages = cli->pipeline.time_profiling ? &stages_storage : nullptr; + Timer overall(true); - // Collect files to process std::vector files; - if (!directory.empty()) { - if (!fs::exists(directory)) { - DFTRACER_UTILS_LOG_ERROR("Directory does not exist: %s", - directory.c_str()); - return 1; - } + std::vector files_needing_index; + std::unordered_map> indexed_groups; - for (const auto& entry : fs::directory_iterator(directory)) { - if (entry.is_regular_file()) { - std::string path = entry.path().string(); - std::string ext = entry.path().extension().string(); - if (ext == ".gz") { - files.push_back(path); - } + { + ScopedTimer _t(stages, "collect_and_classify"); + + if (!directory.empty()) { + if (!fs::exists(directory)) { + DFTRACER_UTILS_LOG_ERROR("Directory does not exist: %s", + directory.c_str()); + co_return 1; } - } - if (files.empty()) { - DFTRACER_UTILS_LOG_ERROR( - "No compressed files found in directory: %s", - directory.c_str()); - return 1; - } - } else { - files = program.get>("--files"); - - if (files.empty()) { - DFTRACER_UTILS_LOG_ERROR( - "%s", "No files or directory specified. 
Use --help for usage."); - std::cerr << program; - return 1; + auto trusted_index_path = + internal::determine_index_path(directory, index_dir); + if (!force_rebuild && fs::exists(trusted_index_path)) { + if (summary_mode) { + ScopedTimer _rt(stages, "root_summary_read"); + auto root_result = + co_await load_root_info_summary(trusted_index_path); + + if (stages) stages->print_stages(); + + std::printf("==========================================\n"); + std::printf("DFTracer File Info Summary\n"); + std::printf("==========================================\n"); + std::printf(" Total Files: %zu\n", + root_result->file_count); + std::printf(" Successful: %zu\n", + root_result->file_count); + std::printf(" Failed: 0\n"); + std::printf(" Total Lines: %llu\n", + (unsigned long long)root_result->total_lines); + std::printf(" Valid Events: %llu\n", + (unsigned long long)root_result->total_events); + std::printf( + " Total Uncompressed: %s (%llu bytes)\n", + format_size(root_result->total_uncompressed).c_str(), + (unsigned long long)root_result->total_uncompressed); + if (root_result->total_events > 0) { + std::printf( + " Avg Bytes/Event: %.2f bytes\n", + static_cast( + root_result->total_uncompressed) / + static_cast(root_result->total_events)); + } + std::printf(" Processing Time: %.2f ms\n", + static_cast(overall.elapsed()) / 1e6); + std::printf("==========================================\n"); + co_return 0; + } + + { + ScopedTimer _lr(stages, "load_registry"); + auto registry_ptr = + co_await load_file_registry(trusted_index_path); + + files.reserve(registry_ptr->size()); + auto& group = indexed_groups[trusted_index_path]; + group.reserve(registry_ptr->size()); + std::size_t fi = 0; + for (auto& [logical_path, reg] : *registry_ptr) { + files.push_back(logical_path); + if (has_capability( + reg.capabilities, + IndexFileEntryCapability::FILE_SUMMARY)) { + group.push_back(ResolvedFile{fi, logical_path, + reg.file_id, + reg.capabilities}); + } + ++fi; + } + } + } else { + 
ScopedTimer _ds(stages, "scan_and_resolve"); + IndexResolverUtility resolver; + auto input = std::make_unique(); + input->directory = directory; + input->index_dir = index_dir; + auto result = co_await resolver.process(*input); + files = std::move(result.all_files); + if (!result.cached.empty()) { + indexed_groups[result.index_path] = + std::move(result.cached); + } + files_needing_index = std::move(result.needs_checkpoint); + } + } else { + ScopedTimer _rs(stages, "resolve_index_state"); + IndexResolverUtility resolver; + auto input = std::make_unique(); + input->files = cli->files_args.value; + input->index_dir = index_dir; + auto result = co_await resolver.process(*input); + files = std::move(result.all_files); + if (!result.cached.empty()) { + indexed_groups[result.index_path] = std::move(result.cached); + } + files_needing_index = std::move(result.needs_checkpoint); } } - // Small files skip indexing to avoid creating `.dftindex` stores on - // metadata-sensitive filesystems (e.g. Lustre). - static constexpr std::size_t INDEX_SIZE_THRESHOLD = - constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD; - std::unordered_set small_files; - for (const auto& file_path : files) { - std::error_code ec; - auto fsize = fs::file_size(file_path, ec); - if (!ec && fsize > 0 && fsize < INDEX_SIZE_THRESHOLD) { - small_files.insert(file_path); - } + if (files.empty()) { + DFTRACER_UTILS_LOG_ERROR("%s", "No files found. Use --help for usage."); + co_return 1; } - auto start_time = std::chrono::high_resolution_clock::now(); + std::vector all_results; + std::mutex results_mutex; - if (summary_mode) { - // Summary: accumulate totals in workers, print once at the end. - // No per-file storage, no sort, no per-file print. 
- std::atomic total_compressed{0}; - std::atomic total_uncompressed{0}; - std::atomic total_lines{0}; - std::atomic total_valid_events{0}; - std::atomic successful{0}; - std::atomic failed{0}; - - { - auto pipeline_config = PipelineConfig() - .with_name("DFTracer File Info") - .with_compute_threads(executor_threads) - .with_watchdog(false); - - Pipeline pipeline(pipeline_config); - - auto info_task = make_task( - [&](CoroScope& ctx) -> coro::CoroTask { - co_await ctx.scope([&](CoroScope& scope) - -> coro::CoroTask { - auto* files_ptr = &files; - auto* total_compressed_ptr = &total_compressed; - auto* total_uncompressed_ptr = &total_uncompressed; - auto* total_lines_ptr = &total_lines; - auto* total_valid_events_ptr = &total_valid_events; - auto* successful_ptr = &successful; - auto* failed_ptr = &failed; - - auto file_chan = coro::make_channel( - executor_threads * 2); - - scope.spawn( - [ch = file_chan->producer(), files_ptr]( - CoroScope&) mutable -> coro::CoroTask { - auto guard = ch.guard(); - for (std::size_t i = 0; i < files_ptr->size(); - ++i) { - if (!co_await ch.send(i)) { - co_return; - } - } - co_return; - }); - - for (std::size_t w = 0; w < executor_threads; ++w) { - scope.spawn( - [file_chan, files_ptr, total_compressed_ptr, - total_uncompressed_ptr, total_lines_ptr, - total_valid_events_ptr, successful_ptr, - failed_ptr]( - CoroScope&) -> coro::CoroTask { - while (auto fi_opt = - co_await file_chan->receive()) { - std::size_t fi = *fi_opt; - const auto& fp = (*files_ptr)[fi]; - - // Phase 1: build index if - // needed (skips small files - // and already-indexed files) - IndexBuilderUtility builder; - auto build_config = - IndexBuildConfig::for_file(fp) - .with_force_rebuild(false); - co_await builder.process(build_config); - - // Phase 2: read from index, - // fall back to direct scan - // for small/unindexed files - auto info = index_based_info(fp); - if (!info.success) { - info = - co_await direct_scan_info(fp); - } - - if (info.success) { - 
total_compressed_ptr->fetch_add( - info.compressed_size, - std::memory_order_relaxed); - total_uncompressed_ptr->fetch_add( - info.uncompressed_size, - std::memory_order_relaxed); - total_lines_ptr->fetch_add( - info.num_lines, - std::memory_order_relaxed); - total_valid_events_ptr->fetch_add( - info.valid_events, - std::memory_order_relaxed); - successful_ptr->fetch_add( - 1, std::memory_order_relaxed); - } else { - failed_ptr->fetch_add( - 1, std::memory_order_relaxed); - } - } - co_return; - }); + if (!indexed_groups.empty()) { + ScopedTimer _t(stages, "index_batch_read"); + co_await ctx.scope([&](CoroScope& scope) -> coro::CoroTask { + auto* all_results_ptr = &all_results; + auto* mutex_ptr = &results_mutex; + for (auto& [ip, group] : indexed_groups) { + auto idx_path_ptr = std::make_shared(ip); + auto entries_ptr = std::make_shared>( + std::move(group)); + scope.spawn( + [idx_path_ptr, entries_ptr, all_results_ptr, + mutex_ptr](CoroScope&) mutable -> coro::CoroTask { + auto infos = co_await process_index_group_info( + std::move(idx_path_ptr), std::move(entries_ptr)); + std::lock_guard lock(*mutex_ptr); + for (auto& info : infos) { + all_results_ptr->push_back(std::move(info)); } - co_return; }); - co_return; - }, - "CollectInfo"); + } + co_return; + }); + } - pipeline.set_source(info_task); - pipeline.set_destination(info_task); - pipeline.execute(); + if (!files_needing_index.empty()) { + ScopedTimer _t(stages, "auto_index_and_build"); + co_await auto_index_and_resolve(ctx, files_needing_index, index_dir, + checkpoint_size, executor_threads, + indexed_groups); + + if (!indexed_groups.empty()) { + ScopedTimer _t2(stages, "newly_indexed_batch_read"); + co_await ctx.scope([&](CoroScope& scope) -> coro::CoroTask { + auto* all_results_ptr = &all_results; + auto* mutex_ptr = &results_mutex; + for (auto& [ip, group] : indexed_groups) { + auto idx_path_ptr = std::make_shared(ip); + auto entries_ptr = + std::make_shared>( + std::move(group)); + scope.spawn( + 
[idx_path_ptr, entries_ptr, all_results_ptr, mutex_ptr]( + CoroScope&) mutable -> coro::CoroTask { + auto infos = co_await process_index_group_info( + std::move(idx_path_ptr), + std::move(entries_ptr)); + std::lock_guard lock(*mutex_ptr); + for (auto& info : infos) { + all_results_ptr->push_back(std::move(info)); + } + }); + } + co_return; + }); } + } - auto end_time = std::chrono::high_resolution_clock::now(); - std::chrono::duration duration = - end_time - start_time; - - auto tc = total_compressed.load(); - auto tu = total_uncompressed.load(); - auto tl = total_lines.load(); - auto tv = total_valid_events.load(); - auto ok = successful.load(); - auto bad = failed.load(); + if (stages) stages->print_stages(); + if (summary_mode) { + std::uint64_t total_uncompressed = 0; + std::uint64_t total_lines = 0; + std::uint64_t total_events = 0; + std::size_t ok = 0; + std::size_t bad = 0; + + for (const auto& r : all_results) { + if (r.success) { + total_uncompressed += r.uncompressed_size; + total_lines += r.num_lines; + total_events += r.valid_events; + ok++; + } else { + bad++; + } + } std::printf("==========================================\n"); std::printf("DFTracer File Info Summary\n"); @@ -547,174 +556,85 @@ int main(int argc, char** argv) { std::printf(" Total Files: %zu\n", files.size()); std::printf(" Successful: %zu\n", ok); std::printf(" Failed: %zu\n", bad); - std::printf(" Total Lines: %llu\n", (unsigned long long)tl); - std::printf(" Valid Events: %llu\n", (unsigned long long)tv); - std::printf(" Total Compressed: %s (%llu bytes)\n", - format_size(tc).c_str(), (unsigned long long)tc); + std::printf(" Total Lines: %llu\n", + (unsigned long long)total_lines); + std::printf(" Valid Events: %llu\n", + (unsigned long long)total_events); std::printf(" Total Uncompressed: %s (%llu bytes)\n", - format_size(tu).c_str(), (unsigned long long)tu); + format_size(total_uncompressed).c_str(), + (unsigned long long)total_uncompressed); - if (tc > 0 && tu > 0 && tc != tu) { 
- double ratio = 100.0 * (1.0 - static_cast(tc) / - static_cast(tu)); - std::printf(" Compression: %.2f%%\n", ratio); - } - - if (tv > 0) { + if (total_events > 0) { std::printf(" Avg Bytes/Event: %.2f bytes\n", - static_cast(tu) / static_cast(tv)); + static_cast(total_uncompressed) / + static_cast(total_events)); } - std::printf(" Processing Time: %.2f seconds\n", - duration.count() / 1000.0); + std::printf(" Processing Time: %.2f ms\n", + static_cast(overall.elapsed()) / 1e6); std::printf("==========================================\n"); - return (bad == 0) ? 0 : 1; + co_return (bad == 0) ? 0 : 1; } - // Detailed mode: per-file output (original behavior) - struct IndexedResult { - std::size_t index; - MetadataCollectorUtilityOutput info; - }; - - std::vector results; - std::mutex results_mutex; - - { - auto pipeline_config = PipelineConfig() - .with_name("DFTracer File Info") - .with_compute_threads(executor_threads) - .with_watchdog(false); - - Pipeline pipeline(pipeline_config); - - auto info_task = make_task( - [&](CoroScope& ctx) -> coro::CoroTask { - co_await ctx.scope([&](CoroScope& scope) - -> coro::CoroTask { - auto* files_ptr = &files; - auto* results_ptr = &results; - auto* results_mutex_ptr = &results_mutex; - - auto small_set = - std::make_shared>( - small_files); - - auto file_chan = - coro::make_channel(executor_threads * 2); - - scope.spawn( - [ch = file_chan->producer(), files_ptr]( - CoroScope&) mutable -> coro::CoroTask { - auto guard = ch.guard(); - for (std::size_t i = 0; i < files_ptr->size(); - ++i) { - if (!co_await ch.send(i)) { - co_return; - } - } - co_return; - }); - - for (std::size_t w = 0; w < executor_threads; ++w) { - scope.spawn([file_chan, files_ptr, checkpoint_size, - force_rebuild, verbose, index_dir, - small_set, results_ptr, results_mutex_ptr]( - CoroScope&) -> coro::CoroTask { - while (auto fi_opt = - co_await file_chan->receive()) { - std::size_t fi = *fi_opt; - const auto& file_path = (*files_ptr)[fi]; - bool is_small = 
small_set->count(file_path) > 0; - - MetadataCollectorUtilityOutput info; - if (is_small) { - info = co_await direct_scan_info(file_path); - } else { - auto input = - MetadataCollectorUtilityInput:: - from_file(file_path) - .with_checkpoint_size( - checkpoint_size) - .with_force_rebuild( - force_rebuild) - .with_compute_hash(verbose); - - if (!index_dir.empty()) { - input.with_index( - internal::determine_index_path( - file_path, index_dir)); - } - - MetadataCollectorUtility collector; - info = co_await collector.process(input); - } - - std::lock_guard lock( - *results_mutex_ptr); - results_ptr->push_back({fi, std::move(info)}); - } - co_return; - }); - } - co_return; - }); - co_return; - }, - "CollectInfo"); - - pipeline.set_source(info_task); - pipeline.set_destination(info_task); - pipeline.execute(); + for (const auto& r : all_results) { + print_file_info(r, verbose); } - std::sort(results.begin(), results.end(), - [](const IndexedResult& a, const IndexedResult& b) { - return a.index < b.index; - }); - - std::uint64_t total_compressed = 0; - std::uint64_t total_uncompressed = 0; - std::uint64_t total_lines = 0; - std::size_t successful = 0; - - for (const auto& r : results) { - print_file_info(r.info, verbose); - if (r.info.success) { - successful++; - total_compressed += r.info.compressed_size; - total_uncompressed += r.info.uncompressed_size; - total_lines += r.info.num_lines; + if (files.size() > 1) { + std::uint64_t total_uncompressed = 0; + std::uint64_t total_lines = 0; + std::size_t ok = 0; + + for (const auto& r : all_results) { + if (r.success) { + ok++; + total_uncompressed += r.uncompressed_size; + total_lines += r.num_lines; + } } - } - - auto end_time = std::chrono::high_resolution_clock::now(); - std::chrono::duration duration = end_time - start_time; - if (files.size() > 1) { std::printf("==========================================\n"); std::printf("Summary\n"); std::printf("==========================================\n"); std::printf("Total Files: 
%zu\n", files.size()); - std::printf("Successful: %zu\n", successful); - std::printf("Failed: %zu\n", files.size() - successful); + std::printf("Successful: %zu\n", ok); + std::printf("Failed: %zu\n", files.size() - ok); std::printf("Total Lines: %llu\n", (unsigned long long)total_lines); - std::printf("Total Compressed: %s\n", - format_size(total_compressed).c_str()); std::printf("Total Uncompressed: %s\n", format_size(total_uncompressed).c_str()); + std::printf("Processing Time: %.2f ms\n", + static_cast(overall.elapsed()) / 1e6); + } - if (total_uncompressed > 0) { - double ratio = - 100.0 * (1.0 - static_cast(total_compressed) / - static_cast(total_uncompressed)); - std::printf("Overall Compression: %.2f%%\n", ratio); - } + co_return 0; +} - std::printf("Processing Time: %.2f seconds\n", - duration.count() / 1000.0); - } +int main(int argc, char** argv) { + DFTRACER_UTILS_LOGGER_INIT(); + + argparse::ArgumentParser program("dftracer_info", + DFTRACER_UTILS_PACKAGE_VERSION); + program.add_description( + "Display metadata and index information for DFTracer compressed files " + "using composable utilities and pipeline processing"); + + InfoArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; + + auto pipeline_config = + cli::build_pipeline_config("DFTracer Info", cli.pipeline); + Pipeline pipeline(pipeline_config); + + auto info_task = make_task( + [&cli](CoroScope& ctx) -> coro::CoroTask { + co_return co_await run_info(ctx, &cli); + }, + "InfoMain"); - return (successful == files.size()) ? 
0 : 1; + pipeline.set_source(info_task); + pipeline.set_destination(info_task); + pipeline.execute(); + return info_task->get(); } diff --git a/src/dftracer/utils/binaries/dftracer_merge.cpp b/src/dftracer/utils/binaries/dftracer_merge.cpp index 50264115..5924a5f1 100644 --- a/src/dftracer/utils/binaries/dftracer_merge.cpp +++ b/src/dftracer/utils/binaries/dftracer_merge.cpp @@ -1,19 +1,96 @@ #include -#include #include -#include #include #include -#include #include #include -#include #include +#include "common_cli.h" + using namespace dftracer::utils; using namespace dftracer::utils::utilities::composites; +class MergeArgParse : public cli::ArgParse { + public: + cli::DirectoryArgs directory{cli::DirMode::DEFAULT_DOT, + "Directory containing .pfw or .pfw.gz files"}; + cli::PipelineArgs pipeline; + cli::WatchdogArgs watchdog; + + bool force = false; + std::string output; + bool compress = false; + bool verbose = false; + bool gzip_only = false; + bool verify = false; + std::size_t channel_capacity = 100; + std::size_t batch_size_kb = 256; + + explicit MergeArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + schema(directory, pipeline, watchdog); + } + + protected: + void register_args() override { + parser() + .add_argument("-f", "--force") + .help("Override existing output file and force index recreation") + .flag(); + + parser() + .add_argument("-o", "--output") + .help("Output file path (should have .pfw extension)") + .default_value("combined.pfw"); + + parser() + .add_argument("-c", "--compress") + .help("Compress output file with gzip") + .flag(); + + parser() + .add_argument("-v", "--verbose") + .help("Enable verbose mode") + .flag(); + + parser() + .add_argument("-g", "--gzip-only") + .help("Process only .pfw.gz files") + .flag(); + + parser() + .add_argument("--verify") + .help("Verify merged output by comparing input/output hashes") + .flag(); + + parser() + .add_argument("--channel-capacity") + .help("Channel buffer capacity for batch streaming 
(default: 100)") + .scan<'d', std::size_t>() + .default_value(static_cast(100)); + + parser() + .add_argument("--batch-size") + .help("Batch byte budget in KB (default: 256)") + .scan<'d', std::size_t>() + .default_value(static_cast(256)); + } + + void post_parse() override { + force = parser().get("--force"); + output = parser().get("--output"); + compress = parser().get("--compress"); + verbose = parser().get("--verbose"); + gzip_only = parser().get("--gzip-only"); + verify = parser().get("--verify"); + channel_capacity = parser().get("--channel-capacity"); + batch_size_kb = parser().get("--batch-size"); + } +}; + +static int run_merge(const MergeArgParse& cli); + int main(int argc, char** argv) { DFTRACER_UTILS_LOGGER_INIT(); @@ -23,117 +100,24 @@ int main(int argc, char** argv) { "Merge DFTracer .pfw or .pfw.gz files into a single JSON array file " "using streaming producer-consumer pattern"); - program.add_argument("-d", "--directory") - .help("Directory containing .pfw or .pfw.gz files") - .default_value("."); - - program.add_argument("-o", "--output") - .help("Output file path (should have .pfw extension)") - .default_value("combined.pfw"); - - program.add_argument("-f", "--force") - .help("Override existing output file and force index recreation") - .flag(); - - program.add_argument("-c", "--compress") - .help("Compress output file with gzip") - .flag(); - - program.add_argument("-v", "--verbose").help("Enable verbose mode").flag(); - - program.add_argument("-g", "--gzip-only") - .help("Process only .pfw.gz files") - .flag(); - - program.add_argument("--executor-threads") - .help( - "Number of executor threads for parallel processing (default: " - "number of CPU cores)") - .scan<'d', std::size_t>() - .default_value( - static_cast(dftracer_utils_hardware_concurrency())); - - program.add_argument("--verify") - .help("Verify merged output by comparing input/output hashes") - .flag(); - - program.add_argument("--channel-capacity") - .help("Channel buffer 
capacity for batch streaming (default: 100)") - .scan<'d', std::size_t>() - .default_value(static_cast(100)); - - program.add_argument("--batch-size") - .help("Batch byte budget in KB (default: 256)") - .scan<'d', std::size_t>() - .default_value(static_cast(256)); - - program.add_argument("--disable-watchdog") - .help("Disable watchdog for hang detection") - .flag(); - - program.add_argument("--watchdog-global-timeout") - .help( - "Watchdog global timeout for pipeline execution in seconds (0 = no " - "timeout)") - .scan<'d', int>() - .default_value(0); - - program.add_argument("--watchdog-task-timeout") - .help("Watchdog default task timeout in seconds (0 = no timeout)") - .scan<'d', int>() - .default_value(0); - - program.add_argument("--watchdog-interval") - .help("Watchdog check interval in seconds") - .scan<'d', int>() - .default_value(1); - - program.add_argument("--watchdog-warning-threshold") - .help("Watchdog long-running task warning threshold in seconds") - .scan<'d', int>() - .default_value(300); - - program.add_argument("--watchdog-idle-timeout") - .help("Watchdog idle timeout in seconds (0 = use default)") - .scan<'d', int>() - .default_value(300); - - program.add_argument("--watchdog-deadlock-timeout") - .help("Watchdog deadlock timeout in seconds (0 = use default)") - .scan<'d', int>() - .default_value(600); - - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - DFTRACER_UTILS_LOG_ERROR("Error occurred: %s", err.what()); - std::cerr << program << std::endl; - return 1; - } + MergeArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; - std::string input_dir = program.get("--directory"); - std::string output_file = program.get("--output"); - bool force_override = program.get("--force"); - bool compress_output = program.get("--compress"); - [[maybe_unused]] bool verbose = program.get("--verbose"); - bool gzip_only = program.get("--gzip-only"); - bool verify = program.get("--verify"); - std::size_t 
executor_threads = - program.get("--executor-threads"); - std::size_t channel_capacity = - program.get("--channel-capacity"); - std::size_t batch_size_kb = program.get("--batch-size"); - std::size_t batch_byte_budget = batch_size_kb * 1024; - bool disable_watchdog = program.get("--disable-watchdog"); - int global_timeout = program.get("--watchdog-global-timeout"); - int task_timeout = program.get("--watchdog-task-timeout"); - int watchdog_interval = program.get("--watchdog-interval"); - int warning_threshold = program.get("--watchdog-warning-threshold"); - int idle_timeout = program.get("--watchdog-idle-timeout"); - int deadlock_timeout = program.get("--watchdog-deadlock-timeout"); + return run_merge(cli); +} - input_dir = fs::absolute(input_dir).string(); - output_file = fs::absolute(output_file).string(); +static int run_merge(const MergeArgParse& cli) { + const auto input_dir = fs::absolute(cli.directory.value).string(); + const auto output_file = fs::absolute(cli.output).string(); + const auto force_override = cli.force; + const auto compress_output = cli.compress; + [[maybe_unused]] const auto verbose = cli.verbose; + const auto gzip_only = cli.gzip_only; + const auto verify = cli.verify; + const auto channel_capacity = cli.channel_capacity; + const auto batch_size_kb = cli.batch_size_kb; + std::size_t batch_byte_budget = batch_size_kb * 1024; if (output_file.size() < 4 || output_file.substr(output_file.size() - 4) != ".pfw") { @@ -194,16 +178,12 @@ int main(int argc, char** argv) { std::printf(" Verify: %s\n", verify ? 
"true" : "false"); std::printf(" Channel capacity: %zu\n", channel_capacity); std::printf(" Batch size: %zu KB\n", batch_size_kb); - std::printf(" Executor threads: %zu\n", executor_threads); + std::printf(" Executor threads: %zu\n", cli.pipeline.executor_threads); std::printf("==========================================\n\n"); auto start_time = std::chrono::high_resolution_clock::now(); - // Step 1: Create channel and buffer pool for streaming batches auto channel = coro::make_channel(channel_capacity); - // Pool size = channel capacity + num producers, so producers never block - // waiting for buffers (avoids deadlock when all executor threads are - // producers and the consumer can't run to release buffers). std::size_t pool_size = channel_capacity + input_files.size(); auto buf_pool = make_buffer_pool(pool_size, [batch_byte_budget]() { @@ -216,23 +196,10 @@ int main(int argc, char** argv) { producer_results.resize(input_files.size()); StreamingFileConsumerOutput consumer_result; - // Step 2: Create pipeline - auto pipeline_config = - PipelineConfig() - .with_name("DFTracer Merge") - .with_compute_threads(executor_threads) - .with_watchdog(!disable_watchdog) - .with_global_timeout(std::chrono::seconds(global_timeout)) - .with_task_timeout(std::chrono::seconds(task_timeout)) - .with_watchdog_interval(std::chrono::seconds(watchdog_interval)) - .with_warning_threshold(std::chrono::seconds(warning_threshold)) - .with_executor_idle_timeout(std::chrono::seconds(idle_timeout)) - .with_executor_deadlock_timeout( - std::chrono::seconds(deadlock_timeout)); - + auto pipeline_config = cli::build_pipeline_config( + "DFTracer Merge", cli.pipeline, cli.watchdog); Pipeline pipeline(pipeline_config); - // Step 3: Create producer tasks std::vector> producer_tasks; for (std::size_t i = 0; i < input_files.size(); ++i) { auto* input_files_ptr = &input_files; @@ -259,7 +226,6 @@ int main(int argc, char** argv) { producer_tasks.push_back(producer_task); } - // Step 4: Create consumer 
task auto* consumer_result_ptr = &consumer_result; auto consumer_task = make_task( [channel, buf_pool, output_file, compress_output, @@ -275,7 +241,6 @@ int main(int argc, char** argv) { }, "Consumer"); - // Step 5: Execute pipeline std::vector> all_tasks; all_tasks.insert(all_tasks.end(), producer_tasks.begin(), producer_tasks.end()); diff --git a/src/dftracer/utils/binaries/dftracer_organize.cpp b/src/dftracer/utils/binaries/dftracer_organize.cpp index 1a1f9259..8d78e4cc 100644 --- a/src/dftracer/utils/binaries/dftracer_organize.cpp +++ b/src/dftracer/utils/binaries/dftracer_organize.cpp @@ -1,49 +1,612 @@ +#include #include -#include #include -#include +#include +#include #include #include -#include #include #include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include #include #include -#include -#include +#include +#include +#include +#include +#include +#include -#include +#include #include -#include -#include +#include +#include #include +#include +#include #include +#include "common_cli.h" + using namespace dftracer::utils; using namespace dftracer::utils::utilities; using namespace dftracer::utils::utilities::composites; using namespace dftracer::utils::utilities::composites::dft; +using namespace dftracer::utils::utilities::composites::dft::indexing; using namespace dftracer::utils::utilities::composites::dft::reorganize; +using namespace dftracer::utils::utilities::composites::dft::aggregators; using namespace dftracer::utils::utilities::indexer; +class OrganizeArgParse : public cli::ArgParse { + public: + cli::DirectoryArgs directory{cli::DirMode::DEFAULT_EMPTY}; + cli::FilesArgs files_args; + cli::PipelineArgs pipeline; + cli::IndexingArgs indexing; + + std::string output_dir; + std::vector group_specs; + std::size_t chunk_size_mb = 0; // 0 = auto (one file/group on Lustre) + bool no_compress = false; + int compression_level = 1; + bool with_aggregation = false; + double 
time_interval_ms = 5000.0; + std::size_t memory_budget_mb = 0; // 0 = auto-detect + std::size_t estimated_file_bytes_mb = 0; // 0 = auto from input sizes + + explicit OrganizeArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + indexing.force_help = "Force rebuild of indices"; + schema(directory, files_args, pipeline, indexing); + } + + protected: + void register_args() override { + parser() + .add_argument("-o", "--output") + .help("Output directory") + .required(); + + parser() + .add_argument("--groups") + .help( + "Predicate groups: \"io:cat==\\\"POSIX\\\"\" " + "\"compute:cat==\\\"APP\\\"\"") + .nargs(argparse::nargs_pattern::at_least_one) + .required(); + + parser() + .add_argument("--chunk-size") + .help( + "Target chunk size in MB. 0 = auto: one file per group on " + "Lustre, 256 MB rotation elsewhere (default: 0)") + .scan<'d', std::size_t>() + .default_value(static_cast(0)); + + parser() + .add_argument("--no-compress") + .help("Write plain .pfw instead of .pfw.gz") + .flag(); + + parser() + .add_argument("--compression-level") + .help("Gzip compression level (0-9, default: 1)") + .scan<'d', int>() + .default_value(1); + + parser() + .add_argument("--with-aggregation") + .help( + "Build aggregation index on organized chunks so downstream " + "analyzers skip the first-read aggregation cost") + .flag(); + + parser() + .add_argument("--time-interval-ms") + .help( + "Aggregation bucket size in ms (used with --with-aggregation, " + "default: 5000)") + .scan<'g', double>() + .default_value(5000.0); + + parser() + .add_argument("--memory-budget-mb") + .help( + "Peak memory budget in MB for input file indexing. 0 = auto " + "(50%% of detected available memory). Bounds peak RSS on " + "large workloads by processing input files in batches.") + .scan<'d', std::size_t>() + .default_value(static_cast(0)); + + parser() + .add_argument("--estimated-file-bytes-mb") + .help( + "Per-file peak memory estimate in MB for index build. 
" + "0 = auto (sample input file sizes and apply " + "gzip/JSON expansion factor). Combined with " + "--memory-budget-mb to derive flush_every_files.") + .scan<'d', std::size_t>() + .default_value(static_cast(0)); + } + + void post_parse() override { + output_dir = parser().get("--output"); + group_specs = parser().get>("--groups"); + chunk_size_mb = parser().get("--chunk-size"); + no_compress = parser().get("--no-compress"); + compression_level = parser().get("--compression-level"); + with_aggregation = parser().get("--with-aggregation"); + time_interval_ms = parser().get("--time-interval-ms"); + memory_budget_mb = parser().get("--memory-budget-mb"); + estimated_file_bytes_mb = + parser().get("--estimated-file-bytes-mb"); + } +}; + namespace { -coro::CoroTask run_organize(const std::string& output_dir, - const std::string& index_dir, - const std::vector& files, - const std::vector& groups, - std::size_t checkpoint_size, - bool force_rebuild, bool no_compress, - std::size_t executor_threads, - std::size_t chunk_size_mb) { +// Forward decl: defined below at first use site. +using ChunkLayoutMap = std::unordered_map< + std::string, std::vector>; + +struct OrganizeResult { + std::size_t total_events_written = 0; + std::size_t total_events_unmatched = 0; + std::size_t chunks_created = 0; + std::size_t source_files_processed = 0; + std::vector output_files; + /// Per-chunk-file gzip-member layout, captured during Phase 3 by the + /// striped writer so Phase 4 indexing can slice without re-scanning. 
+ ChunkLayoutMap chunk_layouts; + std::unordered_set inline_indexed_groups; + bool success = false; +}; + +struct GroupRuntime { + std::string name; + std::string group_index_dir; + std::string staging_root; + std::shared_ptr< + moodycamel::ConcurrentQueue> + artifacts_queue; + std::shared_ptr> batch_counter; + std::atomic indexed_inline{false}; +}; + +static coro::CoroTask run_group_writer_task( + CoroScope* inner_scope, GroupWriterConfig writer_config, + std::atomic* total_events_ptr, + std::atomic* chunks_ptr, + std::vector* output_files_ptr, std::mutex* output_mutex_ptr, + ChunkLayoutMap* chunk_layouts_ptr, GroupRuntime* runtime_ptr) { + auto writer_result = co_await run_group_writer(inner_scope, writer_config); + + if (writer_result.success) { + total_events_ptr->fetch_add(writer_result.events_written); + chunks_ptr->fetch_add(writer_result.chunks_created); + if (runtime_ptr) { + runtime_ptr->indexed_inline.store(writer_result.indexed_inline, + std::memory_order_release); + } + + std::lock_guard lock(*output_mutex_ptr); + for (const auto& f : writer_result.output_files) { + output_files_ptr->push_back(f); + } + if (chunk_layouts_ptr) { + for (auto& cl : writer_result.chunk_layouts) { + (*chunk_layouts_ptr)[cl.path] = std::move(cl.members); + } + } + } else { + DFTRACER_UTILS_LOG_ERROR("GroupWriter failed for %s: %s", + writer_config.group_name.c_str(), + writer_result.error_message.c_str()); + } +} + +static coro::CoroTask run_group_indexing( + CoroScope* scope, const std::string& group_output_dir, + const std::vector& chunk_files, + const AggregationConfig* agg_config, std::size_t checkpoint_size, + std::size_t parallelism, std::size_t flush_every_files, + const ChunkLayoutMap* writer_layouts) { + if (chunk_files.empty()) co_return; + + const std::string index_path = + dft::internal::determine_index_path(group_output_dir); + fs::create_directories(index_path); + + std::shared_ptr agg_db; + std::unique_ptr merger; + if (agg_config) { + agg_db = 
EventAggregator::open_with_merge_operator(index_path); + merger = std::make_unique(agg_db, /*config_hash=*/0u); + } + + std::vector chunk_file_ids; + { + IndexDatabase coord_db(index_path); + coord_db.init_schema(); + chunk_file_ids = + coord_db.register_files(chunk_files, /*build_manifest=*/true); + } + + // Per-chunk-file gzip member layout. Prefer writer-captured layout (no + // I/O, exact); fall back to a post-write scan for chunks the writer + // didn't track (sharded/padded layouts return empty spans). + auto member_map = std::make_shared>>( + chunk_files.size()); + std::size_t scanned = 0; + std::size_t from_writer = 0; + for (std::size_t fi = 0; fi < chunk_files.size(); ++fi) { + if (writer_layouts) { + auto it = writer_layouts->find(chunk_files[fi]); + if (it != writer_layouts->end() && !it->second.empty()) { + auto& dst = (*member_map)[fi]; + dst.reserve(it->second.size()); + for (const auto& span : it->second) { + dst.push_back({span.offset, span.length}); + } + ++from_writer; + continue; + } + } + int fd = ::open(chunk_files[fi].c_str(), O_RDONLY); + if (fd < 0) continue; + struct stat st; + if (::fstat(fd, &st) == 0 && st.st_size >= 18) { + co_await dftracer::utils::utilities::indexer::internal:: + enumerate_gzip_member_candidates( + fd, static_cast(st.st_size), + (*member_map)[fi]); + } + ::close(fd); + ++scanned; + } + DFTRACER_UTILS_LOG_INFO( + "Phase 4 group '%s': layouts from writer=%zu, rescanned=%zu", + group_output_dir.c_str(), from_writer, scanned); + + // Build per-file slices targeting + std::vector sliced_file_paths; + std::vector sliced_file_ids; + std::vector sliced_slices; + for (std::size_t fi = 0; fi < chunk_files.size(); ++fi) { + const auto& members = (*member_map)[fi]; + if (members.size() <= 1) { + sliced_file_paths.push_back(chunk_files[fi]); + sliced_file_ids.push_back(chunk_file_ids[fi]); + sliced_slices.push_back({}); + continue; + } + std::uint64_t total_c = 0; + for (const auto& m : members) total_c += m.c_size; + const 
std::size_t target_units = + std::max(parallelism, std::size_t(1)); + const std::uint64_t target_c = + (total_c + target_units - 1) / target_units; + std::size_t begin = 0; + std::uint64_t accum = 0; + bool first_slice_for_file = true; + for (std::size_t i = 0; i < members.size(); ++i) { + accum += members[i].c_size; + const bool is_last = (i + 1 == members.size()); + if ((target_c > 0 && accum >= target_c) || is_last) { + IndexBuildBatchConfig::FileSlice s; + s.members = &(*member_map)[fi]; + s.member_begin = begin; + s.member_end = i + 1; + constexpr std::uint64_t CKPT_STRIDE = 1u << 20; + s.checkpoint_idx_base = + static_cast(begin) * CKPT_STRIDE; + // Only the first slice persists file-scoped data + // (chunk_bloom/file_bloom/manifest/file_metadata). Subsequent + // slices contribute aggregation/system_metrics SSTs only. + s.skip_file_scoped_writes = !first_slice_for_file; + first_slice_for_file = false; + + sliced_file_paths.push_back(chunk_files[fi]); + sliced_file_ids.push_back(chunk_file_ids[fi]); + sliced_slices.push_back(s); + begin = i + 1; + accum = 0; + } + } + } + DFTRACER_UTILS_LOG_INFO( + "Phase 4 group indexing: %zu chunk files -> %zu slices " + "(parallelism=%zu)", + chunk_files.size(), sliced_file_paths.size(), parallelism); + + const std::string staging_root = + (fs::path(index_path) / ".dftindex_staging").string(); + fs::create_directories(staging_root); + auto artifacts_queue = std::make_shared>(); + auto batch_counter = std::make_shared>(0); + + auto batch_config = std::make_shared(); + batch_config->file_paths = std::move(sliced_file_paths); + batch_config->preassigned_file_ids = std::move(sliced_file_ids); + batch_config->file_slices = std::move(sliced_slices); + batch_config->index_dir = group_output_dir; + batch_config->checkpoint_size = checkpoint_size; + batch_config->parallelism = parallelism; + batch_config->force_rebuild = false; + batch_config->build_manifest = true; + batch_config->use_batch_write = true; + 
batch_config->flush_every_files = flush_every_files; + batch_config->sink_factory = + [staging_root, batch_counter]() -> std::unique_ptr { + const std::size_t idx = + batch_counter->fetch_add(1, std::memory_order_relaxed); + return std::make_unique( + staging_root, "batch_" + std::to_string(idx)); + }; + batch_config->sink_commit = [artifacts_queue](IndexBatchSink& sink) { + auto& sst = static_cast(sink); + auto a = sst.commit(); + if (!a.empty()) artifacts_queue->enqueue(std::move(a)); + }; + + if (agg_config) { + auto agg_config_ptr = std::make_shared(*agg_config); + batch_config->dft_visitor_factory = + [agg_db, agg_config_ptr](const std::string& file_path) + -> std::vector> { + std::vector> + visitors; + visitors.push_back(std::make_unique( + agg_db, /*config_hash=*/0u, *agg_config_ptr, file_path)); + return visitors; + }; + auto* merger_ptr = merger.get(); + batch_config->extra_visitors_drain = + [merger_ptr](std::vector>> + per_file) { + for (auto& file_visitors : per_file) { + for (auto& visitor : file_visitors) { + auto* agg_visitor = + dynamic_cast(visitor.get()); + if (!agg_visitor) continue; + for (const auto& k : agg_visitor->observed_extra_keys()) + merger_ptr->add_observed_extra_key(k); + for (const auto& m : + agg_visitor->observed_custom_metrics()) + merger_ptr->add_observed_custom_metric(m); + merger_ptr->merge_chunk(agg_visitor->take_output()); + } + } + }; + } + + auto batch_result = co_await IndexBatchBuilderUtility::process( + scope, std::move(batch_config)); + + { + SstArtifactRegistry registry; + IndexDatabaseSstWriterContext::Artifacts a; + while (artifacts_queue->try_dequeue(a)) { + registry.append(std::move(a)); + } + IndexDatabase ingest_db(index_path); + ingest_db.bulk_ingest(registry, {}); + std::error_code ec; + fs::remove_all(staging_root, ec); + } + + if (!agg_config) co_return; + + namespace rcf = dftracer::utils::rocksdb::cf; + IndexDatabase idx_db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + + 
auto batch = agg_db->begin_batch(); + AggGlobalConfig global_cfg; + global_cfg.time_interval_us = agg_config->time_interval_us; + global_cfg.config_hash = 0; + agg_db->put(batch, rcf::AGGREGATION, + std::string_view(AGG_GLOBAL_CONFIG_KEY, 2), + serialize_agg_global_config(global_cfg)); + for (const auto& chunk_path : chunk_files) { + int file_id = idx_db.find_file(chunk_path); + if (file_id >= 0) { + auto key = make_agg_file_key(file_id); + agg_db->put(batch, rcf::AGGREGATION, key, ""); + } + } + agg_db->commit_batch(batch); +} + +static coro::CoroTask run_manifest_extractor_task( + ManifestExtractorConfig extractor_config) { + auto extract_result = co_await extract_from_manifest(extractor_config); + if (!extract_result.success) { + DFTRACER_UTILS_LOG_WARN("ManifestExtractor failed for %s: %s", + extractor_config.file_path.c_str(), + extract_result.error_message.c_str()); + } +} + +struct ProducerScopeInput { + ResolverResult resolver_result; + std::vector files_needing_index; + std::vector + manifest_entries; // Files with manifest for extraction + std::uint64_t checkpoint_size; + std::size_t executor_threads; + bool force_rebuild; + std::size_t flush_every_files; // per-flush sub-batch inside indexer + std::vector groups; + std::vector>>> + group_channels; + std::unordered_map file_index_map; +}; + +static coro::CoroTask run_producer_scope(CoroScope* producer_scope, + ProducerScopeInput* input) { + if (!input->files_needing_index.empty()) { + const std::size_t total_files = input->files_needing_index.size(); + const std::size_t flush_every = + std::max(input->flush_every_files, std::size_t(1)); + std::printf( + " Processing %zu files needing index (flush_every=%zu)...\n", + total_files, flush_every); + + auto factory = [input](const std::string& file_path) + -> std::vector> { + std::size_t file_idx = 0; + if (auto it = input->file_index_map.find(file_path); + it != input->file_index_map.end()) { + file_idx = it->second; + } + + OrganizeVisitorConfig 
visitor_config; + visitor_config.groups = input->groups; + visitor_config.group_channels = input->group_channels; + visitor_config.source_file_idx = file_idx; + + std::vector> visitors; + visitors.push_back( + std::make_unique(std::move(visitor_config))); + return visitors; + }; + + auto batch_config = std::make_shared(); + batch_config->file_paths.reserve(total_files); + for (const auto& item : input->files_needing_index) { + batch_config->file_paths.push_back(item.file_path); + } + const std::string& input_index_path = input->resolver_result.index_path; + + std::vector preassigned_file_ids; + { + IndexDatabase coord_db(input_index_path); + coord_db.init_schema(); + preassigned_file_ids = coord_db.register_files( + batch_config->file_paths, /*build_manifest=*/true); + } + const std::string staging_root = + (fs::path(input_index_path) / ".dftindex_staging").string(); + fs::create_directories(staging_root); + auto artifacts_queue = std::make_shared>(); + auto batch_counter = std::make_shared>(0); + + batch_config->preassigned_file_ids = std::move(preassigned_file_ids); + batch_config->index_dir = input_index_path; + batch_config->checkpoint_size = input->checkpoint_size; + batch_config->parallelism = input->executor_threads; + batch_config->force_rebuild = input->force_rebuild; + batch_config->build_manifest = true; + batch_config->use_batch_write = true; + batch_config->rebuild_root_summaries = false; + batch_config->flush_every_files = flush_every; + batch_config->dft_visitor_factory = factory; + batch_config->sink_factory = + [staging_root, batch_counter]() -> std::unique_ptr { + const std::size_t idx = + batch_counter->fetch_add(1, std::memory_order_relaxed); + return std::make_unique( + staging_root, "batch_" + std::to_string(idx)); + }; + batch_config->sink_commit = [artifacts_queue](IndexBatchSink& sink) { + auto& sst = static_cast(sink); + auto a = sst.commit(); + if (!a.empty()) artifacts_queue->enqueue(std::move(a)); + }; + + co_await 
IndexBatchBuilderUtility::process(producer_scope, + batch_config); + + SstArtifactRegistry registry; + { + IndexDatabaseSstWriterContext::Artifacts a; + while (artifacts_queue->try_dequeue(a)) { + registry.append(std::move(a)); + } + } + { + IndexDatabase ingest_db(input_index_path); + ingest_db.bulk_ingest(registry, {}); + ingest_db.rebuild_root_summaries(); + } + std::error_code ec; + fs::remove_all(staging_root, ec); + } + + if (!input->manifest_entries.empty()) { + std::printf(" Processing %zu files via manifest extraction...\n", + input->manifest_entries.size()); + + const auto& index_path = input->resolver_result.index_path; + for (const auto& entry : input->manifest_entries) { + ManifestExtractorConfig extractor_config; + extractor_config.file_path = entry.file_path; + extractor_config.index_path = index_path; + extractor_config.source_file_idx = entry.file_index; + extractor_config.groups = input->groups; + extractor_config.group_channels = input->group_channels; + + producer_scope->spawn( + [extractor_config](CoroScope&) -> coro::CoroTask { + co_await run_manifest_extractor_task(extractor_config); + }); + } + } +} + +coro::CoroTask run_organize(const OrganizeArgParse* cli) { + const auto& output_dir = cli->output_dir; + const auto& index_dir = cli->indexing.index_dir; + const auto checkpoint_size = cli->indexing.checkpoint_size; + const auto force_rebuild = cli->indexing.force; + const auto no_compress = cli->no_compress; + const auto compression_level = cli->compression_level; + const auto executor_threads = cli->pipeline.executor_threads; + const bool time_profiling = cli->pipeline.time_profiling; + + auto groups = parse_group_specs(cli->group_specs); + if (groups.empty()) { + DFTRACER_UTILS_LOG_ERROR("%s", "No groups specified."); + co_return 1; + } + + fs::create_directories(output_dir); + + std::size_t chunk_size_mb = cli->chunk_size_mb; + if (chunk_size_mb == 0) { + auto layout = fileio::parallel::detect_layout(output_dir); + if (layout.fs == 
fileio::parallel::FilesystemKind::LUSTRE) { + chunk_size_mb = 0; // single file per group, no rotation + } else { + chunk_size_mb = 256; + } + } + std::printf("==========================================\n"); - std::printf("DFTracer Trace Reorganizer\n"); + std::printf("DFTracer Trace Reorganizer (Streaming)\n"); std::printf("==========================================\n"); - std::printf(" Input files: %zu\n", files.size()); std::printf(" Output directory: %s\n", output_dir.c_str()); - std::printf(" Chunk size: %zu MB\n", chunk_size_mb); + if (chunk_size_mb == 0) { + std::printf(" Chunk size: auto (one file per group)\n"); + } else { + std::printf(" Chunk size: %zu MB\n", chunk_size_mb); + } std::printf(" Compress: %s\n", no_compress ? "false" : "true"); std::printf(" Executor threads: %zu\n", executor_threads); std::printf(" Groups: %zu\n", groups.size()); @@ -53,203 +616,444 @@ coro::CoroTask run_organize(const std::string& output_dir, } std::printf("==========================================\n\n"); - auto start_time = std::chrono::high_resolution_clock::now(); + Timer stages_storage("dftracer_organize"); + Timer* stages = time_profiling ? 
&stages_storage : nullptr; + Timer overall(true); - // Step 1: Build indices - std::printf("Step 1: Building indices...\n"); - { - auto pipeline_config = PipelineConfig() - .with_name("Organize: Build IDX") - .with_compute_threads(executor_threads) - .with_watchdog(false); - - Pipeline pipeline(pipeline_config); - - std::atomic built_count{0}; - std::atomic skipped_count{0}; - - auto build_task = make_task( - [&](CoroScope& ctx) -> coro::CoroTask { - co_await ctx.scope([&](CoroScope& scope) - -> coro::CoroTask { - auto* built_ptr = &built_count; - auto* skipped_ptr = &skipped_count; - for (std::size_t i = 0; i < files.size(); ++i) { - const auto file_path = files[i]; - scope.spawn([file_path, index_dir, checkpoint_size, - force_rebuild, built_ptr, skipped_ptr]( - CoroScope&) -> coro::CoroTask { - auto config = - IndexBuildConfig::for_file(file_path) - .with_index_dir(index_dir) - .with_checkpoint_size(checkpoint_size) - .with_force_rebuild(force_rebuild) - .with_manifest(true) - .with_index_threshold(0); - - IndexBuilderUtility builder; - auto result = co_await builder.process(config); - - if (result.was_skipped) { - (*skipped_ptr)++; - } else if (result.success) { - (*built_ptr)++; - } else { - DFTRACER_UTILS_LOG_ERROR( - "IDX build failed for %s: %s", - file_path.c_str(), - result.error_message.c_str()); - } - co_return; - }); - } - co_return; - }); + OrganizeResult result; + + auto pipeline_config = + cli::build_pipeline_config("Organize: Streaming", cli->pipeline); + + Pipeline pipeline(pipeline_config); + + auto* cli_ptr = cli; + auto* groups_ptr = &groups; + auto* result_ptr = &result; + + auto organize_task = make_task( + [cli_ptr, groups_ptr, result_ptr, output_dir, index_dir, + checkpoint_size, force_rebuild, no_compress, compression_level, + executor_threads, chunk_size_mb, + stages](CoroScope& ctx) -> coro::CoroTask { + // Phase 1: Scan & Partition + DFTRACER_UTILS_LOG_INFO("%s", "Phase 1 begin: scan & partition"); + std::printf("Phase 1: Scanning and 
partitioning files...\n"); + + IndexResolverUtility resolver; + ResolverInput resolver_input; + ResolverResult resolve_result; + { + ScopedTimer _t(stages, "phase1_scan_partition"); + resolver_input.directory = cli_ptr->directory.value; + resolver_input.files = cli_ptr->files_args.value; + resolver_input.index_dir = index_dir; + resolver_input.require_manifest = true; + + resolve_result = co_await resolver.process(resolver_input); + } + + if (resolve_result.all_files.empty()) { + DFTRACER_UTILS_LOG_ERROR( + "%s", "No input files. Use --files or --directory."); co_return; - }, - "BuildIDX"); + } - pipeline.set_source(build_task); - pipeline.set_destination(build_task); - pipeline.execute(); + std::printf(" Total files: %zu\n", + resolve_result.all_files.size()); + std::printf(" Files needing index: %zu\n", + resolve_result.needs_checkpoint.size() + + resolve_result.needs_manifest.size()); + std::printf(" Already indexed: %zu\n", + resolve_result.cached.size()); + DFTRACER_UTILS_LOG_INFO( + "Phase 1 complete: %zu files (%zu need index, %zu cached)", + resolve_result.all_files.size(), + resolve_result.needs_checkpoint.size() + + resolve_result.needs_manifest.size(), + resolve_result.cached.size()); - std::printf(" Built: %zu, Skipped: %zu\n", built_count.load(), - skipped_count.load()); - } + // Build file index map for source_file_idx lookup + std::unordered_map file_index_map; + for (std::size_t i = 0; i < resolve_result.all_files.size(); ++i) { + file_index_map[resolve_result.all_files[i]] = i; + } - // Step 2: Build extraction plan - std::printf("Step 2: Building extraction plan...\n"); - ReorganizationPlannerUtility planner; - ReorganizationPlannerInput planner_input; - planner_input.source_files = files; - planner_input.groups = groups; - planner_input.index_dir = index_dir; - planner_input.checkpoint_size = checkpoint_size; - - ExtractionPlan plan; - try { - plan = co_await planner.process(planner_input); - } catch (const std::exception& e) { - 
DFTRACER_UTILS_LOG_ERROR("Planning failed: %s", e.what()); - co_return 1; - } + // Build source file info for provenance tracking + std::vector source_files; + source_files.reserve(resolve_result.all_files.size()); + for (std::size_t i = 0; i < resolve_result.all_files.size(); ++i) { + source_files.push_back(SourceFileInfo{ + .file_path = resolve_result.all_files[i], + .index_path = resolve_result.index_path, + .num_checkpoints = 0, + }); + } - std::printf(" Groups: %zu\n", plan.groups.size()); - std::printf(" Source files: %zu\n", plan.source_files.size()); - std::printf(" Extraction tasks: %zu\n", plan.tasks.size()); - std::printf(" Total events: %zu\n", plan.total_events); + // Phase 2: Setup channels and writers + DFTRACER_UTILS_LOG_INFO("%s", + "Phase 2 begin: setup streaming pipeline"); + std::printf("Phase 2: Starting streaming pipeline...\n"); - if (plan.tasks.empty()) { - std::printf("No events to extract.\n"); - co_return 0; - } + std::vector< + std::shared_ptr>>> + group_channels; + std::atomic total_events_written{0}; + std::atomic chunks_created{0}; + std::vector all_output_files; + std::mutex output_files_mutex; + // Each group writer writes distinct chunk paths, so concurrent + // inserts into chunk_layouts are key-disjoint and safe under + // output_files_mutex (already taken when appending output_files). + // Stored on result_ptr so Phase 4 (separate task) can read it. 
- // Step 3: Route events (parallel) - std::printf("Step 3: Routing events...\n"); - EventRouterResult router_result; - { - auto pipeline_config = PipelineConfig() - .with_name("Organize: Route Events") - .with_compute_threads(executor_threads) - .with_watchdog(false); - - Pipeline pipeline(pipeline_config); - - EventRouterConfig router_config; - router_config.plan = std::move(plan); - router_config.output_dir = output_dir; - router_config.index_dir = index_dir; - router_config.chunk_size_bytes = chunk_size_mb * 1024 * 1024; - router_config.checkpoint_size = checkpoint_size; - router_config.executor_threads = executor_threads; - router_config.compress = !no_compress; - - auto* router_config_ptr = &router_config; - auto* router_result_ptr = &router_result; - - auto route_task = make_task( - [router_config_ptr, - router_result_ptr](CoroScope& scope) -> coro::CoroTask { - *router_result_ptr = - co_await route_events(scope, *router_config_ptr); - }, - "RouteEvents"); - - pipeline.set_source(route_task); - pipeline.set_destination(route_task); - pipeline.execute(); - } + { + ScopedTimer _t(stages, "phase2_setup_channels"); + group_channels.reserve(groups_ptr->size()); + + for (std::size_t i = 0; i < groups_ptr->size(); ++i) { + group_channels.push_back( + std::make_shared< + coro::Channel>>( + executor_threads * 4)); + } + } - std::printf(" Events written: %zu\n", router_result.total_events_written); - std::printf(" Chunks created: %zu\n", router_result.chunks_created); - std::printf(" Source files processed: %zu\n", - router_result.source_files_processed); + auto* total_events_ptr = &total_events_written; + auto* chunks_ptr = &chunks_created; + auto* output_files_ptr = &all_output_files; + auto* output_mutex_ptr = &output_files_mutex; + auto* chunk_layouts_ptr = &result_ptr->chunk_layouts; + const auto* source_files_ptr = &source_files; - // Step 4: Build `.dftindex` stores for output chunk files. 
- if (!router_result.output_files.empty()) { - std::printf("Step 4: Building .dftindex stores...\n"); - auto pipeline_config = PipelineConfig() - .with_name("Organize: Build Index Stores") - .with_compute_threads(executor_threads) - .with_watchdog(false); + DFTRACER_UTILS_LOG_INFO("%s", "Phase 2 complete"); + DFTRACER_UTILS_LOG_INFO("%s", + "Phase 3 begin: producers + group writers"); + std::vector> group_runtimes; + group_runtimes.reserve(groups_ptr->size()); + for (const auto& group : *groups_ptr) { + auto rt = std::make_unique(); + rt->name = group.name; + const std::string group_output_dir = + output_dir + "/" + group.name; + fs::create_directories(group_output_dir); + rt->group_index_dir = + dft::internal::determine_index_path(group_output_dir); + fs::create_directories(rt->group_index_dir); + rt->staging_root = + (fs::path(rt->group_index_dir) / ".dftindex_staging") + .string(); + fs::create_directories(rt->staging_root); + rt->artifacts_queue = + std::make_shared>(); + rt->batch_counter = + std::make_shared>(0); + group_runtimes.push_back(std::move(rt)); + } - Pipeline pipeline(pipeline_config); + for (std::size_t g = 0; g < groups_ptr->size(); ++g) { + const auto& group = (*groups_ptr)[g]; + auto channel = group_channels[g]; + GroupRuntime* runtime = group_runtimes[g].get(); - auto* output_files_ptr = &router_result.output_files; + GroupWriterConfig writer_config; + writer_config.group_name = group.name; + writer_config.group_query = group.query; + writer_config.output_dir = output_dir; + writer_config.chunk_size_bytes = chunk_size_mb * 1024 * 1024; + writer_config.compress = !no_compress; + writer_config.compression_level = compression_level; + writer_config.input_channel = channel; + writer_config.source_files = source_files_ptr; + writer_config.build_output_index = true; + writer_config.index_dir = runtime->group_index_dir; + writer_config.staging_root = runtime->staging_root; + writer_config.artifacts_queue = runtime->artifacts_queue; + 
writer_config.batch_counter = runtime->batch_counter; + writer_config.with_aggregation = cli_ptr->with_aggregation; + writer_config.agg_time_interval_us = + cli_ptr->time_interval_ms * 1000.0; + writer_config.bloom_dimensions = std::vector( + indexer::DEFAULT_BLOOM_DIMENSIONS.begin(), + indexer::DEFAULT_BLOOM_DIMENSIONS.end()); + writer_config.bloom_config.build_manifest = true; - auto index_store_task = make_task( - [output_files_ptr, output_dir, - checkpoint_size](CoroScope& ctx) -> coro::CoroTask { + ctx.spawn( + [writer_config, total_events_ptr, chunks_ptr, + output_files_ptr, output_mutex_ptr, chunk_layouts_ptr, + runtime](CoroScope& inner_scope) -> coro::CoroTask { + co_await run_group_writer_task( + &inner_scope, writer_config, total_events_ptr, + chunks_ptr, output_files_ptr, output_mutex_ptr, + chunk_layouts_ptr, runtime); + }); + } + + // Phase 3b & 3c: Run producers in a nested scope + // When this scope completes, all producers have finished + const auto total_source_files = resolve_result.all_files.size(); + const std::size_t memory_budget = compute_memory_budget( + cli_ptr->memory_budget_mb * 1024ULL * 1024ULL); + const std::size_t per_file_bytes = estimate_per_file_bytes( + resolve_result.all_file_sizes, + cli_ptr->estimated_file_bytes_mb * 1024ULL * 1024ULL); + const std::size_t phase3_flush_every = + compute_file_batch_size(memory_budget, per_file_bytes, 4); + { + ScopedTimer _t(stages, "phase3_producers"); + auto producer_input = std::make_shared(); + producer_input->resolver_result = std::move(resolve_result); + producer_input->files_needing_index = + std::move(producer_input->resolver_result.needs_checkpoint); + for (auto& item : + producer_input->resolver_result.needs_manifest) { + producer_input->files_needing_index.push_back( + std::move(item)); + } + producer_input->manifest_entries = + std::move(producer_input->resolver_result.cached); + producer_input->checkpoint_size = checkpoint_size; + producer_input->executor_threads = 
executor_threads; + producer_input->force_rebuild = force_rebuild; + producer_input->flush_every_files = phase3_flush_every; + std::printf( + " Memory budget: %.2f GB; per-file peak estimate: %.2f " + "GB; flush_every: %zu files\n", + memory_budget / (1024.0 * 1024.0 * 1024.0), + per_file_bytes / (1024.0 * 1024.0 * 1024.0), + phase3_flush_every); + producer_input->groups = *groups_ptr; + producer_input->group_channels = group_channels; + producer_input->file_index_map = std::move(file_index_map); + + // GCC 12 coroutine bug: capturing shared_ptr by value in + // coroutine lambdas corrupts refcount. Capture raw pointer + // instead - lifetime is guaranteed by outer shared_ptr. + auto* producer_input_raw = producer_input.get(); co_await ctx.scope( - [&](CoroScope& scope) -> coro::CoroTask { - for (const auto& out_file : *output_files_ptr) { - scope.spawn([out_file, checkpoint_size](CoroScope&) - -> coro::CoroTask { - auto config = - IndexBuildConfig::for_file(out_file) - .with_index_dir("") - .with_checkpoint_size(checkpoint_size) - .with_force_rebuild(true) - .with_manifest(true) - .with_index_threshold(0); - - IndexBuilderUtility builder; - co_await builder.process(config); - co_return; - }); - } - co_return; + [producer_input_raw]( + CoroScope& producer_scope) -> coro::CoroTask { + co_await run_producer_scope(&producer_scope, + producer_input_raw); }); - co_return; - }, - "BuildSidecars"); + } + + // Producers done - close channels to signal EOF to writers + for (auto& channel : group_channels) { + channel->close(); + } + DFTRACER_UTILS_LOG_INFO("%s", + "Phase 3 producers complete; waiting for " + "group writers to drain"); + + // Wait for all writers to complete + co_await ctx.join_all(); + + for (auto& rt : group_runtimes) { + if (!rt->indexed_inline.load(std::memory_order_acquire)) { + continue; + } + SstArtifactRegistry registry; + IndexDatabaseSstWriterContext::Artifacts a; + while (rt->artifacts_queue->try_dequeue(a)) { + registry.append(std::move(a)); + } + 
IndexDatabase ingest_db(rt->group_index_dir); + ingest_db.bulk_ingest(registry, {}); + std::error_code ec; + fs::remove_all(rt->staging_root, ec); + result_ptr->inline_indexed_groups.insert(rt->name); + } + + result_ptr->total_events_written = total_events_written.load(); + result_ptr->chunks_created = chunks_created.load(); + result_ptr->source_files_processed = total_source_files; + result_ptr->output_files = std::move(all_output_files); + + result_ptr->success = true; + DFTRACER_UTILS_LOG_INFO( + "Phase 3 complete: %zu chunks created, %zu events written", + result_ptr->chunks_created, result_ptr->total_events_written); + }, + "OrganizeStreaming"); + + auto index_task = make_task( + [cli_ptr, groups_ptr, result_ptr, output_dir, checkpoint_size, + executor_threads](CoroScope& ctx) -> coro::CoroTask { + if (!result_ptr->success) co_return; + + AggregationConfig agg_config; + const AggregationConfig* agg_ptr = nullptr; + if (cli_ptr->with_aggregation) { + agg_config.time_interval_us = static_cast( + cli_ptr->time_interval_ms * 1000.0); + agg_config.compute_statistics = true; + agg_config.track_process_parents = true; + agg_config.track_default_args = true; + agg_ptr = &agg_config; + DFTRACER_UTILS_LOG_INFO( + "Phase 4 begin: indexes + aggregation " + "(time_interval=%.2f ms)", + cli_ptr->time_interval_ms); + std::printf( + "Phase 4: Building indexes + aggregation " + "(time_interval=%.2f ms) ...\n", + cli_ptr->time_interval_ms); + } else { + DFTRACER_UTILS_LOG_INFO("%s", "Phase 4 begin: indexes"); + std::printf("Phase 4: Building indexes ...\n"); + } + + const std::size_t phase4_memory_budget = compute_memory_budget( + cli_ptr->memory_budget_mb * 1024ULL * 1024ULL); + const std::size_t override_per_file_bytes = + cli_ptr->estimated_file_bytes_mb * 1024ULL * 1024ULL; + + filesystem::PatternDirectoryScannerUtility chunk_scanner; + for (const auto& group : *groups_ptr) { + const std::string group_dir = output_dir + "/" + group.name; + if (!fs::exists(group_dir)) 
continue; + if (result_ptr->inline_indexed_groups.count(group.name)) { + DFTRACER_UTILS_LOG_INFO( + "Phase 4: skipping group '%s' (indexed inline)", + group.name.c_str()); + continue; + } + + filesystem::PatternDirectoryScannerUtilityInput scan_input{ + group_dir, {".pfw", ".pfw.gz"}, false}; + auto entries = co_await chunk_scanner.process(scan_input); + if (entries.empty()) continue; + + std::sort(entries.begin(), entries.end(), + [](const filesystem::FileEntry& a, + const filesystem::FileEntry& b) { + return a.path.string() < b.path.string(); + }); + + std::vector chunk_files; + std::vector chunk_sizes; + chunk_files.reserve(entries.size()); + chunk_sizes.reserve(entries.size()); + for (const auto& e : entries) { + chunk_files.push_back(e.path.string()); + chunk_sizes.push_back(e.size); + } + + const std::size_t per_file_bytes = estimate_per_file_bytes( + chunk_sizes, override_per_file_bytes); + const std::size_t flush_every = compute_file_batch_size( + phase4_memory_budget, per_file_bytes, 4); + std::printf( + " %s: %zu chunks; per-file peak: %.2f GB; " + "flush_every: %zu chunks\n", + group.name.c_str(), chunk_files.size(), + per_file_bytes / (1024.0 * 1024.0 * 1024.0), flush_every); + DFTRACER_UTILS_LOG_INFO( + "Phase 4: indexing group '%s' (%zu chunks)", + group.name.c_str(), chunk_files.size()); + co_await run_group_indexing( + &ctx, group_dir, chunk_files, agg_ptr, checkpoint_size, + executor_threads, flush_every, &result_ptr->chunk_layouts); + DFTRACER_UTILS_LOG_INFO("Phase 4: group '%s' complete", + group.name.c_str()); + } + DFTRACER_UTILS_LOG_INFO("%s", "Phase 4 complete"); + }, + "OrganizeIndexing"); + + index_task->depends_on(organize_task); + + pipeline.set_source(organize_task); + pipeline.set_destination(index_task); + pipeline.execute(); - pipeline.set_source(index_store_task); - pipeline.set_destination(index_store_task); - pipeline.execute(); + if (result.success) { + const std::string manifest_path = output_dir + "/manifest.json"; + 
std::ofstream manifest_out(manifest_path); + if (manifest_out.is_open()) { + auto escape = [](const std::string& s) { + std::string out; + out.reserve(s.size()); + for (char c : s) { + switch (c) { + case '"': + out += "\\\""; + break; + case '\\': + out += "\\\\"; + break; + case '\n': + out += "\\n"; + break; + case '\r': + out += "\\r"; + break; + case '\t': + out += "\\t"; + break; + default: + out += c; + break; + } + } + return out; + }; + manifest_out << "{\n"; + manifest_out << " \"version\": 1,\n"; + manifest_out << " \"tool\": \"dftracer_organize\",\n"; + manifest_out << " \"groups\": {\n"; + for (std::size_t i = 0; i < groups.size(); ++i) { + manifest_out << " \"" << escape(groups[i].name) << "\": \"" + << escape(groups[i].name) << "\""; + if (i + 1 < groups.size()) manifest_out << ","; + manifest_out << "\n"; + } + manifest_out << " },\n"; + manifest_out << " \"group_queries\": {\n"; + for (std::size_t i = 0; i < groups.size(); ++i) { + manifest_out << " \"" << escape(groups[i].name) << "\": \"" + << escape(groups[i].query) << "\""; + if (i + 1 < groups.size()) manifest_out << ","; + manifest_out << "\n"; + } + manifest_out << " }\n"; + manifest_out << "}\n"; + } else { + DFTRACER_UTILS_LOG_WARN("Failed to write manifest at %s", + manifest_path.c_str()); + } } - auto end_time = std::chrono::high_resolution_clock::now(); - std::chrono::duration duration = end_time - start_time; + overall.stop(); + double duration_ms = static_cast(overall.elapsed()) / 1e6; std::printf("\n==========================================\n"); std::printf("Reorganization Complete\n"); std::printf("==========================================\n"); - std::printf(" Time: %.2f seconds\n", duration.count() / 1000.0); - std::printf(" Input files: %zu\n", files.size()); - std::printf(" Events routed: %zu\n", router_result.total_events_written); - std::printf(" Chunks created: %zu\n", router_result.chunks_created); + std::printf(" Time: %.2f seconds\n", duration_ms / 1000.0); + 
std::printf(" Input files: %zu\n", result.source_files_processed); + std::printf(" Events routed: %zu\n", result.total_events_written); + std::printf(" Chunks created: %zu\n", result.chunks_created); + if (result.success) { + std::printf(" Manifest: %s/manifest.json\n", output_dir.c_str()); + } std::printf(" Output files:\n"); - for (const auto& f : router_result.output_files) { + for (const auto& f : result.output_files) { if (fs::exists(f)) { std::printf( " %s (%.2f MB)\n", f.c_str(), static_cast(fs::file_size(f)) / (1024.0 * 1024.0)); } } + if (stages) { + std::printf("\n Stage Timing:\n"); + stages->print_stages(" "); + } std::printf("==========================================\n"); - co_return router_result.success ? 0 : 1; + co_return result.success ? 0 : 1; } } // namespace @@ -263,106 +1067,9 @@ int main(int argc, char** argv) { "Reorganize DFTracer trace files by routing events to " "predicate-based groups with chunked output."); - program.add_argument("--files") - .help("Input trace files (.pfw, .pfw.gz)") - .nargs(argparse::nargs_pattern::any) - .default_value>({}); - - program.add_argument("-d", "--directory") - .help("Directory containing trace files") - .default_value(""); - - program.add_argument("-o", "--output").help("Output directory").required(); - - program.add_argument("--groups") - .help( - "Predicate groups: \"io:cat==\\\"POSIX\\\"\" " - "\"compute:cat==\\\"APP\\\"\"") - .nargs(argparse::nargs_pattern::at_least_one) - .required(); - - program.add_argument("--chunk-size") - .help("Target chunk size in MB (default: 256)") - .scan<'d', std::size_t>() - .default_value(static_cast(256)); - - program.add_argument("--checkpoint-size") - .help("Checkpoint size for indexing in bytes") - .scan<'d', std::size_t>() - .default_value(static_cast( - indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE)); - - program.add_argument("--index-dir") - .help("Directory for .dftindex stores") - .default_value(""); - - program.add_argument("-f", "--force") - 
.help("Force rebuild of indices") - .flag(); - - program.add_argument("--no-compress") - .help("Write plain .pfw instead of .pfw.gz") - .flag(); - - program.add_argument("--executor-threads") - .help("Worker threads") - .scan<'d', std::size_t>() - .default_value( - static_cast(dftracer_utils_hardware_concurrency())); - - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - DFTRACER_UTILS_LOG_ERROR("Error: %s", err.what()); - std::cerr << program; - return 1; - } - - std::string directory = program.get("--directory"); - std::string output_dir = program.get("--output"); - std::string index_dir = program.get("--index-dir"); - auto group_specs = program.get>("--groups"); - std::size_t checkpoint_size = program.get("--checkpoint-size"); - std::size_t chunk_size_mb = program.get("--chunk-size"); - bool force_rebuild = program.get("--force"); - bool no_compress = program.get("--no-compress"); - std::size_t executor_threads = - program.get("--executor-threads"); - - fs::create_directories(output_dir); - - auto groups = parse_group_specs(group_specs); - if (groups.empty()) { - DFTRACER_UTILS_LOG_ERROR("%s", "No groups specified."); - return 1; - } - - std::vector files; - if (!directory.empty()) { - if (!fs::exists(directory)) { - DFTRACER_UTILS_LOG_ERROR("Directory does not exist: %s", - directory.c_str()); - return 1; - } - filesystem::PatternDirectoryScannerUtility scanner; - filesystem::PatternDirectoryScannerUtilityInput scan_input{ - directory, {".pfw", ".pfw.gz"}, false}; - auto matched = scanner.process(scan_input).get(); - for (const auto& entry : matched) { - files.push_back(entry.path.string()); - } - } else { - files = program.get>("--files"); - } - - if (files.empty()) { - DFTRACER_UTILS_LOG_ERROR("%s", - "No input files. 
Use --files or --directory."); - return 1; - } + OrganizeArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; - return run_organize(output_dir, index_dir, files, groups, checkpoint_size, - force_rebuild, no_compress, executor_threads, - chunk_size_mb) - .get(); + return run_organize(&cli).get(); } diff --git a/src/dftracer/utils/binaries/dftracer_pgzip.cpp b/src/dftracer/utils/binaries/dftracer_pgzip.cpp index c84d7160..ac5a1791 100644 --- a/src/dftracer/utils/binaries/dftracer_pgzip.cpp +++ b/src/dftracer/utils/binaries/dftracer_pgzip.cpp @@ -1,16 +1,12 @@ #include -#include #include -#include #include #include #include -#include #include #include #include -#include #include #include #include @@ -18,8 +14,52 @@ #include #include +#include "common_cli.h" + using namespace dftracer::utils; +class PgzipArgParse : public cli::ArgParse { + public: + cli::DirectoryArgs directory{cli::DirMode::DEFAULT_DOT, + "Directory containing .pfw files"}; + cli::PipelineArgs pipeline; + cli::WatchdogArgs watchdog; + + bool verbose = false; + int compression_level = Z_DEFAULT_COMPRESSION; + std::size_t chunk_size = 4 * 1024 * 1024; + + explicit PgzipArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + schema(directory, pipeline, watchdog); + } + + protected: + void register_args() override { + parser() + .add_argument("-v", "--verbose") + .help("Enable verbose output") + .flag(); + + parser() + .add_argument("-l", "--compression-level") + .help("Compression level (0-9, default: Z_DEFAULT_COMPRESSION)") + .scan<'d', int>() + .default_value(Z_DEFAULT_COMPRESSION); + + parser() + .add_argument("--chunk-size") + .help("Chunk size in bytes for parallel compression (default: 4MB)") + .scan<'d', std::size_t>() + .default_value(static_cast(4 * 1024 * 1024)); + } + + void post_parse() override { + verbose = parser().get("--verbose"); + compression_level = parser().get("--compression-level"); + chunk_size = parser().get("--chunk-size"); + } +}; + namespace { struct 
FileResult { @@ -41,7 +81,6 @@ struct CompressedChunk { std::string data; }; -// Top-level coroutine: reads file and sends chunks to channel. static coro::CoroTask chunk_reader( coro::ChannelProducer producer, const std::string* file_path, std::size_t chunk_size) { @@ -67,9 +106,8 @@ static coro::CoroTask chunk_reader( co_return; } -// Top-level coroutine: compresses chunks and sends to output channel. static coro::CoroTask chunk_compressor( - std::shared_ptr> input_chan, + coro::ChannelConsumer input_chan, coro::ChannelProducer out_producer, int compression_level) { auto guard = out_producer.guard(); @@ -83,7 +121,7 @@ static coro::CoroTask chunk_compressor( std::string out_buf(64 * 1024, '\0'); - while (auto work = co_await input_chan->receive()) { + while (auto work = co_await input_chan.receive()) { std::string compressed; compressed.reserve(work->data.size()); @@ -117,9 +155,8 @@ static coro::CoroTask chunk_compressor( co_return; } -// Top-level coroutine: receives compressed chunks and writes in order. static coro::CoroTask chunk_writer( - std::shared_ptr> output_chan, + coro::ChannelConsumer output_chan, const std::string* output_path) { std::ofstream ofs(*output_path, std::ios::binary); if (!ofs.is_open()) co_return; @@ -127,7 +164,7 @@ static coro::CoroTask chunk_writer( std::size_t next_expected = 0; std::map pending; - while (auto chunk = co_await output_chan->receive()) { + while (auto chunk = co_await output_chan.receive()) { if (chunk->index == next_expected) { ofs.write(chunk->data.data(), static_cast(chunk->data.size())); @@ -150,7 +187,6 @@ static coro::CoroTask chunk_writer( co_return; } -// Compress a single file using parallel chunk compression. 
static coro::CoroTask compress_file_parallel( CoroScope& ctx, const std::string& file_path, int compression_level, std::size_t num_workers, std::size_t chunk_size) { @@ -182,27 +218,29 @@ static coro::CoroTask compress_file_parallel( const auto* file_path_ptr = &file_path; const auto* output_path_ptr = &result.output_path; - co_await ctx.scope([input_chan, output_chan, file_path_ptr, + co_await ctx.scope([&input_chan, &output_chan, file_path_ptr, output_path_ptr, compression_level, num_workers, chunk_size]( CoroScope& scope) -> coro::CoroTask { - scope.spawn([input_chan, file_path_ptr, - chunk_size](CoroScope&) -> coro::CoroTask { - co_await chunk_reader(input_chan->producer(), file_path_ptr, - chunk_size); + scope.spawn([ch = input_chan->producer(), file_path_ptr, + chunk_size]( + CoroScope&) mutable -> coro::CoroTask { + co_await chunk_reader(std::move(ch), file_path_ptr, chunk_size); }); for (std::size_t w = 0; w < num_workers; ++w) { - scope.spawn([input_chan, output_chan, compression_level]( - CoroScope&) -> coro::CoroTask { - co_await chunk_compressor( - input_chan, output_chan->producer(), compression_level); + scope.spawn([in_ch = input_chan->consumer(), + out_ch = output_chan->producer(), + compression_level]( + CoroScope&) mutable -> coro::CoroTask { + co_await chunk_compressor(in_ch, std::move(out_ch), + compression_level); }); } - scope.spawn([output_chan, + scope.spawn([ch = output_chan->consumer(), output_path_ptr](CoroScope&) -> coro::CoroTask { - co_await chunk_writer(output_chan, output_path_ptr); + co_await chunk_writer(ch, output_path_ptr); }); co_return; @@ -230,85 +268,12 @@ static coro::CoroTask compress_file_parallel( } // namespace -int main(int argc, char** argv) { - DFTRACER_UTILS_LOGGER_INIT(); - - argparse::ArgumentParser program("dftracer_pgzip", - DFTRACER_UTILS_PACKAGE_VERSION); - program.add_description( - "Parallel gzip compression for DFTracer .pfw files. 
" - "Splits each file into chunks and compresses them in parallel " - "as independent gzip members."); - - program.add_argument("-d", "--directory") - .help("Directory containing .pfw files") - .default_value("."); - - program.add_argument("-v", "--verbose") - .help("Enable verbose output") - .flag(); - - program.add_argument("--executor-threads") - .help("Number of worker threads (default: number of CPU cores)") - .scan<'d', std::size_t>() - .default_value( - static_cast(dftracer_utils_hardware_concurrency())); - - program.add_argument("-l", "--compression-level") - .help("Compression level (0-9, default: Z_DEFAULT_COMPRESSION)") - .scan<'d', int>() - .default_value(Z_DEFAULT_COMPRESSION); - - program.add_argument("--chunk-size") - .help("Chunk size in bytes for parallel compression (default: 4MB)") - .scan<'d', std::size_t>() - .default_value(static_cast(4 * 1024 * 1024)); - - program.add_argument("--disable-watchdog") - .help("Disable watchdog for hang detection") - .flag(); - - program.add_argument("--watchdog-global-timeout") - .help("Watchdog global timeout in seconds (0 = no timeout)") - .scan<'d', int>() - .default_value(0); - - program.add_argument("--watchdog-task-timeout") - .help("Watchdog default task timeout in seconds (0 = no timeout)") - .scan<'d', int>() - .default_value(0); - - program.add_argument("--watchdog-idle-timeout") - .help("Watchdog idle timeout in seconds (0 = use default)") - .scan<'d', int>() - .default_value(300); - - program.add_argument("--watchdog-deadlock-timeout") - .help("Watchdog deadlock timeout in seconds (0 = use default)") - .scan<'d', int>() - .default_value(600); - - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - DFTRACER_UTILS_LOG_ERROR("Error occurred: %s", err.what()); - std::cerr << program << std::endl; - return 1; - } - - std::string input_dir = program.get("--directory"); - bool verbose = program.get("--verbose"); - std::size_t executor_threads = - 
program.get("--executor-threads"); - int compression_level = program.get("--compression-level"); - std::size_t chunk_size = program.get("--chunk-size"); - bool disable_watchdog = program.get("--disable-watchdog"); - int global_timeout = program.get("--watchdog-global-timeout"); - int task_timeout = program.get("--watchdog-task-timeout"); - int idle_timeout = program.get("--watchdog-idle-timeout"); - int deadlock_timeout = program.get("--watchdog-deadlock-timeout"); - - input_dir = fs::absolute(input_dir).string(); +static int run_pgzip(const PgzipArgParse& cli) { + const auto input_dir = fs::absolute(cli.directory.value).string(); + const auto verbose = cli.verbose; + const auto executor_threads = cli.pipeline.executor_threads; + const auto compression_level = cli.compression_level; + const auto chunk_size = cli.chunk_size; std::vector input_files; for (const auto& entry : fs::directory_iterator(input_dir)) { @@ -338,17 +303,8 @@ int main(int argc, char** argv) { auto start_time = std::chrono::high_resolution_clock::now(); - auto pipeline_config = - PipelineConfig() - .with_name("DFTracer Parallel Gzip") - .with_compute_threads(executor_threads) - .with_watchdog(!disable_watchdog) - .with_global_timeout(std::chrono::seconds(global_timeout)) - .with_task_timeout(std::chrono::seconds(task_timeout)) - .with_executor_idle_timeout(std::chrono::seconds(idle_timeout)) - .with_executor_deadlock_timeout( - std::chrono::seconds(deadlock_timeout)); - + auto pipeline_config = cli::build_pipeline_config( + "DFTracer Parallel Gzip", cli.pipeline, cli.watchdog); Pipeline pipeline(pipeline_config); std::vector results; @@ -364,7 +320,7 @@ int main(int argc, char** argv) { auto file_chan = coro::make_channel(executor_threads * 2); - co_await ctx.scope([file_chan, files_ptr, results_ptr, mutex_ptr, + co_await ctx.scope([&file_chan, files_ptr, results_ptr, mutex_ptr, compression_level, executor_threads, chunk_size, verbose]( CoroScope& scope) -> coro::CoroTask { @@ -379,11 +335,11 @@ 
int main(int argc, char** argv) { }); for (std::size_t w = 0; w < executor_threads; ++w) { - scope.spawn([file_chan, files_ptr, results_ptr, mutex_ptr, - compression_level, executor_threads, - chunk_size, verbose]( + scope.spawn([ch = file_chan->consumer(), files_ptr, + results_ptr, mutex_ptr, compression_level, + executor_threads, chunk_size, verbose]( CoroScope& wctx) -> coro::CoroTask { - while (auto fi_opt = co_await file_chan->receive()) { + while (auto fi_opt = co_await ch.receive()) { const auto& path = (*files_ptr)[*fi_opt]; auto result = co_await compress_file_parallel( @@ -473,3 +429,20 @@ int main(int argc, char** argv) { return successful == input_files.size() ? 0 : 1; } + +int main(int argc, char** argv) { + DFTRACER_UTILS_LOGGER_INIT(); + + argparse::ArgumentParser program("dftracer_pgzip", + DFTRACER_UTILS_PACKAGE_VERSION); + program.add_description( + "Parallel gzip compression for DFTracer .pfw files. " + "Splits each file into chunks and compresses them in parallel " + "as independent gzip members."); + + PgzipArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; + + return run_pgzip(cli); +} diff --git a/src/dftracer/utils/binaries/dftracer_reconstruct.cpp b/src/dftracer/utils/binaries/dftracer_reconstruct.cpp index f40d0db1..fede952f 100644 --- a/src/dftracer/utils/binaries/dftracer_reconstruct.cpp +++ b/src/dftracer/utils/binaries/dftracer_reconstruct.cpp @@ -1,330 +1,95 @@ #include -#include #include -#include -#include -#include #include -#include #include -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include -#include -#include #include #include -#include #include -#include -#include + +#include "common_cli.h" using namespace dftracer::utils; -using namespace dftracer::utils::utilities; -using namespace dftracer::utils::utilities::composites; -using namespace dftracer::utils::utilities::composites::dft; using namespace 
dftracer::utils::utilities::composites::dft::reorganize; -using dftracer::utils::utilities::fileio::ChunkWriter; -using dftracer::utils::utilities::fileio::ChunkWriterConfig; -namespace { +class ReconstructArgParse : public cli::ArgParse { + public: + cli::DirectoryArgs directory{cli::DirMode::REQUIRED, + "Directory containing reorganized files"}; + cli::PipelineArgs pipeline; -struct SegmentInterval { - int line_start; - int line_end; - std::string original_path; - int source_checkpoint; -}; + std::size_t checkpoint_size = 0; + std::string output_dir; + bool no_compress = false; -const SegmentInterval* find_segment( - const std::vector& intervals, int line_number) { - auto it = std::upper_bound( - intervals.begin(), intervals.end(), line_number, - [](int ln, const SegmentInterval& seg) { return ln < seg.line_start; }); - if (it != intervals.begin()) { - --it; - if (line_number >= it->line_start && line_number < it->line_end) { - return &(*it); - } + explicit ReconstructArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + schema(directory, pipeline); } - return nullptr; -} -std::string output_filename(const std::string& original_path) { - auto p = fs::path(original_path).filename().string(); - if (p.size() > 3 && p.substr(p.size() - 3) == ".gz") { - p = p.substr(0, p.size() - 3); + protected: + void register_args() override { + parser() + .add_argument("--checkpoint-size") + .help("Checkpoint size for gzip indexing in bytes (default: " + + std::to_string(constants::indexer::DEFAULT_CHECKPOINT_SIZE) + + ")") + .scan<'d', std::size_t>() + .default_value(static_cast( + constants::indexer::DEFAULT_CHECKPOINT_SIZE)); + + parser() + .add_argument("-o", "--output") + .help("Output directory") + .required(); + + parser() + .add_argument("--no-compress") + .help("Write plain .pfw instead of .pfw.gz") + .flag(); } - return p; -} -} // namespace + void post_parse() override { + checkpoint_size = parser().get("--checkpoint-size"); + output_dir = parser().get("--output"); + 
no_compress = parser().get("--no-compress"); + } +}; -static coro::CoroTask run_reconstruct(const std::string& directory, - const std::string& output_dir, - std::size_t checkpoint_size, - bool no_compress, - std::size_t executor_threads) { +static coro::CoroTask run_reconstruct(const ReconstructArgParse* cli, + CoroScope& scope) { std::printf("==========================================\n"); std::printf("DFTracer Trace Reconstructor\n"); std::printf("==========================================\n"); - - std::vector reorg_files; - if (fs::exists(directory)) { - filesystem::PatternDirectoryScannerUtility scanner; - filesystem::PatternDirectoryScannerUtilityInput scan_input{ - directory, {".pfw", ".pfw.gz"}, true}; - auto matched = co_await scanner.process(scan_input); - for (const auto& entry : matched) { - reorg_files.push_back(entry.path.string()); - } - } - - if (reorg_files.empty()) { - DFTRACER_UTILS_LOG_ERROR("%s", "No reorganized files found."); - co_return 1; - } - - std::printf(" Input directory: %s\n", directory.c_str()); - std::printf(" Reorganized files: %zu\n", reorg_files.size()); - std::printf(" Output directory: %s\n", output_dir.c_str()); + std::printf(" Input directory: %s\n", cli->directory.value.c_str()); + std::printf(" Output directory: %s\n", cli->output_dir.c_str()); + std::printf(" Compress: %s\n", cli->no_compress ? 
"false" : "true"); + std::printf(" Executor threads: %zu\n", cli->pipeline.executor_threads); + std::printf("==========================================\n\n"); auto start_time = std::chrono::high_resolution_clock::now(); - std::printf("\nStep 1: Building reconstruction plan...\n"); - ReconstructionPlannerUtility planner; - ReconstructionPlannerInput planner_input; - planner_input.reorganized_files = reorg_files; - planner_input.index_dir = ""; + ReconstructorInput input; + input.input_dir = cli->directory.value; + input.output_dir = cli->output_dir; + input.checkpoint_size = cli->checkpoint_size; + input.parallelism = cli->pipeline.executor_threads; + input.compress = !cli->no_compress; - ReconstructionPlan plan; - try { - plan = co_await planner.process(planner_input); - } catch (const std::exception& e) { - DFTRACER_UTILS_LOG_ERROR("Planning failed: %s", e.what()); - co_return 1; - } - - if (plan.files.empty()) { - std::printf("No files with provenance found.\n"); - co_return 0; - } - - std::printf(" Original files to reconstruct: %zu\n", plan.files.size()); - std::printf(" Total segments: %zu\n", plan.total_segments); - std::printf(" Total events: %zu\n", plan.total_events); - - // Step 2: For each reorganized file, extract lines and write directly - // to output via ChunkWriter (streaming, no full buffering) - std::printf("\nStep 2: Extracting and writing...\n"); - - // Build per-reorg-file segment intervals - std::unordered_map> - per_reorg_segments; - for (const auto& [orig_path, recon] : plan.files) { - for (const auto& [ckpt, segs] : recon.checkpoint_segments) { - for (const auto& seg : segs) { - SegmentInterval si; - si.line_start = seg.output_line_start; - si.line_end = seg.output_line_end; - si.original_path = orig_path; - si.source_checkpoint = seg.source_checkpoint; - per_reorg_segments[seg.reorg_file].push_back(std::move(si)); - } - } - } - - for (auto& [file, segs] : per_reorg_segments) { - std::sort(segs.begin(), segs.end(), - [](const 
SegmentInterval& a, const SegmentInterval& b) { - return a.line_start < b.line_start; - }); - } - - std::unordered_map> writers; - std::unordered_map> - writer_mutexes; - for (const auto& [orig_path, recon] : plan.files) { - std::string fname = output_filename(orig_path); - std::string base = fname; - if (base.size() > 4 && base.substr(base.size() - 4) == ".pfw") { - base = base.substr(0, base.size() - 4); - } - - auto config = - ChunkWriterConfig() - .with_output_dir(output_dir) - .with_base_name(base) - .with_chunk_size(std::numeric_limits::max()) - .with_compression(!no_compress); - writers[orig_path] = std::make_unique(config); - writer_mutexes[orig_path] = std::make_unique(); - } + ReconstructorUtility reconstructor; + auto result = co_await scope.spawn(reconstructor, std::move(input)); - for (auto& [path, writer] : writers) { - co_await writer->open(); - } - - // Process reorganized files (parallel via pipeline) - { - auto pipeline_config = PipelineConfig() - .with_name("Reconstruct: Extract") - .with_compute_threads(executor_threads) - .with_watchdog(false); - Pipeline pipeline(pipeline_config); - - auto* per_reorg_ptr = &per_reorg_segments; - auto* writers_ptr = &writers; - auto* mutexes_ptr = &writer_mutexes; - - auto extract_task = make_task( - [per_reorg_ptr, writers_ptr, mutexes_ptr, checkpoint_size, - executor_threads](CoroScope& scope) -> coro::CoroTask { - auto permits = coro::make_channel(executor_threads * 2); - for (std::size_t i = 0; i < executor_threads * 2; ++i) { - permits->try_send(true); - } - - std::vector> futures; - - for (const auto& [reorg_file, intervals] : *per_reorg_ptr) { - auto* intervals_ptr = &intervals; - auto reorg_file_copy = reorg_file; - futures.push_back(scope.spawn([reorg_file_copy, - intervals_ptr, writers_ptr, - mutexes_ptr, checkpoint_size, - permits](CoroScope& s) - -> coro::CoroTask { - co_await s.receive(permits); - try { - std::string index_path = - internal::determine_index_path(reorg_file_copy, - ""); - - 
MetadataCollectorUtility meta_collector; - auto meta_input = - MetadataCollectorUtilityInput::from_file( - reorg_file_copy) - .with_index(index_path) - .with_checkpoint_size(checkpoint_size); - auto meta = - co_await meta_collector.process(meta_input); - - auto reader_input = - IndexedReadInput::from_file(reorg_file_copy) - .with_index(index_path) - .with_checkpoint_size(checkpoint_size); - IndexedFileReaderUtility reader_utility; - auto reader = - co_await reader_utility.process(reader_input); - - auto stream = reader->stream( - reader::internal::StreamConfig() - .stream_type(reader::internal::StreamType:: - MULTI_LINES_BYTES) - .range_type( - reader::internal::RangeType::BYTE_RANGE) - .buffer_size(4 * 1024 * 1024) - .from(0) - .to(meta.uncompressed_size)); - - struct PendingLine { - const char* data; - std::size_t len; - }; - std::unordered_map> - batch; - int event_number = 0; - - while (!stream->done()) { - auto chunk = co_await stream->read_async(); - if (chunk.empty()) break; - - const char* data = chunk.data(); - std::size_t bytes_read = chunk.size(); - std::size_t pos = 0; - - while (pos < bytes_read) { - const char* line_start = data + pos; - const char* newline = - static_cast( - std::memchr(line_start, '\n', - bytes_read - pos)); - if (!newline) break; - std::size_t line_len = - static_cast(newline - - line_start); - - if (line_len > 0 && line_start[0] == '{') { - const auto* seg = find_segment( - *intervals_ptr, event_number); - if (seg) { - batch[seg->original_path].push_back( - {line_start, line_len}); - } - event_number++; - } - - pos = static_cast(newline - - data) + - 1; - } - - for (auto& [orig, lines] : batch) { - if (lines.empty()) continue; - auto wit = writers_ptr->find(orig); - auto mit = mutexes_ptr->find(orig); - if (wit != writers_ptr->end() && - mit != mutexes_ptr->end()) { - co_await mit->second->lock(); - for (const auto& l : lines) { - co_await wit->second->write_line( - ByteView(l.data, l.len)); - } - mit->second->unlock(); - } - 
lines.clear(); - } - } - - permits->try_send(true); - } catch (...) { - permits->try_send(true); - throw; - } - })); - } - - for (auto& f : futures) { - co_await f; - } - }, - "ExtractLines"); - - pipeline.set_source(extract_task); - pipeline.set_destination(extract_task); - pipeline.execute(); + if (!result.success) { + DFTRACER_UTILS_LOG_ERROR("Reconstruction failed: %s", + result.error_message.c_str()); + co_return 1; } - // Close all writers - std::size_t files_written = 0; - for (auto& [path, writer] : writers) { - co_await writer->close(); - std::string fname = output_filename(path); - std::printf(" %s: %zu events\n", fname.c_str(), - writer->total_events_written()); - files_written++; + for (const auto& file : result.files) { + std::string fname = fs::path(file.output_path).filename().string(); + std::printf(" %s: %zu events\n", fname.c_str(), file.events_written); } auto end_time = std::chrono::high_resolution_clock::now(); @@ -334,7 +99,8 @@ static coro::CoroTask run_reconstruct(const std::string& directory, std::printf("Reconstruction Complete\n"); std::printf("==========================================\n"); std::printf(" Time: %.2f seconds\n", duration.count() / 1000.0); - std::printf(" Files reconstructed: %zu\n", files_written); + std::printf(" Files reconstructed: %zu\n", result.files.size()); + std::printf(" Total events: %zu\n", result.total_events); std::printf("==========================================\n"); co_return 0; @@ -348,46 +114,27 @@ int main(int argc, char** argv) { program.add_description( "Reconstruct original trace files from reorganized output."); - program.add_argument("-d", "--directory") - .help("Directory containing reorganized files") - .required(); - - program.add_argument("-o", "--output").help("Output directory").required(); + ReconstructArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; - program.add_argument("--checkpoint-size") - .help("Checkpoint size for indexing in bytes") - .scan<'d', 
std::size_t>() - .default_value(static_cast( - indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE)); + fs::create_directories(cli.output_dir); - program.add_argument("--no-compress") - .help("Write plain .pfw instead of .pfw.gz") - .flag(); - - program.add_argument("--executor-threads") - .help("Worker threads") - .scan<'d', std::size_t>() - .default_value( - static_cast(dftracer_utils_hardware_concurrency())); - - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - DFTRACER_UTILS_LOG_ERROR("Error: %s", err.what()); - std::cerr << program; - return 1; - } + auto pipeline_config = + cli::build_pipeline_config("Reconstruct", cli.pipeline); + Pipeline pipeline(pipeline_config); - std::string directory = program.get("--directory"); - std::string output_dir = program.get("--output"); - std::size_t checkpoint_size = program.get("--checkpoint-size"); - bool no_compress = program.get("--no-compress"); - std::size_t executor_threads = - program.get("--executor-threads"); + int exit_code = 0; + auto* cli_ptr = &cli; + auto task = make_task( + [cli_ptr, &exit_code](CoroScope& scope) -> coro::CoroTask { + exit_code = co_await run_reconstruct(cli_ptr, scope); + }, + "ReconstructMain"); - fs::create_directories(output_dir); + pipeline.set_source(task); + pipeline.set_destination(task); + pipeline.execute(); - return run_reconstruct(directory, output_dir, checkpoint_size, no_compress, - executor_threads) - .get(); + return exit_code; } diff --git a/src/dftracer/utils/binaries/dftracer_replay.cpp b/src/dftracer/utils/binaries/dftracer_replay.cpp index b8d37a6d..8eae75b7 100644 --- a/src/dftracer/utils/binaries/dftracer_replay.cpp +++ b/src/dftracer/utils/binaries/dftracer_replay.cpp @@ -1,7 +1,23 @@ +// Pipeline-driven replay binary. 
+// +// DAG: +// scan -> execute +// +// scan : enumerate inputs (.pfw / .pfw.gz, recursive optional) +// execute : either ReplayEngine::run_pipelined (producer+consumer with +// Channel so read/parse latency is hidden behind the +// consumer's apply_timing+execute) or, when --use-call-tree is +// set, the legacy replay_with_call_tree path that builds a +// hierarchical tree first + #include #include #include +#include #include +#include +#include +#include #include #include @@ -11,548 +27,566 @@ #endif #include +#include +#include #include -#include -#include #include +#include +#include + +#include "common_cli.h" using namespace dftracer::utils; +using namespace dftracer::utils::utilities; using namespace dftracer::utils::utilities::replay; -/** - * Collect trace files from directory or file list - */ -static std::vector collect_trace_files( - const std::vector& inputs, bool recursive) { - std::vector trace_files; +namespace { + +class ReplayArgParse : public cli::ArgParse { + public: + cli::PipelineArgs pipeline; + + std::vector inputs; + bool no_timing = false; + bool dry_run = false; + bool dftracer_mode = false; + bool no_sleep = false; + bool verbose = false; + bool recursive = false; + bool use_call_tree = false; + bool hierarchical_replay = false; + bool respect_call_hierarchy = false; + + std::string filter_pid_csv; + std::string exclude_pid_csv; + std::string filter_tid_csv; + std::string exclude_tid_csv; + std::string filter_function_csv; + std::string exclude_function_csv; + std::string filter_category_csv; + std::string exclude_category_csv; + + std::uint64_t start_timestamp = 0; + std::uint64_t end_timestamp = UINT64_MAX; + std::int64_t min_size = -1; + std::int64_t max_size = -1; + double sample_rate = 1.0; + std::uint64_t sample_seed = 0; + std::size_t max_events = 0; + std::size_t channel_capacity = 4096; + + explicit ReplayArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + schema(pipeline); + } - for (const auto& input : inputs) { - if 
(fs::is_directory(input)) { - if (recursive) { - for (const auto& entry : - fs::recursive_directory_iterator(input)) { - if (entry.is_regular_file()) { - std::string path = entry.path().string(); - if ((path.size() >= 4 && - path.substr(path.size() - 4) == ".pfw") || - (path.size() >= 7 && - path.substr(path.size() - 7) == ".pfw.gz")) { - trace_files.push_back(path); - } - } - } - } else { - for (const auto& entry : fs::directory_iterator(input)) { - if (entry.is_regular_file()) { - std::string path = entry.path().string(); - if ((path.size() >= 4 && - path.substr(path.size() - 4) == ".pfw") || - (path.size() >= 7 && - path.substr(path.size() - 7) == ".pfw.gz")) { - trace_files.push_back(path); - } - } - } - } - } else if (fs::is_regular_file(input)) { - trace_files.push_back(input); - } else { - DFTRACER_UTILS_LOG_ERROR("Input not found or not accessible: %s", - input.c_str()); + protected: + void register_args() override { + auto& p = parser(); + p.add_argument("inputs") + .help( + "Trace files (.pfw, .pfw.gz) or directories containing trace " + "files") + .nargs(argparse::nargs_pattern::at_least_one); + + p.add_argument("--no-timing") + .help("Ignore original timing and execute as fast as possible") + .flag(); + p.add_argument("--dry-run") + .help("Parse and analyze traces without executing operations") + .flag(); + p.add_argument("--dftracer-mode") + .help( + "Use DFTracer sleep-based replay (sleep for operation " + "duration instead of doing actual I/O)") + .flag(); + p.add_argument("--no-sleep") + .help( + "When used with --dftracer-mode, disable sleep calls for " + "maximum speed") + .flag(); + p.add_argument("--verbose") + .help("Enable verbose output and detailed statistics") + .flag(); + p.add_argument("-r", "--recursive") + .help("Recursively search directories for trace files") + .flag(); + + p.add_argument("--use-call-tree") + .help("Build and use call tree structure for hierarchical replay") + .flag(); + p.add_argument("--hierarchical-replay") + 
.help( + "Replay operations respecting parent-child call hierarchy " + "(requires --use-call-tree)") + .flag(); + p.add_argument("--respect-call-hierarchy") + .help( + "Replay child nodes immediately after parent (requires " + "--use-call-tree and --hierarchical-replay)") + .flag(); + + p.add_argument("--filter-pid") + .help("Only replay events from specific PID(s) (comma-separated)") + .default_value(std::string("")); + p.add_argument("--exclude-pid") + .help("Exclude events from specific PID(s) (comma-separated)") + .default_value(std::string("")); + p.add_argument("--filter-tid") + .help("Only replay events from specific TID(s) (comma-separated)") + .default_value(std::string("")); + p.add_argument("--exclude-tid") + .help("Exclude events from specific TID(s) (comma-separated)") + .default_value(std::string("")); + p.add_argument("--filter-function") + .help( + "Only replay specific function(s) (comma-separated, e.g., " + "'read,write,open')") + .default_value(std::string("")); + p.add_argument("--exclude-function") + .help("Exclude specific function(s) (comma-separated)") + .default_value(std::string("")); + p.add_argument("--filter-category") + .help( + "Only replay specific category/categories (comma-separated, " + "e.g., 'POSIX,storage')") + .default_value(std::string("")); + p.add_argument("--exclude-category") + .help("Exclude specific category/categories (comma-separated)") + .default_value(std::string("")); + + p.add_argument("--start-timestamp") + .help("Only replay events after this timestamp (microseconds)") + .default_value(std::uint64_t(0)) + .scan<'u', std::uint64_t>(); + p.add_argument("--end-timestamp") + .help("Only replay events before this timestamp (microseconds)") + .default_value(UINT64_MAX) + .scan<'u', std::uint64_t>(); + p.add_argument("--min-size") + .help("Only replay operations with size >= this value (bytes)") + .default_value(std::int64_t(-1)) + .scan<'i', std::int64_t>(); + p.add_argument("--max-size") + .help("Only replay 
operations with size <= this value (bytes)") + .default_value(std::int64_t(-1)) + .scan<'i', std::int64_t>(); + + p.add_argument("--sample-rate") + .help("Sample rate for replay (0.0-1.0, 1.0=all events, 0.1=10%)") + .default_value(1.0) + .scan<'g', double>(); + p.add_argument("--sample-seed") + .help("Random seed for sampling (for reproducibility)") + .default_value(std::uint64_t(0)) + .scan<'u', std::uint64_t>(); + p.add_argument("--max-events") + .help("Maximum number of events to replay (0=unlimited)") + .default_value(std::size_t(0)) + .scan<'u', std::size_t>(); + p.add_argument("--channel-capacity") + .help( + "Bounded Channel capacity between read/parse producer " + "and dispatch consumer (default 4096)") + .default_value(std::size_t(4096)) + .scan<'u', std::size_t>(); + } + + void post_parse() override { + auto& p = parser(); + inputs = p.get>("inputs"); + no_timing = p.get("--no-timing"); + dry_run = p.get("--dry-run"); + dftracer_mode = p.get("--dftracer-mode"); + no_sleep = p.get("--no-sleep"); + verbose = p.get("--verbose"); + recursive = p.get("--recursive"); + use_call_tree = p.get("--use-call-tree"); + hierarchical_replay = p.get("--hierarchical-replay"); + respect_call_hierarchy = p.get("--respect-call-hierarchy"); + + filter_pid_csv = p.get("--filter-pid"); + exclude_pid_csv = p.get("--exclude-pid"); + filter_tid_csv = p.get("--filter-tid"); + exclude_tid_csv = p.get("--exclude-tid"); + filter_function_csv = p.get("--filter-function"); + exclude_function_csv = p.get("--exclude-function"); + filter_category_csv = p.get("--filter-category"); + exclude_category_csv = p.get("--exclude-category"); + + start_timestamp = p.get("--start-timestamp"); + end_timestamp = p.get("--end-timestamp"); + min_size = p.get("--min-size"); + max_size = p.get("--max-size"); + sample_rate = p.get("--sample-rate"); + sample_seed = p.get("--sample-seed"); + max_events = p.get("--max-events"); + channel_capacity = p.get("--channel-capacity"); + } + + bool validate() override 
{ + if (no_sleep && !dftracer_mode) { + std::fprintf(stderr, + "Error: --no-sleep can only be used with " + "--dftracer-mode\n"); + return false; + } + if (hierarchical_replay && !use_call_tree) { + std::fprintf( + stderr, + "Error: --hierarchical-replay requires --use-call-tree\n"); + return false; + } + if (respect_call_hierarchy && !hierarchical_replay) { + std::fprintf(stderr, + "Error: --respect-call-hierarchy requires " + "--hierarchical-replay\n"); + return false; + } + if (sample_rate < 0.0 || sample_rate > 1.0) { + std::fprintf(stderr, + "Error: --sample-rate must be between 0.0 and 1.0\n"); + return false; } + return true; } +}; - return trace_files; +bool is_trace_file(const std::string& path) { + return (path.size() >= 4 && + path.compare(path.size() - 4, 4, ".pfw") == 0) || + (path.size() >= 7 && + path.compare(path.size() - 7, 7, ".pfw.gz") == 0); } -int main(int argc, char** argv) { -#ifdef DFTRACER_UTILS_MPI_ENABLED - MPI_Init(&argc, &argv); - mpi::MPIUtils::instance().initialize(); -#endif +std::unordered_set parse_csv_uint32(const std::string& csv) { + std::unordered_set out; + if (csv.empty()) return out; + std::istringstream ss(csv); + std::string token; + while (std::getline(ss, token, ',')) { + if (!token.empty()) { + try { + out.insert(static_cast(std::stoul(token))); + } catch (...) 
{ + } + } + } + return out; +} - DFTRACER_UTILS_LOGGER_INIT(); +std::unordered_set parse_csv_string(const std::string& csv) { + std::unordered_set out; + if (csv.empty()) return out; + std::istringstream ss(csv); + std::string token; + while (std::getline(ss, token, ',')) { + if (!token.empty()) out.insert(token); + } + return out; +} - // Get MPI rank for output control (defaults to rank 0, size 1 without MPI) +struct RunCtx { + const ReplayArgParse* cli = nullptr; int mpi_rank = 0; int mpi_size = 1; bool is_root = true; -#ifdef DFTRACER_UTILS_MPI_ENABLED - mpi_rank = mpi::MPIUtils::instance().get_rank(); - mpi_size = mpi::MPIUtils::instance().get_world_size(); - is_root = mpi::MPIUtils::instance().is_root(); -#endif - argparse::ArgumentParser program("dftracer_replay", - DFTRACER_UTILS_PACKAGE_VERSION); - program.add_description( - "DFTracer replay utility - replays I/O operations from DFTracer trace " - "files (.pfw, .pfw.gz)"); - - // Input files/directories - program.add_argument("inputs") - .help( - "Trace files (.pfw, .pfw.gz) or directories containing trace files") - .nargs(argparse::nargs_pattern::at_least_one); - - // Timing options - program.add_argument("--no-timing") - .help("Ignore original timing and execute as fast as possible") - .flag(); - - // Execution options - program.add_argument("--dry-run") - .help("Parse and analyze traces without executing operations") - .flag(); - - program.add_argument("--dftracer-mode") - .help( - "Use DFTracer sleep-based replay (sleep for operation duration " - "instead of doing actual I/O)") - .flag(); - - program.add_argument("--no-sleep") - .help( - "When used with --dftracer-mode, disable sleep calls for maximum " - "speed") - .flag(); - - program.add_argument("--verbose") - .help("Enable verbose output and detailed statistics") - .flag(); - - program.add_argument("-r", "--recursive") - .help("Recursively search directories for trace files") - .flag(); - - // Call tree options - 
program.add_argument("--use-call-tree") - .help("Build and use call tree structure for hierarchical replay") - .flag(); - - program.add_argument("--hierarchical-replay") - .help( - "Replay operations respecting parent-child call hierarchy " - "(requires --use-call-tree)") - .flag(); - - program.add_argument("--respect-call-hierarchy") - .help( - "Replay child nodes immediately after parent (requires " - "--use-call-tree and --hierarchical-replay)") - .flag(); - - // Filtering options - Process/Thread - program.add_argument("--filter-pid") - .help("Only replay events from specific PID(s) (comma-separated)") - .default_value(std::string("")); - - program.add_argument("--exclude-pid") - .help("Exclude events from specific PID(s) (comma-separated)") - .default_value(std::string("")); - - program.add_argument("--filter-tid") - .help("Only replay events from specific TID(s) (comma-separated)") - .default_value(std::string("")); - - program.add_argument("--exclude-tid") - .help("Exclude events from specific TID(s) (comma-separated)") - .default_value(std::string("")); - - // Filtering options - Function/Category - program.add_argument("--filter-function") - .help( - "Only replay specific function(s) (comma-separated, e.g., " - "'read,write,open')") - .default_value(std::string("")); - - program.add_argument("--exclude-function") - .help("Exclude specific function(s) (comma-separated)") - .default_value(std::string("")); - - program.add_argument("--filter-category") - .help( - "Only replay specific category/categories (comma-separated, e.g., " - "'POSIX,storage')") - .default_value(std::string("")); - - program.add_argument("--exclude-category") - .help("Exclude specific category/categories (comma-separated)") - .default_value(std::string("")); - - // Filtering options - Timestamp - program.add_argument("--start-timestamp") - .help("Only replay events after this timestamp (microseconds)") - .default_value(std::uint64_t(0)) - .scan<'u', std::uint64_t>(); - - 
program.add_argument("--end-timestamp") - .help("Only replay events before this timestamp (microseconds)") - .default_value(UINT64_MAX) - .scan<'u', std::uint64_t>(); - - // Filtering options - Size - program.add_argument("--min-size") - .help("Only replay operations with size >= this value (bytes)") - .default_value(std::int64_t(-1)) - .scan<'i', std::int64_t>(); - - program.add_argument("--max-size") - .help("Only replay operations with size <= this value (bytes)") - .default_value(std::int64_t(-1)) - .scan<'i', std::int64_t>(); - - // Sampling options - program.add_argument("--sample-rate") - .help("Sample rate for replay (0.0-1.0, 1.0=all events, 0.1=10%)") - .default_value(1.0) - .scan<'g', double>(); - - program.add_argument("--sample-seed") - .help("Random seed for sampling (for reproducibility)") - .default_value(std::uint64_t(0)) - .scan<'u', std::uint64_t>(); - - // Resource limits - program.add_argument("--max-events") - .help("Maximum number of events to replay (0=unlimited)") - .default_value(std::size_t(0)) - .scan<'u', std::size_t>(); - - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - DFTRACER_UTILS_LOG_ERROR("Argument parsing error: %s", err.what()); - std::cerr << program; - return 1; - } - - // Helper to parse comma-separated values - auto parse_csv_uint32 = - [](const std::string& csv) -> std::unordered_set { - std::unordered_set result; - if (csv.empty()) return result; - - std::istringstream ss(csv); - std::string token; - while (std::getline(ss, token, ',')) { - if (!token.empty()) { - result.insert(static_cast(std::stoul(token))); + ReplayConfig config; + std::vector trace_files; + ReplayResult result; + int exit_code = 0; + bool failed = false; + + double scan_ms = 0; + double execute_ms = 0; +}; + +coro::CoroTask task_scan(RunCtx* ctx) { + const auto t0 = std::chrono::steady_clock::now(); + + for (const auto& in : ctx->cli->inputs) { + std::error_code ec; + if (fs::is_directory(in, ec)) { + if 
(ctx->cli->recursive) { + for (const auto& e : fs::recursive_directory_iterator(in, ec)) { + if (e.is_regular_file(ec) && + is_trace_file(e.path().string())) { + ctx->trace_files.push_back(e.path().string()); + } + } + } else { + for (const auto& e : fs::directory_iterator(in, ec)) { + if (e.is_regular_file(ec) && + is_trace_file(e.path().string())) { + ctx->trace_files.push_back(e.path().string()); + } + } } + } else if (fs::is_regular_file(in, ec)) { + ctx->trace_files.push_back(in); + } else { + DFTRACER_UTILS_LOG_ERROR("Input not found or not accessible: %s", + in.c_str()); } - return result; - }; + } + std::sort(ctx->trace_files.begin(), ctx->trace_files.end()); - auto parse_csv_string = - [](const std::string& csv) -> std::unordered_set { - std::unordered_set result; - if (csv.empty()) return result; + ctx->scan_ms = std::chrono::duration( + std::chrono::steady_clock::now() - t0) + .count(); - std::istringstream ss(csv); - std::string token; - while (std::getline(ss, token, ',')) { - if (!token.empty()) { - result.insert(token); - } + if (ctx->trace_files.empty()) { + if (ctx->is_root) { + std::fprintf(stderr, + "No trace files found in the specified inputs.\n"); } - return result; - }; - - // Parse arguments - std::vector inputs = - program.get>("inputs"); - bool no_timing = program.get("--no-timing"); - bool dry_run = program.get("--dry-run"); - bool dftracer_mode = program.get("--dftracer-mode"); - bool no_sleep = program.get("--no-sleep"); - bool verbose = program.get("--verbose"); - bool recursive = program.get("--recursive"); - - // Call tree options - bool use_call_tree = program.get("--use-call-tree"); - bool hierarchical_replay = program.get("--hierarchical-replay"); - bool respect_call_hierarchy = program.get("--respect-call-hierarchy"); - - // Parse filter arguments - auto filter_pids = - parse_csv_uint32(program.get("--filter-pid")); - auto exclude_pids = - parse_csv_uint32(program.get("--exclude-pid")); - auto filter_tids = - 
parse_csv_uint32(program.get("--filter-tid")); - auto exclude_tids = - parse_csv_uint32(program.get("--exclude-tid")); - auto filter_functions = - parse_csv_string(program.get("--filter-function")); - auto exclude_functions = - parse_csv_string(program.get("--exclude-function")); - auto filter_categories = - parse_csv_string(program.get("--filter-category")); - auto exclude_categories = - parse_csv_string(program.get("--exclude-category")); - - std::uint64_t start_timestamp = - program.get("--start-timestamp"); - std::uint64_t end_timestamp = program.get("--end-timestamp"); - std::int64_t min_size = program.get("--min-size"); - std::int64_t max_size = program.get("--max-size"); - double sample_rate = program.get("--sample-rate"); - std::uint64_t sample_seed = program.get("--sample-seed"); - std::size_t max_events = program.get("--max-events"); - - // Validate --no-sleep usage - if (no_sleep && !dftracer_mode) { - std::cerr << "Error: --no-sleep can only be used with --dftracer-mode" - << std::endl; - return 1; + ctx->failed = true; + ctx->exit_code = 1; + co_return; } - // Validate call tree options - if (hierarchical_replay && !use_call_tree) { - std::cerr << "Error: --hierarchical-replay requires --use-call-tree" - << std::endl; - return 1; + if (ctx->is_root) { + std::printf("Found %zu trace file(s) to replay:\n", + ctx->trace_files.size()); + for (const auto& file : ctx->trace_files) { + std::printf(" %s\n", file.c_str()); + } } + co_return; +} - if (respect_call_hierarchy && !hierarchical_replay) { - std::cerr - << "Error: --respect-call-hierarchy requires --hierarchical-replay" - << std::endl; - return 1; +void print_configuration(const RunCtx& ctx) { + const auto& c = ctx.config; + std::printf("\n=== Replay Configuration ===\n"); + if (ctx.mpi_size > 1) { + std::printf("MPI processes: %d\n", ctx.mpi_size); } - - // Validate sample rate - if (sample_rate < 0.0 || sample_rate > 1.0) { - std::cerr << "Error: --sample-rate must be between 0.0 and 1.0" - << 
std::endl; - return 1; + std::printf("Maintain timing: %s\n", c.maintain_timing ? "yes" : "no"); + std::printf("Dry run: %s\n", c.dry_run ? "yes" : "no"); + if (c.dftracer_mode) { + std::printf("DFTracer mode: yes (%s)\n", + c.no_sleep ? "no-sleep" : "sleep-based"); + } else { + std::printf("DFTracer mode: no (actual I/O)\n"); } + if (c.use_call_tree) { + std::printf("Call tree mode: yes\n"); + std::printf(" Hierarchical replay: %s\n", + c.hierarchical_replay ? "yes" : "no"); + if (c.hierarchical_replay) { + std::printf(" Respect call hierarchy: %s\n", + c.respect_call_hierarchy ? "yes" : "no"); + } + } +} - // Collect trace files - std::vector trace_files = - collect_trace_files(inputs, recursive); +void print_filters(const RunCtx& ctx) { + const auto& c = ctx.config; + bool any = !c.filter_pids.empty() || !c.exclude_pids.empty() || + !c.filter_tids.empty() || !c.exclude_tids.empty() || + !c.filter_functions.empty() || !c.exclude_functions.empty() || + !c.filter_categories.empty() || !c.exclude_categories.empty() || + c.start_timestamp > 0 || c.end_timestamp < UINT64_MAX || + c.min_operation_size >= 0 || c.max_operation_size >= 0 || + c.sampling_rate < 1.0 || c.max_events > 0; + if (!any) return; + + std::printf("\nActive Filters:\n"); + auto print_uint_set = [](const char* label, + const std::unordered_set& s) { + if (s.empty()) return; + std::printf(" %s:", label); + for (auto v : s) std::printf(" %u", v); + std::printf("\n"); + }; + auto print_str_set = [](const char* label, + const std::unordered_set& s) { + if (s.empty()) return; + std::printf(" %s:", label); + for (const auto& v : s) std::printf(" %s", v.c_str()); + std::printf("\n"); + }; + print_uint_set("Filter PIDs", c.filter_pids); + print_uint_set("Exclude PIDs", c.exclude_pids); + print_uint_set("Filter TIDs", c.filter_tids); + print_uint_set("Exclude TIDs", c.exclude_tids); + print_str_set("Filter functions", c.filter_functions); + print_str_set("Exclude functions", c.exclude_functions); + 
print_str_set("Filter categories", c.filter_categories); + print_str_set("Exclude categories", c.exclude_categories); + if (c.start_timestamp > 0) + std::printf(" Start timestamp: %" PRIu64 "\n", c.start_timestamp); + if (c.end_timestamp < UINT64_MAX) + std::printf(" End timestamp: %" PRIu64 "\n", c.end_timestamp); + if (c.min_operation_size >= 0) + std::printf(" Min operation size: %" PRId64 " bytes\n", + c.min_operation_size); + if (c.max_operation_size >= 0) + std::printf(" Max operation size: %" PRId64 " bytes\n", + c.max_operation_size); + if (c.sampling_rate < 1.0) + std::printf(" Sampling rate: %g%%\n", c.sampling_rate * 100.0); + if (c.max_events > 0) std::printf(" Max events: %zu\n", c.max_events); +} - if (trace_files.empty()) { - if (is_root) - std::cerr << "No trace files found in the specified inputs." - << std::endl; -#ifdef DFTRACER_UTILS_MPI_ENABLED - mpi::MPIUtils::instance().finalize(); - MPI_Finalize(); -#endif - return 1; - } +coro::CoroTask task_execute(RunCtx* ctx, CoroScope* scope) { + if (ctx->failed) co_return; - if (is_root) { - std::cout << "Found " << trace_files.size() - << " trace file(s) to replay:" << std::endl; - for (const auto& file : trace_files) { - std::cout << " " << file << std::endl; - } + const auto t0 = std::chrono::steady_clock::now(); + + if (ctx->is_root) { + print_configuration(*ctx); + print_filters(*ctx); + std::printf("\n=== Starting Replay ===\n"); } - // Configure replay - ReplayConfig config; - config.maintain_timing = !no_timing; - config.dry_run = dry_run; - config.dftracer_mode = dftracer_mode; - config.no_sleep = no_sleep; - config.verbose = verbose; - - // Store MPI info in config - config.mpi_rank = mpi_rank; - config.mpi_size = mpi_size; - - // Call tree options - config.use_call_tree = use_call_tree; - config.hierarchical_replay = hierarchical_replay; - config.respect_call_hierarchy = respect_call_hierarchy; - - // Apply filters - config.filter_pids = filter_pids; - config.exclude_pids = exclude_pids; - 
config.filter_tids = filter_tids; - config.exclude_tids = exclude_tids; - config.filter_functions = filter_functions; - config.exclude_functions = exclude_functions; - config.filter_categories = filter_categories; - config.exclude_categories = exclude_categories; - - // Apply ranges and limits - config.start_timestamp = start_timestamp; - config.end_timestamp = end_timestamp; - config.min_operation_size = min_size; - config.max_operation_size = max_size; - config.sampling_rate = sample_rate; - config.sample_seed = sample_seed; - config.max_events = max_events; - - // Print configuration (only on rank 0) - if (is_root) { - std::cout << "\n=== Replay Configuration ===" << std::endl; - if (mpi_size > 1) { - std::cout << "MPI processes: " << mpi_size << std::endl; - } - std::cout << "Maintain timing: " - << (config.maintain_timing ? "yes" : "no") << std::endl; - std::cout << "Dry run: " << (config.dry_run ? "yes" : "no") - << std::endl; - if (config.dftracer_mode) { - std::cout << "DFTracer mode: yes (" - << (config.no_sleep ? "no-sleep" : "sleep-based") << ")" - << std::endl; - } else { - std::cout << "DFTracer mode: no (actual I/O)" << std::endl; - } - if (config.use_call_tree) { - std::cout << "Call tree mode: yes" << std::endl; - std::cout << " Hierarchical replay: " - << (config.hierarchical_replay ? "yes" : "no") - << std::endl; - if (config.hierarchical_replay) { - std::cout << " Respect call hierarchy: " - << (config.respect_call_hierarchy ? "yes" : "no") - << std::endl; + ReplayEngine engine(ctx->config); + + if (ctx->config.use_call_tree) { + // Call tree path stays sync — replay_with_call_tree builds the + // full tree in memory first, so there's nothing to hide behind a + // channel. We just call it from inside this task. 
+ if (ctx->cli->inputs.size() != 1 || + !fs::is_directory(ctx->cli->inputs[0])) { + if (ctx->is_root) { + std::fprintf(stderr, + "Error: --use-call-tree requires exactly one " + "input directory\n"); } + ctx->failed = true; + ctx->exit_code = 1; + co_return; } - } - // Print active filters - // Print active filters (only on rank 0) - if (is_root && - (!filter_pids.empty() || !exclude_pids.empty() || - !filter_tids.empty() || !exclude_tids.empty() || - !filter_functions.empty() || !exclude_functions.empty() || - !filter_categories.empty() || !exclude_categories.empty() || - start_timestamp > 0 || end_timestamp < UINT64_MAX || min_size >= 0 || - max_size >= 0 || sample_rate < 1.0 || max_events > 0)) { - std::cout << "\nActive Filters:" << std::endl; - if (!filter_pids.empty()) { - std::cout << " Filter PIDs: "; - for (auto pid : filter_pids) std::cout << pid << " "; - std::cout << std::endl; - } - if (!exclude_pids.empty()) { - std::cout << " Exclude PIDs: "; - for (auto pid : exclude_pids) std::cout << pid << " "; - std::cout << std::endl; - } - if (!filter_tids.empty()) { - std::cout << " Filter TIDs: "; - for (auto tid : filter_tids) std::cout << tid << " "; - std::cout << std::endl; - } - if (!exclude_tids.empty()) { - std::cout << " Exclude TIDs: "; - for (auto tid : exclude_tids) std::cout << tid << " "; - std::cout << std::endl; - } - if (!filter_functions.empty()) { - std::cout << " Filter functions: "; - for (const auto& f : filter_functions) std::cout << f << " "; - std::cout << std::endl; - } - if (!exclude_functions.empty()) { - std::cout << " Exclude functions: "; - for (const auto& f : exclude_functions) std::cout << f << " "; - std::cout << std::endl; - } - if (!filter_categories.empty()) { - std::cout << " Filter categories: "; - for (const auto& c : filter_categories) std::cout << c << " "; - std::cout << std::endl; - } - if (!exclude_categories.empty()) { - std::cout << " Exclude categories: "; - for (const auto& c : exclude_categories) std::cout << 
c << " "; - std::cout << std::endl; - } - if (start_timestamp > 0) { - std::cout << " Start timestamp: " << start_timestamp << std::endl; - } - if (end_timestamp < UINT64_MAX) { - std::cout << " End timestamp: " << end_timestamp << std::endl; - } - if (min_size >= 0) { - std::cout << " Min operation size: " << min_size << " bytes" - << std::endl; - } - if (max_size >= 0) { - std::cout << " Max operation size: " << max_size << " bytes" - << std::endl; - } - if (sample_rate < 1.0) { - std::cout << " Sampling rate: " << (sample_rate * 100.0) << "%" - << std::endl; - } - if (max_events > 0) { - std::cout << " Max events: " << max_events << std::endl; + if (ctx->is_root) { + std::printf("Using call tree hierarchical replay mode\n"); } + ctx->result = engine.replay_with_call_tree(ctx->cli->inputs[0]); + } else { + // Pipelined path: producer (read+parse) feeds Channel; + // consumer (apply_timing+execute) drains it. + co_await engine.run_pipelined(*scope, ctx->trace_files, ctx->result, + ctx->cli->channel_capacity); } - // Create replay engine and execute - if (is_root) std::cout << "\n=== Starting Replay ===" << std::endl; + ctx->execute_ms = std::chrono::duration( + std::chrono::steady_clock::now() - t0) + .count(); - auto start_time = std::chrono::steady_clock::now(); + if (ctx->is_root) { + std::printf("\n=== Replay Completed ===\n"); + std::printf("Wall clock time: %.3f ms\n", ctx->execute_ms); + ctx->result.print_summary(ctx->config.verbose); + } - ReplayEngine engine(config); - ReplayResult result; + // Exit-code semantics preserved from the previous binary: + // 0 = at least one event executed and none failed + // 1 = nothing executed (no events / no trace files) + // 2 = at least one failed event + if (ctx->result.failed_events > 0) { + if (ctx->is_root) std::printf("\nReplay completed with errors.\n"); + ctx->exit_code = 2; + } else if (ctx->result.executed_events > 0) { + if (ctx->is_root) std::printf("\nReplay completed successfully.\n"); + ctx->exit_code = 
0; + } else { + if (ctx->is_root) std::printf("\nNo events were executed.\n"); + ctx->exit_code = 1; + } + co_return; +} + +int run(int argc, char** argv) { + DFTRACER_UTILS_LOGGER_INIT(); + + argparse::ArgumentParser program("dftracer_replay", + DFTRACER_UTILS_PACKAGE_VERSION); + program.add_description( + "DFTracer replay utility - replays I/O operations from DFTracer " + "trace files (.pfw, .pfw.gz)"); + + ReplayArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; + + RunCtx ctx; + ctx.cli = &cli; - if (use_call_tree) { - // Call tree mode: expects a single directory containing trace files - if (inputs.size() != 1 || !fs::is_directory(inputs[0])) { - if (is_root) - std::cerr << "Error: --use-call-tree requires exactly one " - "input directory" - << std::endl; #ifdef DFTRACER_UTILS_MPI_ENABLED - mpi::MPIUtils::instance().finalize(); - MPI_Finalize(); + ctx.mpi_rank = mpi::MPIUtils::instance().get_rank(); + ctx.mpi_size = mpi::MPIUtils::instance().get_world_size(); + ctx.is_root = mpi::MPIUtils::instance().is_root(); #endif - return 1; - } - if (is_root) - std::cout << "Using call tree hierarchical replay mode" - << std::endl; - result = engine.replay_with_call_tree(inputs[0]); - } else { - // Normal mode: replay trace files directly - result = engine.replay(trace_files); + + // Mirror argparse output into ReplayConfig. 
+ auto& c = ctx.config; + c.maintain_timing = !cli.no_timing; + c.dry_run = cli.dry_run; + c.dftracer_mode = cli.dftracer_mode; + c.no_sleep = cli.no_sleep; + c.verbose = cli.verbose; + c.mpi_rank = ctx.mpi_rank; + c.mpi_size = ctx.mpi_size; + c.use_call_tree = cli.use_call_tree; + c.hierarchical_replay = cli.hierarchical_replay; + c.respect_call_hierarchy = cli.respect_call_hierarchy; + c.filter_pids = parse_csv_uint32(cli.filter_pid_csv); + c.exclude_pids = parse_csv_uint32(cli.exclude_pid_csv); + c.filter_tids = parse_csv_uint32(cli.filter_tid_csv); + c.exclude_tids = parse_csv_uint32(cli.exclude_tid_csv); + c.filter_functions = parse_csv_string(cli.filter_function_csv); + c.exclude_functions = parse_csv_string(cli.exclude_function_csv); + c.filter_categories = parse_csv_string(cli.filter_category_csv); + c.exclude_categories = parse_csv_string(cli.exclude_category_csv); + c.start_timestamp = cli.start_timestamp; + c.end_timestamp = cli.end_timestamp; + c.min_operation_size = cli.min_size; + c.max_operation_size = cli.max_size; + c.sampling_rate = cli.sample_rate; + c.sample_seed = cli.sample_seed; + c.max_events = cli.max_events; + + auto pipeline_config = + cli::build_pipeline_config("DFTracer Replay", cli.pipeline); + Pipeline pipeline(pipeline_config); + + RunCtx* ctx_ptr = &ctx; + auto scan = make_task( + [ctx_ptr](CoroScope&) -> coro::CoroTask { + co_await task_scan(ctx_ptr); + }, + "scan"); + auto execute = make_task( + [ctx_ptr](CoroScope& scope) -> coro::CoroTask { + co_await task_execute(ctx_ptr, &scope); + }, + "execute"); + execute->depends_on(scan); + + pipeline.set_source(scan); + pipeline.set_destination(execute); + pipeline.execute(); + + if (cli.verbose && ctx.is_root) { + std::fprintf(stderr, "[done] scan=%.1fms execute=%.1fms\n", ctx.scan_ms, + ctx.execute_ms); } - auto end_time = std::chrono::steady_clock::now(); - auto total_wall_time = - std::chrono::duration_cast(end_time - - start_time); + return ctx.exit_code; +} - if (is_root) { - 
std::cout << "\n=== Replay Completed ===" << std::endl; - std::cout << "Wall clock time: " - << static_cast(total_wall_time.count()) / 1000.0 - << " ms" << std::endl; +} // namespace - // Print results - result.print_summary(verbose); - } +int main(int argc, char** argv) { +#ifdef DFTRACER_UTILS_MPI_ENABLED + MPI_Init(&argc, &argv); + mpi::MPIUtils::instance().initialize(); +#endif - // Return appropriate exit code - int exit_code = 0; - if (result.failed_events > 0) { - if (is_root) - std::cout << "\nReplay completed with errors." << std::endl; - exit_code = 2; - } else if (result.executed_events > 0) { - if (is_root) - std::cout << "\nReplay completed successfully." << std::endl; - exit_code = 0; - } else { - if (is_root) std::cout << "\nNo events were executed." << std::endl; - exit_code = 1; - } + int rc = run(argc, argv); #ifdef DFTRACER_UTILS_MPI_ENABLED mpi::MPIUtils::instance().finalize(); MPI_Finalize(); #endif - - return exit_code; + return rc; } diff --git a/src/dftracer/utils/binaries/dftracer_server.cpp b/src/dftracer/utils/binaries/dftracer_server.cpp index c740b21c..f17b951b 100644 --- a/src/dftracer/utils/binaries/dftracer_server.cpp +++ b/src/dftracer/utils/binaries/dftracer_server.cpp @@ -1,11 +1,7 @@ #include -#include -#include -#include #include #include #include -#include #include #include #include @@ -16,28 +12,65 @@ #include #include -#include #include #include #include -#include + +#include "common_cli.h" using namespace dftracer::utils; using namespace dftracer::utils::server; -static coro::CoroTask run_server(argparse::ArgumentParser& program) { - std::string bind_addr = program.get("--bind"); - uint16_t port = program.get("--port"); - std::string directory = program.get("--directory"); - std::string index_dir = program.get("--index-dir"); - std::size_t executor_threads = - program.get("--executor-threads"); - - // When no explicit index dir is given, default to the trace - // directory so `.dftindex` stores persist across restarts - // 
and don't need to be rebuilt every time. +class ServerArgParse : public cli::ArgParse { + public: + cli::DirectoryArgs directory{cli::DirMode::REQUIRED}; + cli::PipelineArgs pipeline; + + std::string index_dir; + std::string bind_addr = "0.0.0.0"; + uint16_t port = 8080; + + explicit ServerArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + schema(directory, pipeline); + } + + protected: + void register_args() override { + parser() + .add_argument("--index-dir") + .help( + "Directory for root-local .dftindex stores (default: same as " + "--directory)") + .default_value(""); + + parser() + .add_argument("-b", "--bind") + .help("Bind address") + .default_value("0.0.0.0"); + + parser() + .add_argument("-p", "--port") + .help("Listen port") + .scan<'d', uint16_t>() + .default_value(static_cast(8080)); + } + + void post_parse() override { + index_dir = parser().get("--index-dir"); + bind_addr = parser().get("--bind"); + port = parser().get("--port"); + } +}; + +static coro::CoroTask run_server(const ServerArgParse* cli) { + const auto& bind_addr = cli->bind_addr; + auto port = cli->port; + const auto& dir = cli->directory.value; + auto index_dir = cli->index_dir; + auto executor_threads = cli->pipeline.executor_threads; + if (index_dir.empty()) { - index_dir = directory; + index_dir = dir; std::fprintf(stderr, "Using trace directory for indexes: %s\n", index_dir.c_str()); } else { @@ -45,28 +78,22 @@ static coro::CoroTask run_server(argparse::ArgumentParser& program) { } auto pipeline_config = - PipelineConfig() - .with_name("DFTracer Server") - .with_compute_threads(executor_threads) - .with_watchdog(false) // Server is long-lived; no watchdog - .with_global_timeout(std::chrono::seconds(0)) // Run forever - .with_task_timeout(std::chrono::seconds(0)) // No per-task timeout - .with_io_backend( - io::IoBackendType::THREADPOOL) // Thread pool IO for server - .with_io_batch_size(1); + cli::build_pipeline_config("DFTracer Server", cli->pipeline); + 
pipeline_config.with_io_backend(io::IoBackendType::THREADPOOL) + .with_io_batch_size(1) + .with_watchdog(false) + .with_global_timeout(std::chrono::seconds(0)) + .with_task_timeout(std::chrono::seconds(0)); Pipeline pipeline(pipeline_config); - // Build trace index (scan directory, load bloom indexes) - TraceIndex trace_index(directory, index_dir, executor_threads); + TraceIndex trace_index(dir, index_dir, executor_threads); co_await trace_index.initialize(); - // Set up router Router router; register_trace_api(router, trace_index); register_viz_api(router, trace_index); - // Start TCP listener TcpListener listener(bind_addr, port); if (!listener.start()) { DFTRACER_UTILS_LOG_ERROR("Failed to bind to %s:%u", bind_addr.c_str(), @@ -77,9 +104,8 @@ static coro::CoroTask run_server(argparse::ArgumentParser& program) { std::fprintf(stderr, "DFTracer server listening on %s:%u\n", bind_addr.c_str(), port); std::fprintf(stderr, "Serving %zu trace files from %s\n", - trace_index.file_count(), directory.c_str()); + trace_index.file_count(), dir.c_str()); - // Register listen fd so signal handler can unblock accept(). g_listen_fd.store(listener.fd(), std::memory_order_release); auto server_task = make_task( @@ -97,10 +123,6 @@ static coro::CoroTask run_server(argparse::ArgumentParser& program) { pipeline.set_source(server_task); pipeline.set_destination(server_task); - // Run until SIGINT/SIGTERM. - // Signal handler sets g_shutdown_requested, which causes accept_loop - // to break, CoroScope drains in-flight handlers, and - // pipeline.execute() returns. pipeline.execute(); std::fprintf(stderr, "Server shut down gracefully\n"); @@ -117,40 +139,11 @@ int main(int argc, char** argv) { "Serve DFTracer trace data over HTTP. 
Query, filter, and stream " "trace events via REST API."); - program.add_argument("-b", "--bind") - .help("Bind address") - .default_value("0.0.0.0"); - - program.add_argument("-p", "--port") - .help("Listen port") - .scan<'d', uint16_t>() - .default_value(static_cast(8080)); - - program.add_argument("-d", "--directory") - .help("Directory containing trace files") - .required(); - - program.add_argument("--index-dir") - .help( - "Directory for root-local .dftindex stores (default: same as " - "--directory)") - .default_value(""); - - program.add_argument("--executor-threads") - .help("Number of worker threads") - .scan<'d', std::size_t>() - .default_value( - static_cast(dftracer_utils_hardware_concurrency())); + ServerArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; install_signal_handlers(); - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - DFTRACER_UTILS_LOG_ERROR("Error: %s", err.what()); - std::cerr << program; - return 1; - } - - return run_server(program).get(); + return run_server(&cli).get(); } diff --git a/src/dftracer/utils/binaries/dftracer_split.cpp b/src/dftracer/utils/binaries/dftracer_split.cpp index 8be38958..03ddf4c9 100644 --- a/src/dftracer/utils/binaries/dftracer_split.cpp +++ b/src/dftracer/utils/binaries/dftracer_split.cpp @@ -1,8 +1,6 @@ #include -#include -#include #include -#include +#include #include #include #include @@ -14,10 +12,11 @@ #include #include -#include #include #include +#include "common_cli.h" + using namespace dftracer::utils; using namespace dftracer::utils::task_graph; using Metadata = utilities::composites::dft::MetadataCollectorUtilityOutput; @@ -26,142 +25,83 @@ using ChunkManifest = using ExtractInput = utilities::composites::dft::ChunkExtractorUtilityInput; using ExtractResult = utilities::composites::dft::ChunkExtractorUtilityOutput; -int main(int argc, char** argv) { - DFTRACER_UTILS_LOGGER_INIT(); - - auto default_checkpoint_size_str = - 
std::to_string(dftracer::utils::utilities::indexer::internal::Indexer:: - DEFAULT_CHECKPOINT_SIZE) + - " B (" + - std::to_string(dftracer::utils::utilities::indexer::internal::Indexer:: - DEFAULT_CHECKPOINT_SIZE / - (1024 * 1024)) + - " MB)"; +class SplitArgParse : public cli::ArgParse { + public: + cli::DirectoryArgs directory; + cli::PipelineArgs pipeline; + cli::IndexingArgs indexing; + cli::WatchdogArgs watchdog; + + std::string app_name = "app"; + std::string output_dir = "./split"; + int chunk_size_mb = 4; + bool compress = true; + bool verbose = false; + bool verify = false; + + explicit SplitArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + indexing.force_help = + "Override existing files and force index recreation"; + schema(directory, pipeline, indexing, watchdog); + } - argparse::ArgumentParser program("dftracer_split", - DFTRACER_UTILS_PACKAGE_VERSION); - program.add_description( - "Split DFTracer traces into equal-sized chunks using explicit pipeline " - "with maximum parallelism"); + protected: + void register_args() override { + parser() + .add_argument("-n", "--app-name") + .help("Application name for output files") + .default_value("app"); + + parser() + .add_argument("-o", "--output") + .help("Output directory for split files") + .default_value("./split"); + + parser() + .add_argument("-s", "--chunk-size") + .help("Chunk size in MB") + .scan<'d', int>() + .default_value(4); + + parser() + .add_argument("-c", "--compress") + .help("Compress output files with gzip") + .flag() + .default_value(true); + + parser() + .add_argument("-v", "--verbose") + .help("Enable verbose mode") + .flag(); + + parser() + .add_argument("--verify") + .help("Verify output chunks match input by comparing event IDs") + .flag(); + } - program.add_argument("-n", "--app-name") - .help("Application name for output files") - .default_value("app"); - - program.add_argument("-d", "--directory") - .help("Input directory containing .pfw or .pfw.gz files") - 
.default_value("."); - - program.add_argument("-o", "--output") - .help("Output directory for split files") - .default_value("./split"); - - program.add_argument("-s", "--chunk-size") - .help("Chunk size in MB") - .scan<'d', int>() - .default_value(4); - - program.add_argument("-f", "--force") - .help("Override existing files and force index recreation") - .flag(); - - program.add_argument("-c", "--compress") - .help("Compress output files with gzip") - .flag() - .default_value(true); - - program.add_argument("-v", "--verbose").help("Enable verbose mode").flag(); - - program.add_argument("--checkpoint-size") - .help("Checkpoint size for indexing in bytes (default: " + - default_checkpoint_size_str + ")") - .scan<'d', std::size_t>() - .default_value(static_cast( - dftracer::utils::utilities::indexer::internal::Indexer:: - DEFAULT_CHECKPOINT_SIZE)); - - program.add_argument("--executor-threads") - .help( - "Number of executor threads for parallel processing (default: " - "number " - "of CPU cores)") - .scan<'d', std::size_t>() - .default_value( - static_cast(dftracer_utils_hardware_concurrency())); - - program.add_argument("--index-dir") - .help("Directory to store index files (default: system temp directory)") - .default_value(""); - - program.add_argument("--verify") - .help("Verify output chunks match input by comparing event IDs") - .flag(); - - program.add_argument("--disable-watchdog") - .help("Disable watchdog for hang detection") - .flag(); - - program.add_argument("--watchdog-global-timeout") - .help( - "Watchdog global timeout for pipeline execution in seconds (0 = no " - "timeout)") - .scan<'d', int>() - .default_value(0); - - program.add_argument("--watchdog-task-timeout") - .help("Watchdog default task timeout in seconds (0 = no timeout)") - .scan<'d', int>() - .default_value(0); - - program.add_argument("--watchdog-interval") - .help("Watchdog check interval in seconds") - .scan<'d', int>() - .default_value(1); - - 
program.add_argument("--watchdog-warning-threshold") - .help("Watchdog long-running task warning threshold in seconds") - .scan<'d', int>() - .default_value(300); - - program.add_argument("--watchdog-idle-timeout") - .help("Watchdog idle timeout in seconds (0 = use default)") - .scan<'d', int>() - .default_value(300); - - program.add_argument("--watchdog-deadlock-timeout") - .help("Watchdog deadlock timeout in seconds (0 = use default)") - .scan<'d', int>() - .default_value(600); - - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - DFTRACER_UTILS_LOG_ERROR("Error occurred: %s", err.what()); - std::cerr << program << std::endl; - return 1; + void post_parse() override { + app_name = parser().get("--app-name"); + output_dir = parser().get("--output"); + chunk_size_mb = parser().get("--chunk-size"); + compress = parser().get("--compress"); + verbose = parser().get("--verbose"); + verify = parser().get("--verify"); } +}; + +static coro::CoroTask run_split(const SplitArgParse* cli) { + const auto log_dir = fs::absolute(cli->directory.value).string(); + const auto output_dir = fs::absolute(cli->output_dir).string(); + const auto& app_name = cli->app_name; + const auto chunk_size_mb = cli->chunk_size_mb; + const auto force = cli->indexing.force; + const auto compress = cli->compress; + const auto verify = cli->verify; + const auto checkpoint_size = cli->indexing.checkpoint_size; + const auto executor_threads = cli->pipeline.executor_threads; + auto index_dir = cli->indexing.index_dir; - // Parse arguments - std::string app_name = program.get("--app-name"); - std::string log_dir = program.get("--directory"); - std::string output_dir = program.get("--output"); - int chunk_size_mb = program.get("--chunk-size"); - bool force = program.get("--force"); - bool compress = program.get("--compress"); - bool verify = program.get("--verify"); - std::size_t checkpoint_size = program.get("--checkpoint-size"); - std::size_t executor_threads = - 
program.get("--executor-threads"); - std::string index_dir = program.get("--index-dir"); - bool disable_watchdog = program.get("--disable-watchdog"); - int global_timeout = program.get("--watchdog-global-timeout"); - int task_timeout = program.get("--watchdog-task-timeout"); - int watchdog_interval = program.get("--watchdog-interval"); - int warning_threshold = program.get("--watchdog-warning-threshold"); - int idle_timeout = program.get("--watchdog-idle-timeout"); - int deadlock_timeout = program.get("--watchdog-deadlock-timeout"); - - // Setup temp index directory std::string temp_index_dir; if (index_dir.empty()) { temp_index_dir = fs::temp_directory_path() / @@ -173,9 +113,6 @@ int main(int argc, char** argv) { index_dir.c_str()); } - log_dir = fs::absolute(log_dir).string(); - output_dir = fs::absolute(output_dir).string(); - std::printf("==========================================\n"); std::printf("DFTracer Split (Explicit Pipeline)\n"); std::printf("==========================================\n"); @@ -193,20 +130,8 @@ int main(int argc, char** argv) { fs::create_directories(output_dir); } - // Create pipeline with configuration - auto pipeline_config = - PipelineConfig() - .with_name("DFTracer Split") - .with_compute_threads(executor_threads) - .with_watchdog(!disable_watchdog) - .with_global_timeout(std::chrono::seconds(global_timeout)) - .with_task_timeout(std::chrono::seconds(task_timeout)) - .with_watchdog_interval(std::chrono::seconds(watchdog_interval)) - .with_warning_threshold(std::chrono::seconds(warning_threshold)) - .with_executor_idle_timeout(std::chrono::seconds(idle_timeout)) - .with_executor_deadlock_timeout( - std::chrono::seconds(deadlock_timeout)); - + auto pipeline_config = cli::build_pipeline_config( + "DFTracer Split", cli->pipeline, cli->watchdog); Pipeline pipeline(pipeline_config); auto start_time = std::chrono::high_resolution_clock::now(); @@ -227,7 +152,7 @@ int main(int argc, char** argv) { if (input_files.empty()) { 
DFTRACER_UTILS_LOG_ERROR("No .pfw or .pfw.gz files found in %s", log_dir.c_str()); - return 1; + co_return 1; } DFTRACER_UTILS_LOG_INFO("Found %zu input files", input_files.size()); @@ -247,30 +172,53 @@ int main(int argc, char** argv) { auto graph = TaskGraph::builder( {.name = "DFTracerSplit", .max_concurrency = executor_threads}); - DFTRACER_UTILS_LOG_INFO("%s", "Creating file processing tasks..."); + DFTRACER_UTILS_LOG_INFO("%s", "Creating batch index task..."); auto* input_files_ptr = &input_files; + auto batch_index_task = make_task( + [input_files_ptr, checkpoint_size, index_dir, + executor_threads](CoroScope& ctx) -> coro::CoroTask { + auto index_path = + utilities::composites::dft::internal::determine_index_path( + input_files_ptr->front(), index_dir); + dftracer::utils::rocksdb::RocksDBManager::instance().reset( + index_path); + + auto batch_config = + std::make_shared(); + batch_config->file_paths = *input_files_ptr; + batch_config->index_dir = index_dir; + batch_config->checkpoint_size = checkpoint_size; + batch_config->parallelism = executor_threads; + batch_config->use_batch_write = true; + batch_config->rebuild_root_summaries = true; + + auto result = + co_await utilities::indexer::IndexBatchBuilderUtility::process( + &ctx, std::move(batch_config)); + for (const auto& r : result.results) { + if (!r.success && !r.error_message.empty()) { + DFTRACER_UTILS_LOG_ERROR("Auto-indexing failed for %s: %s", + r.file_path.c_str(), + r.error_message.c_str()); + } + } + }, + "BatchIndex"); + graph.add(batch_index_task); + + DFTRACER_UTILS_LOG_INFO("%s", "Creating file processing tasks..."); + auto file_metadata = graph.parallel( input_files.size(), - [input_files_ptr, checkpoint_size, force, index_dir, verify]( + [input_files_ptr, checkpoint_size, index_dir, verify]( CoroScope&, std::size_t idx) -> coro::CoroTask { const auto& file_path = (*input_files_ptr)[idx]; - // Determine index path std::string index_path = 
utilities::composites::dft::internal::determine_index_path( file_path, index_dir); - // Build index - auto idx_input = - utilities::indexer::IndexBuildConfig::for_file(file_path) - .with_checkpoint_size(checkpoint_size) - .with_force_rebuild(false) - .with_index_dir(index_dir); - co_await utilities::indexer::IndexBuilderUtility{}.process( - idx_input); - - // Collect metadata auto meta_input = utilities::composites::dft::MetadataCollectorUtilityInput:: from_file(file_path) @@ -285,6 +233,10 @@ int main(int argc, char** argv) { }, {.name = "ProcessFile"}); + for (const auto& meta_task : file_metadata.tasks()) { + meta_task->depends_on(batch_index_task); + } + DFTRACER_UTILS_LOG_INFO("%s", "Creating chunk mapping task..."); auto manifests_group = graph.reduce>( @@ -359,7 +311,6 @@ int main(int argc, char** argv) { results.push_back(co_await future); } - // Sort by chunk index std::sort(results.begin(), results.end(), [](const ExtractResult& a, const ExtractResult& b) { return a.chunk_index < b.chunk_index; @@ -384,20 +335,15 @@ int main(int argc, char** argv) { std::vector all_metadata; }; - // Verification task receives both extraction results and metadata. - // Both are passed via combiner so the scheduler keeps parent - // results alive until this task consumes them. task_verify_chunks = make_task( [](CoroScope&, const VerifyInput& input) -> coro::CoroTask< utilities::composites::ChunkVerificationUtilityOutput> { - // Sum output hashes from extraction results std::size_t output_hash = 0; for (const auto& chunk : input.chunks) { output_hash += chunk.event_hash; } - // Sum input hashes from metadata (computed during collection) std::size_t input_hash = 0; for (const auto& meta : input.all_metadata) { if (!meta.success) continue; @@ -411,8 +357,6 @@ int main(int argc, char** argv) { }, "VerifyChunks"); - // Depend on both extract results and metadata tasks. - // The combiner collects parent outputs into the typed struct. 
task_verify_chunks->depends_on(task_extract_chunks); for (const auto& meta_task : file_metadata.tasks()) { task_verify_chunks->depends_on(meta_task); @@ -420,8 +364,6 @@ int main(int argc, char** argv) { task_verify_chunks->with_combiner( [](const std::vector& inputs) -> std::any { - // inputs[0] = ExtractChunksOutput (from extract task) - // inputs[1..N] = Metadata (from each metadata task) auto chunks = std::any_cast(inputs[0]); std::vector all_metadata; @@ -441,12 +383,10 @@ int main(int argc, char** argv) { // Phase 4: Execute Pipeline DFTRACER_UTILS_LOG_INFO("%s", "Executing pipeline..."); - pipeline.set_source(file_metadata.tasks()); + pipeline.set_source(batch_index_task); pipeline.set_destination(final_task); pipeline.execute(); - // Get results from the destination task only (intermediate task values - // are released after pipeline execution) auto end_time = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end_time - start_time; @@ -476,7 +416,6 @@ int main(int argc, char** argv) { std::printf(" Output hash: 0x%016" PRIx64 "\n", verify_result.output_hash); } else { - // Without verification: extract task IS the destination, safe to read auto extraction_results = task_extract_chunks->get(); @@ -503,12 +442,27 @@ int main(int argc, char** argv) { std::printf("==========================================\n"); - // Cleanup temporary index directory if created if (!temp_index_dir.empty() && fs::exists(temp_index_dir)) { DFTRACER_UTILS_LOG_INFO("Cleaning up temporary index directory: %s", temp_index_dir.c_str()); fs::remove_all(temp_index_dir); } - return exit_code; + co_return exit_code; +} + +int main(int argc, char** argv) { + DFTRACER_UTILS_LOGGER_INIT(); + + argparse::ArgumentParser program("dftracer_split", + DFTRACER_UTILS_PACKAGE_VERSION); + program.add_description( + "Split DFTracer traces into equal-sized chunks using explicit pipeline " + "with maximum parallelism"); + + SplitArgParse cli(program); + cli.setup(); + if 
(!cli.parse(argc, argv)) return 1; + + return run_split(&cli).get(); } diff --git a/src/dftracer/utils/binaries/dftracer_stats.cpp b/src/dftracer/utils/binaries/dftracer_stats.cpp index 5b9e322d..c5e05210 100644 --- a/src/dftracer/utils/binaries/dftracer_stats.cpp +++ b/src/dftracer/utils/binaries/dftracer_stats.cpp @@ -1,45 +1,55 @@ #include -#include -#include -#include #include #include -#include +#include #include #include +#include +#include +#include #include #include #include #include #include +#include #include #include #include #include #include +#include #include #include +#include #include #include #include #include +#include +#include #include +#include #include +#include #include -#include #include #include #include +#include #include -#include +#include #include +#include #include #include #include #include #include +#include "common_cli.h" + using namespace dftracer::utils; using namespace dftracer::utils::utilities; using namespace dftracer::utils::utilities::composites::dft; @@ -50,35 +60,297 @@ using common::query::Query; using dftracer::utils::utilities::composites::dft::DFTracerEvent; using dftracer::utils::utilities::fileio::lines::sources:: async_streaming_gz_lines; -using dftracer::utils::utilities::indexer::IndexBuildConfig; -using dftracer::utils::utilities::indexer::IndexBuilderUtility; +using dftracer::utils::utilities::indexer::ChunkStatistics; +using dftracer::utils::utilities::indexer::FileRegistryEntry; +using dftracer::utils::utilities::indexer::has_capability; using dftracer::utils::utilities::indexer::IndexDatabase; +using dftracer::utils::utilities::indexer::IndexFileEntryCapability; +using dftracer::utils::utilities::indexer::RootStatisticsResult; +namespace cli = dftracer::utils::cli; + +struct StatsConfig { + std::string directory; + std::string index_dir; + bool json_output = false; + std::uint64_t top_n = 0; + std::uint64_t top_n_pid_tid = 10; + bool no_auto_index = false; + std::size_t checkpoint_size = 0; + std::size_t 
executor_threads = 0; + std::optional query; + std::vector filter_names; + std::vector filter_cats; + std::vector group_by; + StatisticsQueryType report_type = StatisticsQueryType::SUMMARY; +}; + +class StatsArgParse : public cli::ArgParse { + public: + cli::DirectoryArgs directory{cli::DirMode::DEFAULT_EMPTY}; + cli::FilesArgs files_args{"Trace files to inspect (.pfw, .pfw.gz)"}; + cli::PipelineArgs pipeline; + cli::IndexingArgs indexing; + cli::QueryArgs query_args; + + bool json_output = false; + std::string report_str = "summary"; + std::uint64_t top_n = 0; + std::uint64_t top_n_pid_tid = 10; + bool no_auto_index = false; + std::vector group_by; + std::vector filter_names; + std::vector filter_cats; + + explicit StatsArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + indexing.with_force = false; + indexing.index_dir_help = + "Directory where .dftindex stores are created"; + schema(directory, files_args, pipeline, indexing, query_args); + } + + bool to_config(StatsConfig& config) const { + config.directory = directory.value; + config.index_dir = indexing.index_dir; + config.json_output = json_output; + config.top_n = top_n; + config.top_n_pid_tid = top_n_pid_tid; + config.no_auto_index = no_auto_index; + config.checkpoint_size = indexing.checkpoint_size; + config.executor_threads = pipeline.executor_threads; + config.filter_names = filter_names; + config.filter_cats = filter_cats; + config.group_by = group_by; + + const auto& query_str_val = query_args.query; + if (!query_str_val.empty()) { + auto result = Query::from_string(query_str_val); + if (!result) { + DFTRACER_UTILS_LOG_ERROR("Invalid --query: %s", + result.error().format().c_str()); + return false; + } + config.query = std::move(*result); + } + + config.report_type = parse_report_type_str(report_str); + + if (config.report_type == StatisticsQueryType::DETAILED && + config.group_by.empty()) { + config.group_by.push_back("name"); + } + + return true; + } + + protected: + void register_args() override 
{ + parser().add_argument("--json").help("Output in JSON format").flag(); + + parser() + .add_argument("--report") + .help( + "Report type: summary, categories, names, pid_tids, " + "time_range, " + "duration, top-names, top-categories, detailed") + .default_value("summary"); + + parser() + .add_argument("--top-n") + .help( + "Number of results for top-N queries (0 = show all, " + "default: 0)") + .scan<'d', std::uint64_t>() + .default_value(static_cast(0)); + + parser() + .add_argument("--top-n-pid-tid") + .help("Max PID:TID pairs to display (0 = show all, default: 10)") + .scan<'d', std::uint64_t>() + .default_value(static_cast(10)); + + parser() + .add_argument("--no-auto-index") + .help( + "Disable automatic index building for files missing .dftindex") + .flag(); + + parser() + .add_argument("--group-by") + .help( + "Group detailed statistics by dimension(s): name, cat, pid, " + "tid, fhash, hhash, pid_tid. Multiple values create composite " + "keys.") + .nargs(argparse::nargs_pattern::at_least_one) + .default_value>({}); + + parser() + .add_argument("--filter-names") + .help("Filter by event names") + .nargs(argparse::nargs_pattern::any) + .default_value>({}); + + parser() + .add_argument("--filter-cats") + .help("Filter by event categories") + .nargs(argparse::nargs_pattern::any) + .default_value>({}); + } + + void post_parse() override { + json_output = parser().get("--json"); + report_str = parser().get("--report"); + top_n = parser().get("--top-n"); + top_n_pid_tid = parser().get("--top-n-pid-tid"); + no_auto_index = parser().get("--no-auto-index"); + group_by = parser().get>("--group-by"); + filter_names = parser().get>("--filter-names"); + filter_cats = parser().get>("--filter-cats"); + } + + bool validate() override { + const std::vector valid_reports = { + "summary", "categories", "names", + "pid_tids", "time_range", "duration", + "top-names", "top-categories", "detailed"}; + if (std::find(valid_reports.begin(), valid_reports.end(), report_str) == + 
valid_reports.end()) { + DFTRACER_UTILS_LOG_ERROR("Invalid --report value: %s", + report_str.c_str()); + return false; + } + + const std::vector valid_dims = { + "name", "cat", "pid", "tid", "fhash", "hhash", "pid_tid"}; + for (const auto& dim : group_by) { + if (std::find(valid_dims.begin(), valid_dims.end(), dim) == + valid_dims.end()) { + DFTRACER_UTILS_LOG_ERROR( + "Invalid --group-by dimension: %s. Valid: name, cat, pid, " + "tid, fhash, hhash, pid_tid", + dim.c_str()); + return false; + } + } + + return true; + } + + private: + static StatisticsQueryType parse_report_type_str(const std::string& s) { + if (s == "summary") return StatisticsQueryType::SUMMARY; + if (s == "categories") return StatisticsQueryType::CATEGORIES; + if (s == "names") return StatisticsQueryType::NAMES; + if (s == "pid_tids") return StatisticsQueryType::PID_TIDS; + if (s == "time_range") return StatisticsQueryType::TIME_RANGE; + if (s == "duration") return StatisticsQueryType::DURATION_STATS; + if (s == "top-names") return StatisticsQueryType::TOP_N_NAMES; + if (s == "top-categories") return StatisticsQueryType::TOP_N_CATEGORIES; + if (s == "detailed") return StatisticsQueryType::DETAILED; + return StatisticsQueryType::SUMMARY; + } +}; + +using indexing::FileWorkItem; +using indexing::IndexResolverUtility; +using indexing::ResolvedFile; +using indexing::ResolverInput; +using indexing::ResolverResult; + +struct IndexPartition { + std::vector files_needing_index; + std::vector indexed_entries; + ResolverResult resolver_result; + std::vector> precomputed_failures; + std::vector> precomputed_successes; +}; + +struct AggregateStatsResult { + std::vector> indexed_stats; + TraceStatistics total; + std::size_t successful_count = 0; + std::size_t failed_count = 0; + std::int64_t read_elapsed_ns = 0; + std::unordered_map read_counters; +}; + +struct IndexedRootSnapshot { + std::vector logical_files; + IndexPartition partition; +}; + +static void append_failed_stats_result( + std::vector>& results, 
+ std::size_t file_index, const std::string& file_path, + const std::string& error_message) { + TraceStatistics failed; + failed.file_path = file_path; + failed.success = false; + failed.error_message = error_message; + results.emplace_back(file_index, std::move(failed)); +} + +static void append_empty_indexed_stats_result( + std::vector>& results, + std::size_t file_index, const std::string& file_path, + const std::string& index_path) { + TraceStatistics stats; + stats.file_path = file_path; + stats.index_path = index_path; + stats.success = true; + stats.num_chunks = 0; + results.emplace_back(file_index, std::move(stats)); +} -// Files below this compressed size are scanned directly without building -// `.dftindex` stores. At 8 MB compressed (~160 MB -// uncompressed with typical 20x JSON compression), a file has only a -// handful of 32 MB checkpoints — the indexing overhead exceeds the -// benefit of bloom-filter skip. -static constexpr std::size_t INDEX_SIZE_THRESHOLD = - constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD; - -static StatisticsQueryType parse_report_type_str(const std::string& s) { - if (s == "summary") return StatisticsQueryType::SUMMARY; - if (s == "categories") return StatisticsQueryType::CATEGORIES; - if (s == "names") return StatisticsQueryType::NAMES; - if (s == "pid_tids") return StatisticsQueryType::PID_TIDS; - if (s == "time_range") return StatisticsQueryType::TIME_RANGE; - if (s == "duration") return StatisticsQueryType::DURATION_STATS; - if (s == "top-names") return StatisticsQueryType::TOP_N_NAMES; - if (s == "top-categories") return StatisticsQueryType::TOP_N_CATEGORIES; - if (s == "detailed") return StatisticsQueryType::DETAILED; - return StatisticsQueryType::SUMMARY; +static double ns_to_ms(std::uint64_t ns) { + return static_cast(ns) / 1'000'000.0; +} + +static coro::CoroTask> +process_index_group_root_summary(std::string index_path, + std::size_t expected_indexed_files, + StatisticsQueryType report_type) { + IndexDatabase idx_db( 
+ index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + auto scalar_stats = idx_db.query_root_scalar_stats(); + + if (!scalar_stats || scalar_stats->num_files != expected_indexed_files) { + co_return std::nullopt; + } + + TraceStatistics result; + result.file_path = index_path; + result.index_path = index_path; + result.num_chunks = scalar_stats->num_chunks; + result.merged = scalar_stats->stats; + result.success = true; + + const bool needs_categories = + report_type == StatisticsQueryType::SUMMARY || + report_type == StatisticsQueryType::CATEGORIES || + report_type == StatisticsQueryType::TOP_N_CATEGORIES; + const bool needs_names = report_type == StatisticsQueryType::NAMES || + report_type == StatisticsQueryType::TOP_N_NAMES; + const bool needs_pid_tids = report_type == StatisticsQueryType::SUMMARY || + report_type == StatisticsQueryType::PID_TIDS; + + if (needs_categories) { + idx_db.merge_root_category_counts_into(result.merged); + } + if (needs_names) { + idx_db.merge_root_name_counts_into(result.merged); + } + if (needs_pid_tids) { + idx_db.merge_root_pid_tid_counts_into(result.merged); + } + + co_return result; } using CountPair = std::pair; -static std::vector sorted_by_count_desc( - const std::unordered_map& counts) { +template +static std::vector sorted_by_count_desc(const Map& counts) { std::vector sorted(counts.begin(), counts.end()); std::sort(sorted.begin(), sorted.end(), [](const CountPair& a, const CountPair& b) { @@ -456,214 +728,14 @@ static void print_text_detailed( std::printf("\n"); } -// Direct-scan a small .pfw.gz file without any persisted index store. -// Streams lines via async_streaming_gz_lines, parses each with yyjson, -// and accumulates stats via ChunkStatistics::update_from_event(). 
-static coro::CoroTask direct_scan_trace_statistics( - std::string file_path) { - TraceStatistics result; - result.file_path = file_path; - - try { - auto gen = async_streaming_gz_lines(file_path); - ChunkStatistics stats; - - while (auto line = co_await gen.next()) { - if (line->content.empty()) continue; - - yyjson_doc* doc = yyjson_read_opts( - const_cast(line->content.data()), line->content.size(), - YYJSON_READ_NOFLAG, nullptr, nullptr); - if (!doc) continue; - - yyjson_val* root = yyjson_doc_get_root(doc); - if (root && yyjson_is_obj(root)) { - DFTracerEvent ev; - if (DFTracerEvent::parse(root, ev) && !ev.is_metadata()) { - stats.update_from_event(ev.name, ev.cat, ev.pid, ev.tid, - ev.ts, ev.dur); - } - } - yyjson_doc_free(doc); - } - - result.merged = stats; - result.num_chunks = 1; - result.success = true; - } catch (const std::exception& e) { - result.success = false; - result.error_message = "Failed to scan: " + file_path + ": " + e.what(); - } - - co_return result; -} - -// Direct-scan a small .pfw.gz for the detailed query path. -// Applies name/category filters and group-by dimensions. -static coro::CoroTask direct_scan_detailed_statistics( - std::string file_path, const std::vector* filter_names_ptr, - const std::vector* filter_cats_ptr, - const std::vector* group_by_ptr) { - DetailedStatistics result; - - // Build filter sets from pointer args (pointers are safe: caller's scope - // outlives this coroutine). 
- std::unordered_set name_filter; - std::unordered_set cat_filter; - for (const auto& n : *filter_names_ptr) name_filter.insert(n); - for (const auto& c : *filter_cats_ptr) cat_filter.insert(c); - bool has_name_filter = !name_filter.empty(); - bool has_cat_filter = !cat_filter.empty(); - bool has_grouping = !group_by_ptr->empty(); - - // I/O event names (same list as chunk_detail_scanner_utility.cpp) - static constexpr auto IO_EVENTS = std::to_array( - {"read", "write", "pread", "pwrite", "pread64", "pwrite64", "readv", - "writev"}); - auto is_io = [](std::string_view name) { - return std::find(IO_EVENTS.begin(), IO_EVENTS.end(), name) != - IO_EVENTS.end(); - }; - - try { - auto gen = async_streaming_gz_lines(file_path); - - while (auto line = co_await gen.next()) { - if (line->content.empty()) continue; - - yyjson_doc* doc = yyjson_read_opts( - const_cast(line->content.data()), line->content.size(), - YYJSON_READ_NOFLAG, nullptr, nullptr); - if (!doc) continue; - - yyjson_val* root = yyjson_doc_get_root(doc); - if (root && yyjson_is_obj(root)) { - DFTracerEvent ev; - if (DFTracerEvent::parse(root, ev) && !ev.is_metadata()) { - std::string_view name_sv = ev.name; - std::string_view cat_sv = ev.cat; - - bool passes = true; - if (has_name_filter && - name_filter.find(name_sv) == name_filter.end()) { - passes = false; - } - if (passes && has_cat_filter && - cat_filter.find(cat_sv) == cat_filter.end()) { - passes = false; - } - - if (passes) { - double dur = static_cast(ev.dur); - result.duration.update(dur); - - std::string io_key; - - if (has_grouping) { - // Build group key inline (same logic as - // chunk_detail_scanner_utility.cpp) - constexpr std::size_t KEY_BUF_SIZE = - utilities::common::json::YYJSON_LINE_POOL_SIZE; - char buf[KEY_BUF_SIZE]; - char* p = buf; - char* end = buf + KEY_BUF_SIZE - 1; - - auto append_sv = [&](std::string_view sv) { - std::size_t n = - std::min(sv.size(), - static_cast(end - p)); - std::memcpy(p, sv.data(), n); - p += n; - }; - auto 
append_uint = [&](std::uint64_t v) { - p += std::snprintf( - p, static_cast(end - p + 1), - "%llu", static_cast(v)); - }; - - for (std::size_t i = 0; i < group_by_ptr->size(); - ++i) { - if (p >= end) break; - if (i > 0) *p++ = '|'; - const auto& dim = (*group_by_ptr)[i]; - if (dim == "name") { - append_sv(ev.name); - } else if (dim == "cat") { - append_sv(ev.cat); - } else if (dim == "pid") { - append_uint(ev.pid); - } else if (dim == "tid") { - append_uint(ev.tid); - } else if (dim == "pid_tid") { - append_uint(ev.pid); - if (p < end) *p++ = ':'; - append_uint(ev.tid); - } else if (dim == "fhash") { - if (ev.args.exists()) - append_sv(ev.args["fhash"] - .get()); - } else if (dim == "hhash") { - if (ev.args.exists()) - append_sv(ev.args["hhash"] - .get()); - } - } - std::string key(buf, p); - result.grouped_duration[key].update(dur); - result.group_key_category.emplace( - key, std::string(cat_sv)); - io_key = std::move(key); - } else { - io_key = "__global__"; - } - - if (is_io(name_sv) && ev.args.exists()) { - auto ret_opt = - ev.args["ret"].get_optional(); - if (ret_opt.has_value() && ret_opt.value() > 0) { - double ret = - static_cast(ret_opt.value()); - auto& io = result.grouped_io[io_key]; - io.duration.update(dur); - io.size.update(ret); - if (dur > 0) { - io.bandwidth.update(ret * 1e6 / dur); - } - auto offset_opt = - ev.args["offset"] - .get_optional(); - if (offset_opt.has_value()) { - io.offset.update(static_cast( - offset_opt.value())); - } - } - } - - result.events_scanned++; - } - } - } - yyjson_doc_free(doc); - } - - result.chunks_scanned = 1; - } catch (const std::exception&) { - // Return empty result on open/read failure (matches original behaviour) - } - - co_return result; -} - // Per-chunk scanning coroutine for parallel detailed stats. // Scans a single chunk and merges results into shared file_detailed. 
-static coro::CoroTask scan_chunk_detailed( +static coro::CoroTask> scan_chunk_detailed( std::string file_path, std::string index_path, std::size_t checkpoint_size, std::size_t file_size, std::size_t num_ckpts, std::uint64_t ckpt_idx, const std::vector* filter_names_ptr, const std::vector* filter_cats_ptr, - const std::vector* group_by_ptr, - std::shared_ptr file_detailed, - std::shared_ptr chunk_mutex) { + const std::vector* group_by_ptr) { std::size_t start_byte = 0; std::size_t end_byte = file_size; @@ -689,11 +761,10 @@ static coro::CoroTask scan_chunk_detailed( auto scan_output = co_await scanner.process(scan_input); if (scan_output.success) { - std::lock_guard lock(*chunk_mutex); - file_detailed->merge(scan_output.stats); + co_return scan_output.stats; } - co_return; + co_return std::nullopt; } // Per-file detailed stats coroutine. Spawns parallel chunk scans, @@ -761,67 +832,77 @@ static coro::CoroTask process_file_detailed( } } - // Scan candidate chunks in parallel - auto file_detailed = std::make_shared(); - file_detailed->chunks_skipped = + // Scan candidate chunks in parallel, then merge sequentially per file. 
+ DetailedStatistics file_detailed; + file_detailed.chunks_skipped = total_checkpoints - candidate_checkpoints.size(); - auto chunk_mutex = std::make_shared(); - - co_await fctx.scope([file_path, index_path, checkpoint_size, file_size, - num_ckpts, filter_names_ptr, filter_cats_ptr, - group_by_ptr, file_detailed, chunk_mutex, - candidates = std::move(candidate_checkpoints)]( - CoroScope& chunk_scope) -> coro::CoroTask { - for (auto ckpt_idx : candidates) { - chunk_scope.spawn( - [file_path, index_path, checkpoint_size, file_size, num_ckpts, - ckpt_idx, filter_names_ptr, filter_cats_ptr, group_by_ptr, - file_detailed, - chunk_mutex](CoroScope& /*cctx*/) -> coro::CoroTask { - co_return co_await scan_chunk_detailed( - file_path, index_path, checkpoint_size, file_size, - num_ckpts, ckpt_idx, filter_names_ptr, filter_cats_ptr, - group_by_ptr, file_detailed, chunk_mutex); - }); + std::vector candidates = std::move(candidate_checkpoints); + std::vector> chunk_results( + candidates.size()); + + const auto* file_path_ptr = &file_path; + const auto* index_path_ptr = &index_path; + auto* candidates_ptr = &candidates; + auto* chunk_results_ptr = &chunk_results; + co_await fctx.scope( + [file_path_ptr, index_path_ptr, checkpoint_size, file_size, num_ckpts, + filter_names_ptr, filter_cats_ptr, group_by_ptr, candidates_ptr, + chunk_results_ptr](CoroScope& chunk_scope) -> coro::CoroTask { + for (std::size_t result_idx = 0; + result_idx < candidates_ptr->size(); ++result_idx) { + std::uint64_t ckpt_idx = (*candidates_ptr)[result_idx]; + chunk_scope.spawn( + [file_path_ptr, index_path_ptr, checkpoint_size, file_size, + num_ckpts, ckpt_idx, filter_names_ptr, filter_cats_ptr, + group_by_ptr, chunk_results_ptr, + result_idx](CoroScope& /*cctx*/) -> coro::CoroTask { + (*chunk_results_ptr)[result_idx] = + co_await scan_chunk_detailed( + *file_path_ptr, *index_path_ptr, + checkpoint_size, file_size, num_ckpts, ckpt_idx, + filter_names_ptr, filter_cats_ptr, + group_by_ptr); + co_return; + 
}); + } + co_return; + }); + + for (const auto& chunk_result : chunk_results) { + if (chunk_result.has_value()) { + file_detailed.merge(*chunk_result); } - co_return; - }); + } // Hash resolution (sequential, all chunks done) std::unordered_map hash_resolutions; if (needs_hash_resolution && fs::exists(index_path)) { try { IndexDatabase idx_db(index_path); - auto logical = - utilities::indexer::internal::get_logical_path(file_path); - int file_info_id = idx_db.get_file_info_id(logical); - if (file_info_id >= 0) { - auto resolve_hashes = [&](const std::string& dim) { - for (const auto& [key, _] : - file_detailed->grouped_duration) { - if (hash_resolutions.count(key) == 0) { - auto resolved = - idx_db.query_resolved_by_hash(dim, key); - if (resolved.has_value()) { - hash_resolutions[key] = resolved.value(); - } + auto resolve_hashes = [&](IndexDatabase::HashType hash_type) { + for (const auto& [key, _] : file_detailed.grouped_duration) { + if (hash_resolutions.count(key) == 0) { + auto resolved = idx_db.resolve_hash(hash_type, key); + if (resolved.has_value()) { + hash_resolutions[key] = resolved.value(); } } - for (const auto& [key, _] : file_detailed->grouped_io) { - if (hash_resolutions.count(key) == 0) { - auto resolved = - idx_db.query_resolved_by_hash(dim, key); - if (resolved.has_value()) { - hash_resolutions[key] = resolved.value(); - } + } + for (const auto& [key, _] : file_detailed.grouped_io) { + if (hash_resolutions.count(key) == 0) { + auto resolved = idx_db.resolve_hash(hash_type, key); + if (resolved.has_value()) { + hash_resolutions[key] = resolved.value(); } } - }; + } + }; - for (const auto& dim : *group_by_ptr) { - if (dim == "fhash" || dim == "hhash") { - resolve_hashes(dim); - } + for (const auto& dim : *group_by_ptr) { + if (dim == "fhash") { + resolve_hashes(IndexDatabase::HashType::FILE); + } else if (dim == "hhash") { + resolve_hashes(IndexDatabase::HashType::HOST); } } } catch (const std::exception& e) { @@ -832,7 +913,7 @@ static 
coro::CoroTask process_file_detailed( // Output per-file results if (json_output) { - std::string detail_json = file_detailed->to_json(); + std::string detail_json = file_detailed.to_json(); std::string json_obj = std::string("{\"file_path\": \"") + file_path + "\", \"detailed\": " + detail_json + "}"; std::lock_guard lock(*output_mutex_ptr); @@ -840,14 +921,14 @@ static coro::CoroTask process_file_detailed( } else { std::lock_guard lock(*output_mutex_ptr); print_text_detailed( - file_path, *file_detailed, - file_detailed->chunks_scanned + file_detailed->chunks_skipped, - top_n, hash_resolutions); + file_path, file_detailed, + file_detailed.chunks_scanned + file_detailed.chunks_skipped, top_n, + hash_resolutions); } { std::lock_guard lock(*aggregate_mutex_ptr); - aggregate_detailed_ptr->merge(*file_detailed); + aggregate_detailed_ptr->merge(file_detailed); } co_return; @@ -855,16 +936,12 @@ static coro::CoroTask process_file_detailed( static void run_detailed_query_workers( CoroScope& scope, const std::vector* files_ptr, - const std::vector* small_files_ptr, - std::size_t executor_threads, std::string index_dir, + std::size_t executor_threads, const std::string* index_dir_ptr, std::size_t checkpoint_size, bool needs_hash_resolution, bool json_output, std::size_t top_n, const common::query::Query* qp, const std::vector* fn, const std::vector* fc, const std::vector* gb, DetailedStatistics* ad, std::mutex* am, std::mutex* om, std::vector>* jr) { - auto small_set = std::make_shared>( - small_files_ptr->begin(), small_files_ptr->end()); - auto file_chan = coro::make_channel(executor_threads * 2); scope.spawn([ch = file_chan->producer(), @@ -879,98 +956,122 @@ static void run_detailed_query_workers( }); for (std::size_t w = 0; w < executor_threads; ++w) { - scope.spawn([file_chan, files_ptr, index_dir, checkpoint_size, - needs_hash_resolution, json_output, top_n, small_set, qp, - fn, fc, gb, ad, am, om, + scope.spawn([ch = file_chan->consumer(), files_ptr, index_dir_ptr, 
+ checkpoint_size, needs_hash_resolution, json_output, top_n, + qp, fn, fc, gb, ad, am, om, jr](CoroScope& fctx) -> coro::CoroTask { - while (auto fi_opt = co_await file_chan->receive()) { + while (auto fi_opt = co_await ch.receive()) { std::size_t fi = *fi_opt; - const auto& file_path = (*files_ptr)[fi]; - bool is_small = small_set->count(file_path) > 0; - - if (is_small) { - auto stats = co_await direct_scan_detailed_statistics( - file_path, fn, fc, gb); - { - std::lock_guard lock(*am); - ad->merge(stats); - } - continue; - } + std::string file_path = (*files_ptr)[fi]; co_await process_file_detailed( - fctx, file_path, fi, index_dir, checkpoint_size, - needs_hash_resolution, json_output, top_n, qp, fn, fc, gb, - ad, am, om, jr); + fctx, std::move(file_path), fi, *index_dir_ptr, + checkpoint_size, needs_hash_resolution, json_output, top_n, + qp, fn, fc, gb, ad, am, om, jr); } co_return; }); } } -static coro::CoroTask run_stats(argparse::ArgumentParser& program) { - std::string directory = program.get("--directory"); - std::string index_dir = program.get("--index-dir"); - bool json_output = program.get("--json"); - std::string report_str = program.get("--report"); - std::uint64_t top_n = program.get("--top-n"); - std::uint64_t top_n_pid_tid = program.get("--top-n-pid-tid"); - bool no_auto_index = program.get("--no-auto-index"); - std::size_t checkpoint_size = program.get("--checkpoint-size"); - std::size_t executor_threads = - program.get("--executor-threads"); - auto query_str = program.get("--query"); - auto group_by = program.get>("--group-by"); - - std::optional query; - std::vector filter_names; - std::vector filter_cats; - - if (!query_str.empty()) { - auto result = Query::from_string(query_str); - if (!result) { - DFTRACER_UTILS_LOG_ERROR("Invalid --query: %s", - result.error().format().c_str()); - co_return 1; - } - query = std::move(*result); - } - - auto report_type = parse_report_type_str(report_str); - - // Default --group-by to "name" for detailed 
query so users always see - // per-event breakdowns (not just global I/O aggregates) - if (report_type == StatisticsQueryType::DETAILED && group_by.empty()) { - group_by.push_back("name"); - } +static void print_text_query_output(const TraceStatistics& stats, + const StatisticsQueryOutput& output) { + std::printf("========================================\n"); + std::printf("File: %s\n", stats.file_path.c_str()); + std::printf("========================================\n"); + std::printf(" Chunks: %llu\n", (unsigned long long)stats.num_chunks); + std::printf(" Events Scanned: %llu\n", + (unsigned long long)output.total_events); + + switch ( + output.query_type_name == "categories" ? StatisticsQueryType::CATEGORIES + : output.query_type_name == "names" ? StatisticsQueryType::NAMES + : output.query_type_name == "pid_tids" ? StatisticsQueryType::PID_TIDS + : output.query_type_name == "time_range" + ? StatisticsQueryType::TIME_RANGE + : output.query_type_name == "duration_stats" + ? StatisticsQueryType::DURATION_STATS + : output.query_type_name == "top_n_names" + ? StatisticsQueryType::TOP_N_NAMES + : output.query_type_name == "top_n_categories" + ? 
StatisticsQueryType::TOP_N_CATEGORIES + : StatisticsQueryType::SUMMARY) { + case StatisticsQueryType::CATEGORIES: + case StatisticsQueryType::TOP_N_CATEGORIES: + std::printf("\n Categories (%zu):\n", output.results.size()); + for (const auto& [name, count] : output.results) { + std::printf(" %-40s %llu\n", name.c_str(), + (unsigned long long)count); + } + break; + + case StatisticsQueryType::NAMES: + case StatisticsQueryType::TOP_N_NAMES: + std::printf("\n Names (%zu):\n", output.results.size()); + for (const auto& [name, count] : output.results) { + std::printf(" %-40s %llu\n", name.c_str(), + (unsigned long long)count); + } + break; - // Validate group-by dimensions - const std::vector valid_dims = { - "name", "cat", "pid", "tid", "fhash", "hhash", "pid_tid"}; - for (const auto& dim : group_by) { - if (std::find(valid_dims.begin(), valid_dims.end(), dim) == - valid_dims.end()) { - DFTRACER_UTILS_LOG_ERROR( - "Invalid --group-by dimension: %s. Valid: name, cat, pid, " - "tid, fhash, hhash, pid_tid", - dim.c_str()); - co_return 1; - } + case StatisticsQueryType::PID_TIDS: + std::printf("\n Process/Thread Pairs (%zu):\n", + output.results.size()); + for (const auto& [name, count] : output.results) { + std::printf(" %-40s %llu\n", name.c_str(), + (unsigned long long)count); + } + break; + + case StatisticsQueryType::TIME_RANGE: + std::printf("\n Time Span: %.6f seconds\n", + output.time_span_seconds); + std::printf(" Min Timestamp: %llu us\n", + (unsigned long long)output.min_timestamp_us); + std::printf(" Max Timestamp: %llu us\n", + (unsigned long long)output.max_timestamp_us); + break; + + case StatisticsQueryType::DURATION_STATS: + std::printf("\n Duration (all events):\n"); + std::printf(" Count: %llu Mean: %.1f us Stddev: %.1f us\n", + (unsigned long long)output.duration_count, + output.duration_mean_us, output.duration_stddev_us); + std::printf(" Min: %llu us Max: %llu us\n", + (unsigned long long)output.duration_min_us, + (unsigned long 
long)output.duration_max_us); + break; + + case StatisticsQueryType::SUMMARY: + case StatisticsQueryType::DETAILED: + break; } +} - // Collect files - std::vector files; +static coro::CoroTask> collect_files( + CoroScope& ctx, const std::vector& cli_files, + std::string directory) { if (!directory.empty()) { if (!fs::exists(directory)) { DFTRACER_UTILS_LOG_ERROR("Directory does not exist: %s", directory.c_str()); - co_return 1; + co_return std::vector{}; } - PatternDirectoryScannerUtility scanner; + auto scanner = std::make_shared(); PatternDirectoryScannerUtilityInput scan_input{ - directory, {".pfw", ".pfw.gz"}, false}; - auto matched = co_await scanner.process(scan_input); - + directory, {".pfw", ".pfw.gz"}, false, false}; + utilities::behaviors::BehaviorChain> + chain; + utilities::behaviors::UtilityExecutor< + PatternDirectoryScannerUtilityInput, + std::vector, utilities::tags::Parallelizable, + utilities::tags::NeedsContext> + executor(scanner, std::move(chain)); + auto matched = co_await executor.execute_with_context(ctx, scan_input); + + std::vector files; + files.reserve(matched.size()); for (const auto& entry : matched) { files.push_back(entry.path.string()); } @@ -978,357 +1079,469 @@ static coro::CoroTask run_stats(argparse::ArgumentParser& program) { if (files.empty()) { DFTRACER_UTILS_LOG_ERROR("No .pfw or .pfw.gz files found in: %s", directory.c_str()); - co_return 1; } - } else { - files = program.get>("--files"); + co_return files; + } - if (files.empty()) { - DFTRACER_UTILS_LOG_ERROR( - "%s", "No files or directory specified. Use --help for usage."); - std::cerr << program; - co_return 1; - } + if (cli_files.empty()) { + DFTRACER_UTILS_LOG_ERROR( + "%s", "No files or directory specified. Use --help for usage."); } + co_return cli_files; +} - // Partition files: large files get indexed, small files are scanned - // directly to avoid creating sidecar files on metadata-sensitive - // filesystems (e.g. Lustre). 
- std::vector files_needing_index; - std::vector small_files; - for (const auto& file_path : files) { - std::string index_path = - internal::determine_index_path(file_path, index_dir); - if (fs::exists(index_path)) { - try { - IndexDatabase db(index_path); - auto logical = - utilities::indexer::internal::get_logical_path(file_path); - int fid = db.get_file_info_id(logical); - if (fid >= 0 && db.has_bloom_data(fid)) continue; - } catch (...) { - } - } - std::error_code ec; - auto fsize = fs::file_size(file_path, ec); - if (ec || fsize == 0) { - continue; // skip unreadable or empty files - } - if (fsize < INDEX_SIZE_THRESHOLD) { - small_files.push_back(file_path); +static std::unique_ptr load_index_root_snapshot_impl( + const std::string& index_path) { + auto snapshot = std::make_unique(); + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + + auto registry = db.query_all_file_registry(); + + snapshot->logical_files.reserve(registry.size()); + snapshot->partition.indexed_entries.reserve(registry.size()); + + std::size_t file_index = 0; + for (auto& [logical_path, reg] : registry) { + snapshot->logical_files.push_back(logical_path); + const bool has_summary = has_capability( + reg.capabilities, IndexFileEntryCapability::FILE_SUMMARY); + if (!has_summary) { + append_failed_stats_result( + snapshot->partition.precomputed_failures, file_index, + logical_path, + "File registry entry exists but no file summary data was " + "found in the shared index"); } else { - files_needing_index.push_back(file_path); + snapshot->partition.indexed_entries.push_back(ResolvedFile{ + file_index, logical_path, reg.file_id, reg.capabilities}); } + ++file_index; } + snapshot->partition.resolver_result.index_path = index_path; + return snapshot; +} - if (!small_files.empty()) { - std::printf( - "Skipping index for %zu small file(s) (< %zu bytes " - "compressed); will scan directly.\n", - small_files.size(), INDEX_SIZE_THRESHOLD); - } +static 
coro::CoroTask> +load_index_root_snapshot(std::string index_path) { + co_return load_index_root_snapshot_impl(index_path); +} - if (!files_needing_index.empty()) { - if (no_auto_index) { - DFTRACER_UTILS_LOG_ERROR( - "Missing index for %zu file(s) and --no-auto-index is " - "set. Run dftracer_index first.", - files_needing_index.size()); - for (const auto& f : files_needing_index) { - std::fprintf(stderr, " Missing index: %s\n", f.c_str()); - } - co_return 1; - } +static std::unique_ptr load_root_aggregate_impl( + const std::string& index_path, StatisticsQueryType report_type) { + IndexDatabase idx_db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + auto scalar_stats = idx_db.query_root_scalar_stats(); + if (!scalar_stats) { + return nullptr; + } - std::printf("Auto-building index for %zu file(s)...\n", - files_needing_index.size()); - - auto pipeline_config = PipelineConfig() - .with_name("DFTracer Stats Auto-Indexer") - .with_compute_threads(executor_threads) - .with_watchdog(false); - - Pipeline pipeline(pipeline_config); - - std::atomic indexed_count{0}; - std::atomic failed_count{0}; - - auto index_task = make_task( - [&](CoroScope& ctx) -> coro::CoroTask { - auto file_chan = - coro::make_channel(executor_threads * 2); - - co_await ctx.scope([&](CoroScope& scope) - -> coro::CoroTask { - auto* files_ptr = &files_needing_index; - scope.spawn( - [ch = file_chan->producer(), files_ptr]( - CoroScope&) mutable -> coro::CoroTask { - auto guard = ch.guard(); - for (const auto& f : *files_ptr) { - if (!co_await ch.send(f)) { - co_return; - } - } - co_return; - }); - - auto* indexed_count_ptr = &indexed_count; - auto* failed_count_ptr = &failed_count; - std::string index_dir_copy = index_dir; - std::size_t ckpt_size = checkpoint_size; - for (std::size_t w = 0; w < executor_threads; ++w) { - scope.spawn([file_chan, index_dir_copy, ckpt_size, - indexed_count_ptr, failed_count_ptr]( - CoroScope&) -> coro::CoroTask { - while (auto file_path = 
- co_await file_chan->receive()) { - try { - IndexBuilderUtility builder; - auto config = - IndexBuildConfig::for_file(*file_path) - .with_index_dir(index_dir_copy) - .with_checkpoint_size(ckpt_size) - .with_bloom(true) - .with_index_threshold(0); - auto result = - co_await builder.process(config); - - if (result.success) { - (*indexed_count_ptr)++; - } else { - (*failed_count_ptr)++; - DFTRACER_UTILS_LOG_ERROR( - "Auto-indexing failed " - "for %s: %s", - file_path->c_str(), - result.error_message.c_str()); - } - } catch (const std::exception& e) { - (*failed_count_ptr)++; - DFTRACER_UTILS_LOG_ERROR( - "Auto-indexing exception " - "for %s: %s", - file_path->c_str(), e.what()); - } - } - co_return; - }); - } - co_return; - }); + auto agg = std::make_unique(); + agg->total.success = true; + agg->total.file_path = index_path; + agg->total.index_path = index_path; + agg->total.num_chunks = scalar_stats->num_chunks; + agg->total.merged = scalar_stats->stats; + + const bool needs_categories = + report_type == StatisticsQueryType::SUMMARY || + report_type == StatisticsQueryType::CATEGORIES || + report_type == StatisticsQueryType::TOP_N_CATEGORIES; + const bool needs_names = report_type == StatisticsQueryType::NAMES || + report_type == StatisticsQueryType::TOP_N_NAMES; + const bool needs_pid_tids = report_type == StatisticsQueryType::SUMMARY || + report_type == StatisticsQueryType::PID_TIDS; + + if (needs_categories) { + idx_db.merge_root_category_counts_into(agg->total.merged); + } + if (needs_names) { + idx_db.merge_root_name_counts_into(agg->total.merged); + } + if (needs_pid_tids) { + idx_db.merge_root_pid_tid_counts_into(agg->total.merged); + } - co_return; - }, - "AutoIndex"); + agg->successful_count = static_cast(scalar_stats->num_files); + return agg; +} - pipeline.set_source(index_task); - pipeline.set_destination(index_task); - pipeline.execute(); +static coro::CoroTask> +load_root_aggregate_result(std::string index_path, + StatisticsQueryType report_type) { + 
co_return load_root_aggregate_impl(index_path, report_type); +} - std::printf("Auto-indexing complete: %zu indexed, %zu failed\n", - indexed_count.load(), failed_count.load()); +static IndexPartition build_partition(ResolverResult result) { + IndexPartition partition; + partition.files_needing_index = std::move(result.needs_checkpoint); + partition.indexed_entries = std::move(result.cached); + partition.resolver_result = std::move(result); + + // Files that have checkpoints but no bloom get empty stats + for (const auto& entry : partition.resolver_result.needs_bloom) { + append_empty_indexed_stats_result(partition.precomputed_successes, + entry.file_index, entry.file_path, + partition.resolver_result.index_path); } - auto start_time = std::chrono::high_resolution_clock::now(); + return partition; +} - // Detailed query path: scan chunks on-demand with bloom pre-filtering - if (report_type == StatisticsQueryType::DETAILED) { - bool needs_hash_resolution = false; - for (const auto& dim : group_by) { - if (dim == "fhash" || dim == "hhash") { - needs_hash_resolution = true; - break; +static coro::CoroTask resolve_index_state( + std::vector files, std::string index_dir, + StatisticsQueryType report_type) { + IndexResolverUtility resolver; + ResolverInput input; + input.files = std::move(files); + input.index_dir = std::move(index_dir); + input.require_bloom = report_type == StatisticsQueryType::DETAILED; + auto result = co_await resolver.process(input); + co_return build_partition(std::move(result)); +} + +static coro::CoroTask run_batch_build( + CoroScope* ctx, std::shared_ptr config) { + co_return co_await indexer::IndexBatchBuilderUtility::process( + ctx, std::move(config)); +} + +static coro::CoroTask auto_index_files(CoroScope& ctx, + IndexPartition& partition, + const std::string& index_dir, + std::size_t checkpoint_size, + std::size_t executor_threads) { + auto index_path = internal::determine_index_path( + partition.files_needing_index.front().file_path, 
index_dir); + dftracer::utils::rocksdb::RocksDBManager::instance().reset(index_path); + + std::printf("Auto-building index for %zu file(s)...\n", + partition.files_needing_index.size()); + + const bool all_gzip = std::all_of( + partition.files_needing_index.begin(), + partition.files_needing_index.end(), [](const FileWorkItem& item) { + return item.file_path.ends_with(".gz"); + }); + + { + auto batch_config = std::make_shared(); + batch_config->file_paths.reserve(partition.files_needing_index.size()); + for (const auto& item : partition.files_needing_index) { + batch_config->file_paths.push_back(item.file_path); + } + batch_config->index_dir = index_dir; + batch_config->checkpoint_size = checkpoint_size; + batch_config->parallelism = executor_threads; + + batch_config->use_batch_write = all_gzip; + batch_config->rebuild_root_summaries = all_gzip; + + auto batch_result = + co_await run_batch_build(&ctx, std::move(batch_config)); + + DFTRACER_UTILS_LOG_INFO( + "Shared root auto-index metrics: root=%s files=%zu " + "enqueued=%zu parsed=%zu written=%zu " + "parse=%.2fms writer_db=%.2fms", + index_path.c_str(), partition.files_needing_index.size(), + batch_result.metrics.files_enqueued, + batch_result.metrics.files_parsed, + batch_result.metrics.files_written, + ns_to_ms(batch_result.metrics.parse_ns), + ns_to_ms(batch_result.metrics.write_ns)); + + for (const auto& result : batch_result.results) { + if (!result.success && !result.error_message.empty()) { + DFTRACER_UTILS_LOG_ERROR("Auto-indexing failed for %s: %s", + result.file_path.c_str(), + result.error_message.c_str()); } } - DetailedStatistics aggregate_detailed; - std::mutex aggregate_mutex; - std::mutex output_mutex; - std::vector> json_results; - - { - auto pipeline_config = PipelineConfig() - .with_name("DFTracer Stats Detailed") - .with_compute_threads(executor_threads) - .with_watchdog(false); - - Pipeline pipeline(pipeline_config); - - auto stats_task = make_task( - [&](CoroScope& ctx) -> coro::CoroTask { 
- co_await ctx.scope( - [&](CoroScope& scope) -> coro::CoroTask { - run_detailed_query_workers( - scope, &files, &small_files, executor_threads, - index_dir, checkpoint_size, - needs_hash_resolution, json_output, top_n, - query ? &*query : nullptr, &filter_names, - &filter_cats, &group_by, &aggregate_detailed, - &aggregate_mutex, &output_mutex, &json_results); - co_return; - }); - co_return; - }, - "StatsDetailed"); - - pipeline.set_source(stats_task); - pipeline.set_destination(stats_task); - pipeline.execute(); - } + std::printf("Auto-indexing complete: %zu indexed, %zu failed\n", + batch_result.indexed, batch_result.failed); + } - auto end_time = std::chrono::high_resolution_clock::now(); - std::chrono::duration duration = - end_time - start_time; - - if (json_output) { - std::printf("[\n"); - std::sort( - json_results.begin(), json_results.end(), - [](const auto& a, const auto& b) { return a.first < b.first; }); - for (std::size_t i = 0; i < json_results.size(); ++i) { - std::printf("%s%s", json_results[i].second.c_str(), - i + 1 < json_results.size() ? 
",\n" : "\n"); - } - std::printf("]\n"); + // Re-resolve newly indexed files + std::vector newly_indexed; + newly_indexed.reserve(partition.files_needing_index.size()); + for (const auto& item : partition.files_needing_index) { + newly_indexed.push_back(item.file_path); + } + + IndexResolverUtility resolver; + ResolverInput refresh_input; + refresh_input.files = std::move(newly_indexed); + refresh_input.index_dir = index_dir; + refresh_input.require_checkpoints = true; + + auto refresh_result = co_await resolver.process(refresh_input); + + // Add successfully indexed files + for (auto& entry : refresh_result.cached) { + bool has_bloom = indexer::has_capability( + entry.capabilities, indexer::IndexFileEntryCapability::BLOOM); + if (has_bloom) { + partition.indexed_entries.push_back(std::move(entry)); } else { - std::printf("==========================================\n"); - std::printf("Consolidated Detailed (%zu files)\n", files.size()); - std::printf("==========================================\n"); - std::unordered_map no_resolutions; - print_text_detailed(directory, aggregate_detailed, - aggregate_detailed.chunks_scanned + - aggregate_detailed.chunks_skipped, - top_n, no_resolutions); - std::printf(" Processing Time: %.2f ms\n", duration.count()); - std::printf("==========================================\n"); + append_empty_indexed_stats_result(partition.precomputed_successes, + entry.file_index, entry.file_path, + refresh_result.index_path); } + } - co_return 0; + // Handle files that still need checkpoints (failed to index) + for (const auto& item : refresh_result.needs_checkpoint) { + append_failed_stats_result( + partition.precomputed_failures, item.file_index, item.file_path, + "Auto-index completed but no readable file summary " + "data was found in the shared index"); } - // Non-detailed path: aggregate statistics per file in parallel - std::vector> indexed_stats; - std::mutex stats_mutex; + // Update resolver result + partition.resolver_result.index_path = 
refresh_result.index_path; - { - auto pipeline_config = PipelineConfig() - .with_name("DFTracer Stats") - .with_compute_threads(executor_threads) - .with_watchdog(false); - - Pipeline pipeline(pipeline_config); - - auto stats_task = make_task( - [&](CoroScope& ctx) -> coro::CoroTask { - co_await ctx.scope([&](CoroScope& scope) - -> coro::CoroTask { - auto* indexed_stats_ptr = &indexed_stats; - auto* stats_mutex_ptr = &stats_mutex; - auto* files_ptr = &files; - - // Build set of small files for O(1) lookup. - // shared_ptr so workers keep it alive after this - // scope lambda's coroutine frame is destroyed. - auto small_set = - std::make_shared>( - small_files.begin(), small_files.end()); - - auto file_chan = - coro::make_channel(executor_threads * 2); - - // Producer: push file indices - scope.spawn( - [ch = file_chan->producer(), files_ptr]( - CoroScope&) mutable -> coro::CoroTask { - auto guard = ch.guard(); - for (std::size_t fi = 0; fi < files_ptr->size(); - ++fi) { - if (!co_await ch.send(fi)) { - co_return; - } - } - co_return; - }); - - // Workers: N coroutines, each processing one file at a time - for (std::size_t w = 0; w < executor_threads; ++w) { - scope.spawn([file_chan, files_ptr, index_dir, small_set, - indexed_stats_ptr, stats_mutex_ptr]( - CoroScope&) -> coro::CoroTask { - while (auto fi_opt = - co_await file_chan->receive()) { - std::size_t fi = *fi_opt; - const auto& file_path = (*files_ptr)[fi]; - bool is_small = small_set->count(file_path) > 0; - - TraceStatistics result; - if (is_small) { - result = - co_await direct_scan_trace_statistics( - file_path); - } else { - StatisticsAggregatorInput agg_input; - agg_input.file_path = file_path; - agg_input.index_dir = index_dir; - - StatisticsAggregatorUtility aggregator; - result = - co_await aggregator.process(agg_input); - } - - std::lock_guard lock( - *stats_mutex_ptr); - indexed_stats_ptr->emplace_back( - fi, std::move(result)); - } - co_return; - }); - } - co_return; - }); + co_return; +} + 
+static coro::CoroTask run_detailed_stats( + CoroScope& ctx, const StatsConfig* config_ptr, + const std::vector* files_ptr) { + auto start_time = std::chrono::high_resolution_clock::now(); + + bool needs_hash_resolution = false; + for (const auto& dim : config_ptr->group_by) { + if (dim == "fhash" || dim == "hhash") { + needs_hash_resolution = true; + break; + } + } + + auto aggregate_detailed = std::make_unique(); + std::mutex aggregate_mutex; + std::mutex output_mutex; + auto json_results = + std::make_unique>>(); + + auto* filter_names_ptr = &config_ptr->filter_names; + auto* filter_cats_ptr = &config_ptr->filter_cats; + auto* group_by_ptr = &config_ptr->group_by; + auto* aggregate_detailed_ptr = aggregate_detailed.get(); + auto* aggregate_mutex_ptr = &aggregate_mutex; + auto* output_mutex_ptr = &output_mutex; + auto* json_results_ptr = json_results.get(); + auto* query_ptr = config_ptr->query ? &*config_ptr->query : nullptr; + const auto* index_dir_for_detailed_ptr = &config_ptr->index_dir; + std::size_t checkpoint_size_for_detailed = config_ptr->checkpoint_size; + std::size_t executor_threads_for_detailed = config_ptr->executor_threads; + bool needs_hash_resolution_for_detailed = needs_hash_resolution; + bool json_output_for_detailed = config_ptr->json_output; + std::size_t top_n_for_detailed = config_ptr->top_n; + + co_await ctx.scope( + [files_ptr, executor_threads_for_detailed, index_dir_for_detailed_ptr, + checkpoint_size_for_detailed, needs_hash_resolution_for_detailed, + json_output_for_detailed, top_n_for_detailed, query_ptr, + filter_names_ptr, filter_cats_ptr, group_by_ptr, + aggregate_detailed_ptr, aggregate_mutex_ptr, output_mutex_ptr, + json_results_ptr](CoroScope& scope) -> coro::CoroTask { + run_detailed_query_workers( + scope, files_ptr, executor_threads_for_detailed, + index_dir_for_detailed_ptr, checkpoint_size_for_detailed, + needs_hash_resolution_for_detailed, json_output_for_detailed, + top_n_for_detailed, query_ptr, filter_names_ptr, + 
filter_cats_ptr, group_by_ptr, aggregate_detailed_ptr, + aggregate_mutex_ptr, output_mutex_ptr, json_results_ptr); + co_return; + }); + + auto end_time = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration = end_time - start_time; + if (config_ptr->json_output) { + std::printf("[\n"); + std::sort( + json_results->begin(), json_results->end(), + [](const auto& a, const auto& b) { return a.first < b.first; }); + for (std::size_t i = 0; i < json_results->size(); ++i) { + std::printf("%s%s", (*json_results)[i].second.c_str(), + i + 1 < json_results->size() ? ",\n" : "\n"); + } + std::printf("]\n"); + } else { + std::printf("==========================================\n"); + std::printf("Consolidated Detailed (%zu files)\n", files_ptr->size()); + std::printf("==========================================\n"); + std::unordered_map no_resolutions; + print_text_detailed(config_ptr->directory, *aggregate_detailed, + aggregate_detailed->chunks_scanned + + aggregate_detailed->chunks_skipped, + config_ptr->top_n, no_resolutions); + std::printf(" Processing Time: %.2f ms\n", duration.count()); + std::printf("==========================================\n"); + } + + co_return 0; +} + +static coro::CoroTask process_index_group( + const std::string* index_path_ptr, + const std::vector* group_ptr, + std::vector>* indexed_stats_ptr, + std::mutex* stats_mutex_ptr, std::size_t expected_indexed_files, + bool needs_per_file_results, TraceStatistics* total_ptr, + std::mutex* total_mutex_ptr, std::atomic* successful_ptr, + std::atomic* failed_ptr, + StatisticsQueryType report_type_for_reader, Timer* metrics_timer_ptr) { + try { + metrics_timer_ptr->increment("root_summary_attempts"); + if (!needs_per_file_results) { + auto root_summary = co_await process_index_group_root_summary( + *index_path_ptr, expected_indexed_files, + report_type_for_reader); + if (root_summary && root_summary->success) { + metrics_timer_ptr->increment("root_summary_hits"); + std::lock_guard 
lock(*total_mutex_ptr); + total_ptr->merged.merge_from(root_summary->merged); + total_ptr->num_chunks += root_summary->num_chunks; + successful_ptr->fetch_add(group_ptr->size(), + std::memory_order_relaxed); co_return; - }, - "StatsProcess"); + } + } + metrics_timer_ptr->increment("root_summary_misses"); + metrics_timer_ptr->increment("fallback_groups"); + metrics_timer_ptr->increment("fallback_files", group_ptr->size()); + + SharedIndexStatisticsReader reader; + auto batch_rows = co_await reader.query(*index_path_ptr, *group_ptr, + report_type_for_reader); + auto callback = [indexed_stats_ptr, stats_mutex_ptr, + needs_per_file_results, total_ptr, total_mutex_ptr, + successful_ptr, failed_ptr](std::size_t file_index, + TraceStatistics&& stats) { + if (needs_per_file_results) { + std::lock_guard lock(*stats_mutex_ptr); + indexed_stats_ptr->emplace_back(file_index, std::move(stats)); + return; + } - pipeline.set_source(stats_task); - pipeline.set_destination(stats_task); - pipeline.execute(); + if (stats.success) { + std::lock_guard lock(*total_mutex_ptr); + total_ptr->merged.merge_from(stats.merged); + total_ptr->num_chunks += stats.num_chunks; + successful_ptr->fetch_add(1, std::memory_order_relaxed); + } else { + failed_ptr->fetch_add(1, std::memory_order_relaxed); + } + }; + SharedIndexStatisticsReader::process_batch_results(batch_rows, + callback); + } catch (const std::exception& e) { + DFTRACER_UTILS_LOG_ERROR("Indexed stats batch failed for %s: %s", + index_path_ptr->c_str(), e.what()); + if (needs_per_file_results) { + std::lock_guard lock(*stats_mutex_ptr); + for (const auto& entry : *group_ptr) { + TraceStatistics failed_result; + failed_result.file_path = entry.file_path; + failed_result.success = false; + failed_result.error_message = e.what(); + indexed_stats_ptr->emplace_back(entry.file_index, + std::move(failed_result)); + } + } + failed_ptr->fetch_add(group_ptr->size(), std::memory_order_relaxed); } + co_return; +} - // Restore original file order - 
std::sort(indexed_stats.begin(), indexed_stats.end(), - [](const auto& a, const auto& b) { return a.first < b.first; }); +static coro::CoroTask run_aggregate_stats( + CoroScope& /*ctx*/, const StatsConfig* config_ptr, + std::unique_ptr partition_ptr) { + auto& partition = *partition_ptr; + auto agg = std::make_unique(); + agg->total.success = true; + agg->total.file_path = config_ptr->directory; + + const bool needs_per_file_results = config_ptr->json_output; + std::atomic successful{0}; + std::atomic failed{0}; + + if (!partition.precomputed_failures.empty()) { + if (needs_per_file_results) { + agg->indexed_stats.insert( + agg->indexed_stats.end(), + std::make_move_iterator(partition.precomputed_failures.begin()), + std::make_move_iterator(partition.precomputed_failures.end())); + } + failed.fetch_add(partition.precomputed_failures.size(), + std::memory_order_relaxed); + } + if (!partition.precomputed_successes.empty()) { + if (needs_per_file_results) { + agg->indexed_stats.insert( + agg->indexed_stats.end(), + std::make_move_iterator( + partition.precomputed_successes.begin()), + std::make_move_iterator(partition.precomputed_successes.end())); + } + successful.fetch_add(partition.precomputed_successes.size(), + std::memory_order_relaxed); + } - std::vector all_stats; - all_stats.reserve(indexed_stats.size()); - for (auto& [_, stats] : indexed_stats) { - all_stats.push_back(std::move(stats)); + std::mutex stats_mutex; + std::mutex total_mutex; + Timer read_timer("stats_read_path", true, false); + + auto* indexed_stats_ptr = &agg->indexed_stats; + auto* stats_mutex_ptr = &stats_mutex; + auto* indexed_entries_ptr = &partition.indexed_entries; + const auto* index_path_ptr = &partition.resolver_result.index_path; + auto* total_ptr = &agg->total; + auto* total_mutex_ptr = &total_mutex; + auto* successful_ptr = &successful; + auto* failed_ptr = &failed; + auto* read_timer_ptr = &read_timer; + StatisticsQueryType report_type_for_reader = config_ptr->report_type; + + if 
(!indexed_entries_ptr->empty()) { + const auto expected_indexed_files = indexed_entries_ptr->size(); + co_await process_index_group( + index_path_ptr, indexed_entries_ptr, indexed_stats_ptr, + stats_mutex_ptr, expected_indexed_files, needs_per_file_results, + total_ptr, total_mutex_ptr, successful_ptr, failed_ptr, + report_type_for_reader, read_timer_ptr); } + read_timer.stop(); - // Merge all per-file stats into a single consolidated result - TraceStatistics total; - total.success = true; - total.file_path = directory; - std::size_t successful = 0; - std::size_t failed = 0; - - for (const auto& stats : all_stats) { - if (stats.success) { - total.merged.merge_from(stats.merged); - total.num_chunks += stats.num_chunks; - successful++; - } else { - failed++; + agg->successful_count = successful.load(std::memory_order_relaxed); + agg->failed_count = failed.load(std::memory_order_relaxed); + agg->read_elapsed_ns = read_timer.elapsed(); + agg->read_counters = read_timer.counters(); + co_return std::move(*agg); +} + +static coro::CoroTask output_aggregate_stats( + const StatsConfig* config_ptr, const std::vector* files_ptr, + std::unique_ptr agg, Timer overall) { + std::vector all_stats; + if (config_ptr->json_output) { + std::sort( + agg->indexed_stats.begin(), agg->indexed_stats.end(), + [](const auto& a, const auto& b) { return a.first < b.first; }); + all_stats.reserve(agg->indexed_stats.size()); + for (auto& [_, stats] : agg->indexed_stats) { + all_stats.push_back(std::move(stats)); } } - auto end_time = std::chrono::high_resolution_clock::now(); - std::chrono::duration duration = end_time - start_time; + double duration_ms = static_cast(overall.elapsed()) / 1e6; - if (json_output) { - // For JSON, output per-file results + if (config_ptr->json_output) { StatisticsQueryUtility query_util; std::printf("[\n"); for (std::size_t i = 0; i < all_stats.size(); ++i) { @@ -1340,33 +1553,178 @@ static coro::CoroTask run_stats(argparse::ArgumentParser& program) { } 
StatisticsQueryInput qi; qi.stats = stats; - qi.query_type = report_type; - qi.top_n = top_n; + qi.query_type = config_ptr->report_type; + qi.top_n = config_ptr->top_n; auto output = co_await query_util.process(qi); std::printf("%s%s", output.to_json().c_str(), i + 1 < all_stats.size() ? ",\n" : "\n"); } std::printf("]\n"); } else { - // Text output: print consolidated summary + const auto displayed_files = + files_ptr->empty() ? agg->successful_count + agg->failed_count + : files_ptr->size(); std::printf("==========================================\n"); std::printf("Consolidated (%zu files, %zu successful, %zu failed)\n", - files.size(), successful, failed); + displayed_files, agg->successful_count, agg->failed_count); std::printf("==========================================\n"); - auto detailed = to_detailed(total); - std::unordered_map no_resolutions; - print_text_detailed(total.file_path, detailed, total.num_chunks, top_n, - no_resolutions, &total, top_n_pid_tid); - std::printf(" Processing Time: %.2f ms\n", duration.count()); + if (config_ptr->report_type == StatisticsQueryType::SUMMARY) { + auto detailed = to_detailed(agg->total); + std::unordered_map no_resolutions; + print_text_detailed(agg->total.file_path, detailed, + agg->total.num_chunks, config_ptr->top_n, + no_resolutions, &agg->total, + config_ptr->top_n_pid_tid); + } else { + StatisticsQueryUtility query_util; + StatisticsQueryInput qi; + qi.stats = agg->total; + qi.query_type = config_ptr->report_type; + qi.top_n = config_ptr->top_n; + auto output = co_await query_util.process(qi); + print_text_query_output(agg->total, output); + } + auto counter = [&agg](const char* key) -> std::uint64_t { + auto it = agg->read_counters.find(key); + return it == agg->read_counters.end() ? 
0 : it->second; + }; + DFTRACER_UTILS_LOG_INFO( + "Stats read metrics: report=%d elapsed=%.2fms " + "root_attempts=%" PRIu64 " root_hits=%" PRIu64 + " root_misses=%" PRIu64 " fallback_groups=%" PRIu64 + " fallback_files=%" PRIu64, + static_cast(config_ptr->report_type), + static_cast(agg->read_elapsed_ns) / 1'000'000.0, + counter("root_summary_attempts"), counter("root_summary_hits"), + counter("root_summary_misses"), counter("fallback_groups"), + counter("fallback_files")); + std::printf(" Processing Time: %.2f ms\n", duration_ms); std::printf("==========================================\n"); } co_return 0; } +static coro::CoroTask run_stats(CoroScope& ctx, + const StatsArgParse* args) { + StatsConfig config; + if (!args->to_config(config)) { + co_return 1; + } + + Timer stages_storage("dftracer_stats"); + Timer* stages = args->pipeline.time_profiling ? &stages_storage : nullptr; + Timer overall(true); + + std::vector files; + IndexPartition partition; + bool used_index_source_of_truth = false; + std::unique_ptr direct_root_aggregate; + + { + ScopedTimer _t(stages, "collect_and_classify"); + if (!config.directory.empty() && + config.report_type != StatisticsQueryType::DETAILED) { + auto trusted_index_path = internal::determine_index_path( + config.directory, config.index_dir); + const bool trusted_index_exists = fs::exists(trusted_index_path); + DFTRACER_UTILS_LOG_DEBUG( + "Stats direct-index decision: directory=%s index_path=%s " + "exists=%d json=%d report=%d", + config.directory.c_str(), trusted_index_path.c_str(), + trusted_index_exists ? 1 : 0, config.json_output ? 
1 : 0, + static_cast(config.report_type)); + if (trusted_index_exists) { + { + ScopedTimer _ra(stages, "root_aggregate_read"); + if (!config.json_output) { + direct_root_aggregate = + co_await load_root_aggregate_result( + trusted_index_path, config.report_type); + DFTRACER_UTILS_LOG_DEBUG( + "Stats direct-index aggregate: index_path=%s " + "hit=%d", + trusted_index_path.c_str(), + direct_root_aggregate ? 1 : 0); + } + } + if (direct_root_aggregate) { + used_index_source_of_truth = true; + } else { + ScopedTimer _ls(stages, "load_index_snapshot"); + auto snapshot = + co_await load_index_root_snapshot(trusted_index_path); + files = std::move(snapshot->logical_files); + partition = std::move(snapshot->partition); + used_index_source_of_truth = true; + } + } + } + + if (!used_index_source_of_truth) { + { + ScopedTimer _cf(stages, "collect_files"); + files = co_await collect_files(ctx, args->files_args.value, + config.directory); + } + if (files.empty()) { + co_return 1; + } + { + ScopedTimer _ri(stages, "resolve_index_state"); + partition = co_await resolve_index_state( + files, config.index_dir, config.report_type); + } + } + } + + if (!partition.files_needing_index.empty()) { + if (config.no_auto_index) { + DFTRACER_UTILS_LOG_ERROR( + "Missing index for %zu file(s) and --no-auto-index is " + "set. 
Run dftracer_index first.", + partition.files_needing_index.size()); + for (const auto& f : partition.files_needing_index) { + std::fprintf(stderr, " Missing index: %s\n", + f.file_path.c_str()); + } + co_return 1; + } + ScopedTimer _ai(stages, "auto_index_files"); + co_await auto_index_files(ctx, partition, config.index_dir, + config.checkpoint_size, + config.executor_threads); + } + + if (config.report_type == StatisticsQueryType::DETAILED) { + if (stages) stages->print_stages(); + co_return co_await run_detailed_stats(ctx, &config, &files); + } + + std::unique_ptr agg_ptr = + std::move(direct_root_aggregate); + if (!agg_ptr) { + ScopedTimer _ag(stages, "aggregate_stats"); + auto agg_val = co_await run_aggregate_stats( + ctx, &config, + std::make_unique(std::move(partition))); + agg_ptr = std::make_unique(std::move(agg_val)); + } + + if (stages) stages->print_stages(); + co_return co_await output_aggregate_stats(&config, &files, + std::move(agg_ptr), overall); +} + int main(int argc, char** argv) { DFTRACER_UTILS_LOGGER_INIT(); + struct RocksDbExitGuard { + ~RocksDbExitGuard() { + dftracer::utils::rocksdb::mark_process_exiting_for_rocksdb(); + } + } rocksdb_exit_guard; + argparse::ArgumentParser program("dftracer_stats", DFTRACER_UTILS_PACKAGE_VERSION); program.add_description( @@ -1374,77 +1732,20 @@ int main(int argc, char** argv) { ".dftindex databases. Auto-builds indexes if missing. 
" "Zero-cost reads from RocksDB metadata, no decompression."); - program.add_argument("--files") - .help("Trace files to inspect (.pfw, .pfw.gz)") - .nargs(argparse::nargs_pattern::any) - .default_value>({}); - - program.add_argument("-d", "--directory") - .help("Directory containing trace files") - .default_value(""); - - program.add_argument("--index-dir") - .help("Directory where .dftindex stores are created") - .default_value(""); - - program.add_argument("--json").help("Output in JSON format").flag(); - - program.add_argument("--report") - .help( - "Report type: summary, categories, names, pid_tids, time_range, " - "duration, top-names, top-categories, detailed") - .default_value("summary"); - - program.add_argument("--top-n") - .help( - "Number of results for top-N queries (0 = show all, " - "default: 0)") - .scan<'d', std::uint64_t>() - .default_value(static_cast(0)); - - program.add_argument("--top-n-pid-tid") - .help("Max PID:TID pairs to display (0 = show all, default: 10)") - .scan<'d', std::uint64_t>() - .default_value(static_cast(10)); - - program.add_argument("--no-auto-index") - .help("Disable automatic index building for files missing .dftindex") - .flag(); - - program.add_argument("--checkpoint-size") - .help("Checkpoint size for auto-indexing in bytes (default: " + - std::to_string( - indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE) + - ")") - .scan<'d', std::size_t>() - .default_value(static_cast( - indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE)); - - program.add_argument("--executor-threads") - .help("Number of worker threads for auto-indexing") - .scan<'d', std::size_t>() - .default_value( - static_cast(dftracer_utils_hardware_concurrency())); - - program.add_argument("--query") - .help("Query DSL filter (e.g., 'cat == \"POSIX\" and dur > 1000')") - .default_value(""); - - program.add_argument("--group-by") - .help( - "Group detailed statistics by dimension(s): name, cat, pid, " - "tid, fhash, hhash, pid_tid. 
Multiple values create composite " - "keys.") - .nargs(argparse::nargs_pattern::at_least_one) - .default_value>({}); - - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - DFTRACER_UTILS_LOG_ERROR("Error: %s", err.what()); - std::cerr << program; - return 1; - } - - return run_stats(program).get(); + StatsArgParse args(program); + args.setup(); + if (!args.parse(argc, argv)) return 1; + + auto pipeline_config = + cli::build_pipeline_config("DFTracer Stats Main", args.pipeline); + Pipeline pipeline(pipeline_config); + auto stats_task = make_task( + [&args](CoroScope& ctx) -> coro::CoroTask { + co_return co_await run_stats(ctx, &args); + }, + "StatsMain"); + pipeline.set_source(stats_task); + pipeline.set_destination(stats_task); + pipeline.execute(); + return stats_task->get(); } diff --git a/src/dftracer/utils/binaries/dftracer_view.cpp b/src/dftracer/utils/binaries/dftracer_view.cpp index 49fcb15f..11a303ce 100644 --- a/src/dftracer/utils/binaries/dftracer_view.cpp +++ b/src/dftracer/utils/binaries/dftracer_view.cpp @@ -1,10 +1,7 @@ #include -#include #include -#include #include #include -#include #include #include #include @@ -18,24 +15,121 @@ #include #include -#include #include #include #include #include -#include #include #include -#include #include +#include "common_cli.h" + using namespace dftracer::utils; using namespace dftracer::utils::utilities; using namespace dftracer::utils::utilities::composites::dft; using namespace dftracer::utils::utilities::composites::dft::views; using namespace dftracer::utils::utilities::filesystem; -using dftracer::utils::utilities::indexer::IndexBuildConfig; -using dftracer::utils::utilities::indexer::IndexBuilderUtility; +using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility; +using dftracer::utils::utilities::indexer::IndexBuildBatchConfig; + +class ViewArgParse : public cli::ArgParse { + public: + cli::DirectoryArgs directory{cli::DirMode::DEFAULT_EMPTY}; + cli::FilesArgs 
files_args; + cli::PipelineArgs pipeline; + cli::IndexingArgs indexing; + cli::QueryArgs query_args; + + std::string preset; + std::string recipe; + std::string save_recipe; + std::string time_range; + double min_duration = 0.0; + double max_duration = 0.0; + std::string output; + bool stream = false; + bool no_metadata = false; + bool no_auto_index = false; + + explicit ViewArgParse(argparse::ArgumentParser& p) : ArgParse(p) { + indexing.with_force = false; + indexing.index_dir_help = + "Directory where .dftindex stores are created"; + schema(directory, files_args, pipeline, indexing, query_args); + } + + protected: + void register_args() override { + parser() + .add_argument("--preset") + .help("Predefined view: io, compute, dlio") + .default_value(""); + + parser() + .add_argument("--recipe") + .help("Custom view JSON file path") + .default_value(""); + + parser() + .add_argument("--save-recipe") + .help("Save the constructed view to a JSON file") + .default_value(""); + + parser() + .add_argument("--time-range") + .help( + "Timestamp filter as min,max in microseconds (e.g., " + "1000000,2000000)") + .default_value(""); + + parser() + .add_argument("--min-duration") + .help("Minimum event duration in microseconds") + .scan<'g', double>() + .default_value(static_cast(0.0)); + + parser() + .add_argument("--max-duration") + .help("Maximum event duration in microseconds") + .scan<'g', double>() + .default_value(static_cast(0.0)); + + parser() + .add_argument("-o", "--output") + .help("Output file path (default: stdout)") + .default_value(""); + + parser() + .add_argument("--stream") + .help("Stream matching events to stdout as NDJSON") + .flag(); + + parser() + .add_argument("--no-metadata") + .help("Exclude metadata events (ph=M) from output") + .flag(); + + parser() + .add_argument("--no-auto-index") + .help( + "Disable automatic index building for files missing .dftindex") + .flag(); + } + + void post_parse() override { + preset = parser().get("--preset"); + 
recipe = parser().get("--recipe"); + save_recipe = parser().get("--save-recipe"); + time_range = parser().get("--time-range"); + min_duration = parser().get("--min-duration"); + max_duration = parser().get("--max-duration"); + output = parser().get("--output"); + stream = parser().get("--stream"); + no_metadata = parser().get("--no-metadata"); + no_auto_index = parser().get("--no-auto-index"); + } +}; struct ViewContext { std::string index_dir; @@ -53,24 +147,32 @@ struct ViewContext { std::atomic* failed_count; }; -static coro::CoroTask index_single_file(const std::string& file_path, - const ViewContext& vctx, - CoroScope&) { - IndexBuilderUtility builder; - auto config = IndexBuildConfig::for_file(file_path) - .with_index_dir(vctx.index_dir) - .with_checkpoint_size(vctx.checkpoint_size) - .with_bloom(true) - .with_index_threshold(0); - auto result = co_await builder.process(config); - - if (result.success) { - (*vctx.indexed_count)++; - } else { - (*vctx.failed_count)++; - DFTRACER_UTILS_LOG_ERROR("Auto-indexing failed for %s: %s", - file_path.c_str(), - result.error_message.c_str()); +static coro::CoroTask batch_index_files( + const std::vector& files_needing_index, + const ViewContext& vctx, CoroScope& ctx) { + auto batch_config = std::make_shared(); + batch_config->file_paths = files_needing_index; + batch_config->index_dir = vctx.index_dir; + batch_config->checkpoint_size = vctx.checkpoint_size; + batch_config->parallelism = + std::max(1, files_needing_index.size()); + batch_config->use_batch_write = true; + batch_config->rebuild_root_summaries = true; + + auto batch_result = co_await IndexBatchBuilderUtility::process( + &ctx, std::move(batch_config)); + + for (const auto& result : batch_result.results) { + if (result.success) { + (*vctx.indexed_count)++; + } else { + (*vctx.failed_count)++; + if (!result.error_message.empty()) { + DFTRACER_UTILS_LOG_ERROR("Auto-indexing failed for %s: %s", + result.file_path.c_str(), + result.error_message.c_str()); + } + } 
} } @@ -104,7 +206,6 @@ static coro::CoroTask read_single_chunk( } } } else { - // Non-stream: must copy since string_view won't outlive chunk std::lock_guard lock(*vctx.output_mutex); for (const auto& event : batch->events) { vctx.all_events->emplace_back(event); @@ -120,7 +221,6 @@ static coro::CoroTask process_single_file(const std::string& file_path, std::string index_path = internal::determine_index_path(file_path, vctx.index_dir); - // Collect metadata auto meta_input = MetadataCollectorUtilityInput::from_file(file_path) .with_checkpoint_size(vctx.checkpoint_size) .with_force_rebuild(false) @@ -134,7 +234,6 @@ static coro::CoroTask process_single_file(const std::string& file_path, co_return; } - // Run ViewBuilderUtility to get candidate chunks ViewBuilderInput builder_input; builder_input.with_view(vctx.view) .with_file_path(file_path) @@ -157,7 +256,6 @@ static coro::CoroTask process_single_file(const std::string& file_path, co_return; } - // Process each candidate chunk auto& candidates = build_output.candidates; co_await fctx.scope([&file_path, &index_path, &vctx, &candidates]( CoroScope& chunk_scope) -> coro::CoroTask { @@ -172,23 +270,21 @@ static coro::CoroTask process_single_file(const std::string& file_path, }); } -static coro::CoroTask run_view(argparse::ArgumentParser& program) { - std::string directory = program.get("--directory"); - std::string index_dir = program.get("--index-dir"); - std::string preset = program.get("--preset"); - std::string recipe_path = program.get("--recipe"); - std::string save_recipe = program.get("--save-recipe"); - std::string output_path = program.get("--output"); - std::string time_range_str = program.get("--time-range"); - double min_duration = program.get("--min-duration"); - double max_duration = program.get("--max-duration"); - bool stream_mode = program.get("--stream"); - bool no_metadata = program.get("--no-metadata"); - bool no_auto_index = program.get("--no-auto-index"); - std::size_t checkpoint_size = 
program.get("--checkpoint-size"); - std::size_t executor_threads = - program.get("--executor-threads"); - auto query_str = program.get("--query"); +static coro::CoroTask run_view(const ViewArgParse* cli) { + const auto& directory = cli->directory.value; + const auto& index_dir = cli->indexing.index_dir; + const auto& preset = cli->preset; + const auto& recipe_path = cli->recipe; + const auto& save_recipe = cli->save_recipe; + const auto& output_path = cli->output; + const auto& time_range_str = cli->time_range; + const auto min_duration = cli->min_duration; + const auto max_duration = cli->max_duration; + const auto stream_mode = cli->stream; + const auto no_metadata = cli->no_metadata; + const auto no_auto_index = cli->no_auto_index; + const auto checkpoint_size = cli->indexing.checkpoint_size; + const auto& query_str = cli->query_args.query; ViewDefinition view; @@ -285,7 +381,6 @@ static coro::CoroTask run_view(argparse::ArgumentParser& program) { if (!view.query) { DFTRACER_UTILS_LOG_ERROR( "%s", "No view specified. Use --preset, --recipe, or --query."); - std::cerr << program; co_return 1; } @@ -319,12 +414,11 @@ static coro::CoroTask run_view(argparse::ArgumentParser& program) { co_return 1; } } else { - files = program.get>("--files"); + files = cli->files_args.value; if (files.empty()) { DFTRACER_UTILS_LOG_ERROR( "%s", "No files or directory specified. 
Use --help for usage."); - std::cerr << program; co_return 1; } } @@ -388,10 +482,8 @@ static coro::CoroTask run_view(argparse::ArgumentParser& program) { &indexed_count, &failed_count}; - auto pipeline_config = PipelineConfig() - .with_name("DFTracer View") - .with_compute_threads(executor_threads) - .with_watchdog(false); + auto pipeline_config = + cli::build_pipeline_config("DFTracer View", cli->pipeline); Pipeline pipeline(pipeline_config); @@ -402,19 +494,7 @@ static coro::CoroTask run_view(argparse::ArgumentParser& program) { [files_needing_index_ptr, files_ptr, &vctx](CoroScope& ctx) -> coro::CoroTask { if (!files_needing_index_ptr->empty()) { - co_await ctx.scope([files_needing_index_ptr, - &vctx](CoroScope& scope) - -> coro::CoroTask { - for (std::size_t i = 0; i < files_needing_index_ptr->size(); - ++i) { - const auto file_path = (*files_needing_index_ptr)[i]; - scope.spawn([file_path, &vctx](CoroScope& fctx) - -> coro::CoroTask { - co_await index_single_file(file_path, vctx, fctx); - }); - } - co_return; - }); + co_await batch_index_files(*files_needing_index_ptr, vctx, ctx); std::printf("Auto-indexing complete: %zu indexed, %zu failed\n", vctx.indexed_count->load(), @@ -481,97 +561,12 @@ int main(int argc, char** argv) { "indices for efficient chunk-skipping. 
Supports predefined views " "(io, compute, dlio), custom recipes, and inline queries."); - // Input files - program.add_argument("--files") - .help("Trace files to process (.pfw, .pfw.gz)") - .nargs(argparse::nargs_pattern::any) - .default_value>({}); - - program.add_argument("-d", "--directory") - .help("Directory containing trace files") - .default_value(""); - - // View specification - program.add_argument("--preset") - .help("Predefined view: io, compute, dlio") - .default_value(""); - - program.add_argument("--recipe") - .help("Custom view JSON file path") - .default_value(""); - - program.add_argument("--save-recipe") - .help("Save the constructed view to a JSON file") - .default_value(""); - - program.add_argument("--query") - .help("Query DSL filter (e.g., 'cat == \"POSIX\" and dur > 1000')") - .default_value(""); - - // Event-level filters - program.add_argument("--time-range") - .help( - "Timestamp filter as min,max in microseconds (e.g., " - "1000000,2000000)") - .default_value(""); - - program.add_argument("--min-duration") - .help("Minimum event duration in microseconds") - .scan<'g', double>() - .default_value(static_cast(0.0)); - - program.add_argument("--max-duration") - .help("Maximum event duration in microseconds") - .scan<'g', double>() - .default_value(static_cast(0.0)); - - // Output - program.add_argument("-o", "--output") - .help("Output file path (default: stdout)") - .default_value(""); - - program.add_argument("--stream") - .help("Stream matching events to stdout as NDJSON") - .flag(); - - program.add_argument("--no-metadata") - .help("Exclude metadata events (ph=M) from output") - .flag(); - - // Indexing options - program.add_argument("--index-dir") - .help("Directory where .dftindex stores are created") - .default_value(""); - - program.add_argument("--no-auto-index") - .help("Disable automatic index building for files missing .dftindex") - .flag(); - - program.add_argument("--checkpoint-size") - .help("Checkpoint size for 
auto-indexing in bytes (default: " + - std::to_string( - indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE) + - ")") - .scan<'d', std::size_t>() - .default_value(static_cast( - indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE)); - - program.add_argument("--executor-threads") - .help("Number of worker threads") - .scan<'d', std::size_t>() - .default_value( - static_cast(dftracer_utils_hardware_concurrency())); - - try { - program.parse_args(argc, argv); - } catch (const std::exception& err) { - DFTRACER_UTILS_LOG_ERROR("Error: %s", err.what()); - std::cerr << program; - return 1; - } + ViewArgParse cli(program); + cli.setup(); + if (!cli.parse(argc, argv)) return 1; try { - return run_view(program).get(); + return run_view(&cli).get(); } catch (const std::exception& e) { DFTRACER_UTILS_LOG_ERROR("Fatal: %s", e.what()); return 1; diff --git a/src/dftracer/utils/core/common/inflater.h b/src/dftracer/utils/core/common/inflater.h index 73449e36..f966ac8f 100644 --- a/src/dftracer/utils/core/common/inflater.h +++ b/src/dftracer/utils/core/common/inflater.h @@ -22,8 +22,13 @@ class Inflater { constants::indexer::INFLATE_BUFFER_SIZE; z_stream stream; - alignas(DFTRACER_OPTIMAL_ALIGNMENT) unsigned char out_buffer[BUFFER_SIZE]; - alignas(DFTRACER_OPTIMAL_ALIGNMENT) unsigned char in_buffer[BUFFER_SIZE]; + alignas(DFTRACER_OPTIMAL_ALIGNMENT) unsigned char out_buffer_[BUFFER_SIZE]; + alignas(DFTRACER_OPTIMAL_ALIGNMENT) unsigned char in_buffer_[BUFFER_SIZE]; + + unsigned char* out_buffer() { return out_buffer_; } + const unsigned char* out_buffer() const { return out_buffer_; } + unsigned char* in_buffer() { return in_buffer_; } + const unsigned char* in_buffer() const { return in_buffer_; } protected: int window_bits_; @@ -31,8 +36,8 @@ class Inflater { public: Inflater() : window_bits_(constants::indexer::ZLIB_GZIP_WINDOW_BITS) { std::memset(&stream, 0, sizeof(stream)); - std::memset(out_buffer, 0, sizeof(out_buffer)); - std::memset(in_buffer, 0, sizeof(in_buffer)); + 
std::memset(out_buffer_, 0, BUFFER_SIZE); + std::memset(in_buffer_, 0, BUFFER_SIZE); } virtual ~Inflater() { inflateEnd(&stream); } @@ -88,11 +93,10 @@ class Inflater { } coro::CoroTask read_input(int fd, off_t& offset) { - ssize_t n = - co_await io::pread(fd, in_buffer, sizeof(in_buffer), offset); + ssize_t n = co_await io::pread(fd, in_buffer(), BUFFER_SIZE, offset); if (n > 0) { offset += n; - stream.next_in = in_buffer; + stream.next_in = in_buffer(); stream.avail_in = static_cast(n); co_return true; } else if (n < 0) { @@ -104,13 +108,14 @@ class Inflater { } std::size_t get_output(unsigned char* buf, std::size_t len) { - std::size_t available = sizeof(out_buffer) - stream.avail_out; + std::size_t available = BUFFER_SIZE - stream.avail_out; std::size_t to_copy = std::min(len, available); - std::memcpy(buf, out_buffer, to_copy); + std::memcpy(buf, out_buffer(), to_copy); // Shift remaining data if (to_copy < available) { - std::memmove(out_buffer, out_buffer + to_copy, available - to_copy); + std::memmove(out_buffer(), out_buffer() + to_copy, + available - to_copy); } return to_copy; @@ -129,8 +134,8 @@ class Inflater { return NEED_INPUT; } - stream.next_out = out_buffer; - stream.avail_out = sizeof(out_buffer); + stream.next_out = out_buffer(); + stream.avail_out = BUFFER_SIZE; int ret = inflate(&stream, flush_mode); @@ -150,7 +155,7 @@ class Inflater { } bool needs_input() const { return stream.avail_in == 0; } - bool has_output() const { return stream.avail_out < sizeof(out_buffer); } + bool has_output() const { return stream.avail_out < BUFFER_SIZE; } int get_data_type() const { return stream.data_type; } std::size_t get_avail_in() const { return stream.avail_in; } std::size_t get_avail_out() const { return stream.avail_out; } diff --git a/src/dftracer/utils/core/common/memory_budget.cpp b/src/dftracer/utils/core/common/memory_budget.cpp new file mode 100644 index 00000000..8dc778af --- /dev/null +++ b/src/dftracer/utils/core/common/memory_budget.cpp @@ 
-0,0 +1,232 @@
+#include <dftracer/utils/core/common/memory_budget.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+
+// NOTE(review): the include targets above and the template arguments in this
+// file (static_cast<std::size_t>, std::vector<std::size_t>) were missing from
+// the patch text as received and are restored from usage; confirm them
+// against memory_budget.h.
+
+namespace dftracer::utils {
+
+// Returned when neither a cgroup limit nor /proc/meminfo yields a value.
+static constexpr std::size_t FALLBACK_AVAILABLE_BYTES =
+    1ULL * 1024 * 1024 * 1024;
+
+// Parses the unsigned decimal number at the start of a small text file.
+// Returns 0 if the file cannot be opened, is empty, starts with "max", or
+// does not start with a number.
+static std::size_t read_size_from_file(const char *path) {
+    FILE *f = std::fopen(path, "r");
+    if (!f) return 0;
+    char buf[64];
+    std::size_t n = std::fread(buf, 1, sizeof(buf) - 1, f);
+    std::fclose(f);
+    if (n == 0) return 0;
+    buf[n] = '\0';
+    if (std::strncmp(buf, "max", 3) == 0) return 0;
+    char *end = nullptr;
+    unsigned long long val = std::strtoull(buf, &end, 10);
+    if (end == buf) return 0;
+    return static_cast<std::size_t>(val);
+}
+
+// Limits at or above this value (1 TiB) are ignored as "not a real limit".
+static constexpr std::size_t CGROUP_LIMIT_SENTINEL =
+    1ULL * 1024 * 1024 * 1024 * 1024;
+
+// Reads /proc/self/cgroup. A line starting with "0::" stores the rest of the
+// line in v2_path; any other "id:controllers:path" line whose comma-separated
+// controller list contains "memory" stores its path in v1_path. Both outputs
+// are left untouched if the file cannot be opened or nothing matches.
+static void read_self_cgroup_paths(std::string &v2_path, std::string &v1_path) {
+    FILE *f = std::fopen("/proc/self/cgroup", "r");
+    if (!f) return;
+    char line[1024];
+    while (std::fgets(line, sizeof(line), f)) {
+        std::size_t n = std::strlen(line);
+        while (n > 0 && (line[n - 1] == '\n' || line[n - 1] == '\r')) {
+            line[--n] = '\0';
+        }
+        if (n >= 3 && line[0] == '0' && line[1] == ':' && line[2] == ':') {
+            v2_path = line + 3;
+            continue;
+        }
+        char *first = std::strchr(line, ':');
+        if (!first) continue;
+        char *second = std::strchr(first + 1, ':');
+        if (!second) continue;
+        std::string controllers(first + 1, second - first - 1);
+        std::size_t start = 0;
+        while (start <= controllers.size()) {
+            std::size_t comma = controllers.find(',', start);
+            std::size_t end =
+                (comma == std::string::npos) ? controllers.size() : comma;
+            if (controllers.compare(start, end - start, "memory") == 0) {
+                v1_path = second + 1;
+                break;
+            }
+            if (comma == std::string::npos) break;
+            start = comma + 1;
+        }
+    }
+    std::fclose(f);
+}
+
+// Walks from `root + cg_path` up to `root`. At the first directory whose
+// `limit_file` holds a value in (0, CGROUP_LIMIT_SENTINEL) it returns that
+// limit minus the value of `usage_file` (0 if usage has reached the limit).
+// Returns 0 if no directory on the way up has such a limit.
+static std::size_t cgroup_headroom_at(const std::string &root,
+                                      const std::string &cg_path,
+                                      const char *limit_file,
+                                      const char *usage_file) {
+    std::string dir = root + cg_path;
+    while (true) {
+        const std::size_t limit =
+            read_size_from_file((dir + "/" + limit_file).c_str());
+        if (limit > 0 && limit < CGROUP_LIMIT_SENTINEL) {
+            const std::size_t usage =
+                read_size_from_file((dir + "/" + usage_file).c_str());
+            return usage >= limit ? 0 : limit - usage;
+        }
+        if (dir.size() <= root.size()) break;
+        const std::size_t slash = dir.find_last_of('/');
+        if (slash == std::string::npos || slash < root.size()) break;
+        dir.resize(slash);
+    }
+    return 0;
+}
+
+// cgroup v2: memory.max / memory.current under /sys/fs/cgroup.
+static std::size_t cgroup_v2_limit_at(const std::string &cg_path) {
+    return cgroup_headroom_at("/sys/fs/cgroup", cg_path, "memory.max",
+                              "memory.current");
+}
+
+// cgroup v1: memory.limit_in_bytes / memory.usage_in_bytes under
+// /sys/fs/cgroup/memory.
+static std::size_t cgroup_v1_limit_at(const std::string &cg_path) {
+    return cgroup_headroom_at("/sys/fs/cgroup/memory", cg_path,
+                              "memory.limit_in_bytes",
+                              "memory.usage_in_bytes");
+}
+
+// Returns the MemAvailable figure of /proc/meminfo multiplied by 1024 (the
+// field is reported in kB), or 0 if the file or the field is missing.
+static std::size_t try_proc_meminfo() {
+    FILE *f = std::fopen("/proc/meminfo", "r");
+    if (!f) return 0;
+    std::size_t bytes = 0;
+    char line[256];
+    while (std::fgets(line, sizeof(line), f)) {
+        if (std::strncmp(line, "MemAvailable:", 13) == 0) {
+            // strtoull skips the padding and yields 0 when no digits follow.
+            bytes = static_cast<std::size_t>(
+                        std::strtoull(line + 13, nullptr, 10)) *
+                    1024;
+            break;
+        }
+    }
+    std::fclose(f);
+    return bytes;
+}
+
+// Best-effort estimate of the memory this process may still use: cgroup v2
+// headroom, else cgroup v1 headroom, capped by MemAvailable when both are
+// known; MemAvailable alone when no cgroup limit applies; otherwise
+// FALLBACK_AVAILABLE_BYTES. Never returns 0.
+std::size_t detect_available_memory() {
+    // Parse /proc/self/cgroup once and reuse the paths for both hierarchies.
+    std::string v2_path;
+    std::string v1_path;
+    read_self_cgroup_paths(v2_path, v1_path);
+    if (v2_path.empty()) v2_path = "/";
+    if (v1_path.empty()) v1_path = "/";
+
+    std::size_t cgroup_avail = cgroup_v2_limit_at(v2_path);
+    if (cgroup_avail == 0) cgroup_avail = cgroup_v1_limit_at(v1_path);
+    const std::size_t host_avail = try_proc_meminfo();
+
+    // A cgroup limit can be set higher than what the host can provide, so
+    // the cgroup headroom is capped by MemAvailable when both are known.
+    if (cgroup_avail > 0 && host_avail > 0) {
+        return std::min(cgroup_avail, host_avail);
+    }
+    if (cgroup_avail > 0) return cgroup_avail;
+    if (host_avail > 0) return host_avail;
+    return FALLBACK_AVAILABLE_BYTES;
+}
+
+// Returns the user override when non-zero; otherwise the configured
+// percentage of the detected available memory, floored at
+// MIN_MEMORY_BUDGET_BYTES.
+std::size_t compute_memory_budget(std::size_t user_override_bytes) {
+    if (user_override_bytes > 0) return user_override_bytes;
+    std::size_t avail = detect_available_memory();
+    std::size_t budget = avail * DEFAULT_MEMORY_BUDGET_FRACTION_PERCENT / 100;
+    return std::max(budget, MIN_MEMORY_BUDGET_BYTES);
+}
+
+// Number of batches that fit in the budget, but never fewer than
+// max(2 * num_workers, 4). A zero batch estimate is treated as 1 byte.
+std::size_t compute_channel_capacity(std::size_t memory_budget_bytes,
+                                     std::size_t estimated_batch_bytes,
+                                     std::size_t num_workers) {
+    std::size_t from_budget =
+        memory_budget_bytes / std::max(estimated_batch_bytes, std::size_t(1));
+    std::size_t minimum = std::max(num_workers * 2, std::size_t(4));
+    return std::max(from_budget, minimum);
+}
+
+// Number of files that fit in the budget, but never fewer than
+// max(min_files, 1). A zero file estimate is treated as 1 byte.
+std::size_t compute_file_batch_size(std::size_t memory_budget_bytes,
+                                    std::size_t estimated_file_bytes,
+                                    std::size_t min_files) {
+    std::size_t from_budget =
+        memory_budget_bytes / std::max(estimated_file_bytes, std::size_t(1));
+    return std::max(from_budget, std::max(min_files, std::size_t(1)));
+}
+
+// Estimates peak memory per file: the 95th-percentile size of a strided
+// sample of at most PER_FILE_SAMPLE_LIMIT non-empty files, multiplied by
+// PER_FILE_EXPANSION_FACTOR and clamped to
+// [MIN_PER_FILE_PEAK_BYTES, MAX_PER_FILE_PEAK_BYTES]. A non-zero user
+// override is returned as is.
+std::size_t estimate_per_file_bytes(const std::vector<std::size_t> &file_sizes,
+                                    std::size_t user_override_bytes) {
+    if (user_override_bytes > 0) return user_override_bytes;
+    if (file_sizes.empty()) return MIN_PER_FILE_PEAK_BYTES;
+
+    const std::size_t total = file_sizes.size();
+    const std::size_t sample_count = std::min(total, PER_FILE_SAMPLE_LIMIT);
+    const std::size_t stride = std::max(total / sample_count, std::size_t(1));
+
+    std::vector<std::size_t> sizes;
+    sizes.reserve(sample_count);
+    for (std::size_t i = 0; i < total && sizes.size() < sample_count;
+         i += stride) {
+        if (file_sizes[i] > 0) sizes.push_back(file_sizes[i]);
+    }
+
+    if (sizes.empty()) return MIN_PER_FILE_PEAK_BYTES;
+
+    std::size_t idx = (sizes.size() * 95) / 100;
+    if (idx >= sizes.size()) idx = sizes.size() - 1;
+    std::nth_element(sizes.begin(), sizes.begin() + idx, sizes.end());
+    const std::size_t p95 = sizes[idx];
+
+    std::size_t estimate = p95 * PER_FILE_EXPANSION_FACTOR;
+    estimate = std::max(estimate, MIN_PER_FILE_PEAK_BYTES);
+    estimate = std::min(estimate, MAX_PER_FILE_PEAK_BYTES);
+    return estimate;
+}
+
+}  // namespace dftracer::utils
diff --git a/src/dftracer/utils/core/io/io_backend_factory.cpp b/src/dftracer/utils/core/io/io_backend_factory.cpp
index 01f5207b..6d17f3b1 100644
--- a/src/dftracer/utils/core/io/io_backend_factory.cpp
+++ b/src/dftracer/utils/core/io/io_backend_factory.cpp
@@ -21,17 +21,16 @@ std::unique_ptr create_io_backend(Executor& executor, unsigned batch_threshold) { // Explicit backend selection (non-AUTO).
if (backend_type == IoBackendType::THREADPOOL) { - DFTRACER_UTILS_LOG_DEBUG( - "I/O backend: using threadpool (%zu threads, forced)", pool_size); + DFTRACER_UTILS_LOG_INFO("I/O backend: using threadpool (%zu threads)", + pool_size); return std::make_unique(executor, pool_size, batch_threshold); } #ifdef __linux__ if (backend_type == IoBackendType::EPOLL_THREADPOOL) { - DFTRACER_UTILS_LOG_DEBUG( - "I/O backend: using epoll+threadpool (%zu threads, forced)", - pool_size); + DFTRACER_UTILS_LOG_INFO( + "I/O backend: using epoll+threadpool (%zu threads)", pool_size); return std::make_unique(executor, pool_size, batch_threshold); } @@ -40,9 +39,8 @@ std::unique_ptr create_io_backend(Executor& executor, #if defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \ defined(__NetBSD__) || defined(__DragonFly__) if (backend_type == IoBackendType::KQUEUE_THREADPOOL) { - DFTRACER_UTILS_LOG_DEBUG( - "I/O backend: using kqueue+threadpool (%zu threads, forced)", - pool_size); + DFTRACER_UTILS_LOG_INFO( + "I/O backend: using kqueue+threadpool (%zu threads)", pool_size); return std::make_unique(executor, pool_size, batch_threshold); } @@ -53,12 +51,11 @@ std::unique_ptr create_io_backend(Executor& executor, auto uring = std::make_unique(executor, 256, batch_threshold); if (uring->probe()) { - DFTRACER_UTILS_LOG_DEBUG("%s", - "I/O backend: using io_uring (forced)"); + DFTRACER_UTILS_LOG_INFO("%s", "I/O backend: using io_uring"); return uring; } DFTRACER_UTILS_LOG_ERROR("%s", - "io_uring forced but runtime probe failed"); + "io_uring selected but runtime probe failed"); // Fall through to AUTO detection. 
} #endif @@ -69,28 +66,28 @@ std::unique_ptr create_io_backend(Executor& executor, auto uring = std::make_unique(executor, 256, batch_threshold); if (uring->probe()) { - DFTRACER_UTILS_LOG_DEBUG("%s", "I/O backend: using io_uring"); + DFTRACER_UTILS_LOG_INFO("%s", "I/O backend: using io_uring"); return uring; } - DFTRACER_UTILS_LOG_DEBUG("%s", - "io_uring runtime probe failed, falling back"); + DFTRACER_UTILS_LOG_INFO("%s", + "io_uring runtime probe failed, falling back"); } #endif #ifdef __linux__ - DFTRACER_UTILS_LOG_DEBUG( - "I/O backend: using epoll+threadpool (%zu threads)", pool_size); + DFTRACER_UTILS_LOG_INFO("I/O backend: using epoll+threadpool (%zu threads)", + pool_size); return std::make_unique(executor, pool_size, batch_threshold); #elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \ defined(__NetBSD__) || defined(__DragonFly__) - DFTRACER_UTILS_LOG_DEBUG( + DFTRACER_UTILS_LOG_INFO( "I/O backend: using kqueue+threadpool (%zu threads)", pool_size); return std::make_unique(executor, pool_size, batch_threshold); #else - DFTRACER_UTILS_LOG_DEBUG("I/O backend: using threadpool (%zu threads)", - pool_size); + DFTRACER_UTILS_LOG_INFO("I/O backend: using threadpool (%zu threads)", + pool_size); return std::make_unique(executor, pool_size, batch_threshold); #endif diff --git a/src/dftracer/utils/core/pipeline/executor.cpp b/src/dftracer/utils/core/pipeline/executor.cpp index 12029f51..704ddd44 100644 --- a/src/dftracer/utils/core/pipeline/executor.cpp +++ b/src/dftracer/utils/core/pipeline/executor.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include @@ -32,8 +31,6 @@ Executor* Executor::set_current(Executor* e) noexcept { return old; } -io::IoThreadPool* Executor::db_pool() noexcept { return db_pool_.get(); } - // Thread-local list of coroutine handles to destroy after the current // resume() returns. 
FinalAwaiter pushes here instead of the shared // destroy_queue_ to avoid another worker freeing the frame while @@ -61,15 +58,20 @@ Executor::Executor(const ExecutorConfig& config) : num_threads_(config.num_threads == 0 ? dftracer_utils_hardware_concurrency() : config.num_threads), - last_activity_time_(std::chrono::steady_clock::now()), + last_activity_ns_( + std::chrono::steady_clock::now().time_since_epoch().count()), idle_timeout_(config.idle_timeout), deadlock_timeout_(config.deadlock_timeout), - io_pool_size_(config.io_pool_size), + io_pool_size_(config.io_pool_size == 0 + ? dftracer_utils_hardware_concurrency() + : config.io_pool_size), io_backend_type_(config.io_backend_type), - io_batch_threshold_(config.io_batch_threshold), - db_pool_size_(config.db_pool_size) { + io_batch_threshold_(config.io_batch_threshold) { if (num_threads_ == 0) { - num_threads_ = 2; // Fallback if hardware_concurrency returns 0 + num_threads_ = 2; + } + if (io_pool_size_ == 0) { + io_pool_size_ = 2; } DFTRACER_UTILS_LOG_DEBUG( "Executor created with %zu threads, idle_timeout=%lld s, " @@ -100,9 +102,6 @@ void Executor::start() { io_batch_threshold_); io_backend_->start(); - db_pool_ = std::make_unique(db_pool_size_); - db_pool_->start(); - // Create all worker contexts first so workers_ is stable before any // worker thread can try to iterate/steal from it. for (std::size_t i = 0; i < num_threads_; ++i) { @@ -143,11 +142,6 @@ void Executor::shutdown() { // completion thread may still call enqueue() -> wake_all_workers() // which accesses WorkerContext cv/mutex, so workers_ must remain // alive until the completion thread has exited. 
- if (db_pool_) { - db_pool_->stop(); - db_pool_.reset(); - } - if (io_backend_) { io_backend_->stop(); io_backend_.reset(); @@ -258,12 +252,7 @@ void Executor::worker_thread(WorkerContext* context) { } } drain_destroy_queue(); - std::unique_lock lock(context->queue_mutex); - context->cv.wait(lock, [this, observed_signal] { - return !running_.load(std::memory_order_acquire) || - work_signal_.load(std::memory_order_acquire) != - observed_signal; - }); + work_signal_.wait(observed_signal, std::memory_order_acquire); } } @@ -308,33 +297,11 @@ void Executor::signal_global_work() { wake_one_worker(); } -void Executor::wake_one_worker() { - const std::size_t worker_count = workers_.size(); - if (worker_count == 0) { - return; - } - - const std::size_t worker_index = - next_worker_.fetch_add(1, std::memory_order_relaxed) % worker_count; - // Lock-then-unlock the worker's mutex before notifying. - // This ensures the worker is either before its predicate check (and will - // see the updated atomic state) or inside cv.wait (and will receive the - // notification). Without this, a notification sent between predicate - // evaluation and cv.wait entry is lost, causing the worker to hang. - workers_[worker_index]->queue_mutex.lock(); - workers_[worker_index]->queue_mutex.unlock(); - workers_[worker_index]->cv.notify_one(); -} +void Executor::wake_one_worker() { work_signal_.notify_one(); } void Executor::wake_all_workers() { - for (auto& worker : workers_) { - // Lock-then-unlock ensures the worker is either before its predicate - // check or inside cv.wait before the notification is sent. - // See wake_one_worker() for detailed rationale. 
- worker->queue_mutex.lock(); - worker->queue_mutex.unlock(); - worker->cv.notify_all(); - } + work_signal_.fetch_add(1, std::memory_order_release); + work_signal_.notify_all(); } // Helper function for when_all.h (avoids circular dependency) @@ -382,9 +349,11 @@ bool Executor::is_responsive() const { if (active >= num_threads_) { // All threads busy - check if making progress - std::lock_guard lock(activity_mutex_); auto now = std::chrono::steady_clock::now(); - auto idle_time = now - last_activity_time_; + auto last_ns = last_activity_ns_.load(std::memory_order_acquire); + auto last_tp = std::chrono::steady_clock::time_point( + std::chrono::steady_clock::duration(last_ns)); + auto idle_time = now - last_tp; // If all threads busy but no activity for deadlock_timeout, // likely deadlocked @@ -403,8 +372,9 @@ bool Executor::is_responsive() const { } void Executor::mark_activity() { - std::lock_guard lock(activity_mutex_); - last_activity_time_ = std::chrono::steady_clock::now(); + last_activity_ns_.store( + std::chrono::steady_clock::now().time_since_epoch().count(), + std::memory_order_release); } void Executor::update_task_location(TaskIndex task_id, diff --git a/src/dftracer/utils/core/pipeline/pipeline.cpp b/src/dftracer/utils/core/pipeline/pipeline.cpp index b546974d..86b859a5 100644 --- a/src/dftracer/utils/core/pipeline/pipeline.cpp +++ b/src/dftracer/utils/core/pipeline/pipeline.cpp @@ -5,7 +5,6 @@ #include #include -#include namespace dftracer::utils { @@ -20,7 +19,6 @@ Pipeline::Pipeline(const PipelineConfig& config) exec_cfg.io_pool_size = config.io_thread_count; exec_cfg.io_backend_type = config.io_backend_type; exec_cfg.io_batch_threshold = config.io_batch_threshold; - exec_cfg.db_pool_size = config.db_pool_size; std::unique_ptr watchdog; if (config.enable_watchdog) { diff --git a/src/dftracer/utils/core/rocksdb/async.cpp b/src/dftracer/utils/core/rocksdb/async.cpp deleted file mode 100644 index 6d3b016b..00000000 --- 
a/src/dftracer/utils/core/rocksdb/async.cpp +++ /dev/null @@ -1,32 +0,0 @@ -#include -#include -#include - -namespace dftracer::utils::rocksdb { - -io::IoThreadPool* get_db_pool() { - auto* exec = Executor::current(); - if (exec == nullptr) { - return nullptr; - } - return exec->db_pool(); -} - -void db_async_submit(io::IoThreadPool* pool, std::function fn) { - pool->submit(std::move(fn)); -} - -void db_async_resume_on(void* executor, std::coroutine_handle<> h) { - auto* exec = static_cast(executor); - if (exec != nullptr) { - exec->enqueue(h); - } else { - h.resume(); - } -} - -void* get_current_executor_opaque() { - return static_cast(Executor::current()); -} - -} // namespace dftracer::utils::rocksdb diff --git a/src/dftracer/utils/core/rocksdb/database.cpp b/src/dftracer/utils/core/rocksdb/database.cpp index 1b227a67..63be75a2 100644 --- a/src/dftracer/utils/core/rocksdb/database.cpp +++ b/src/dftracer/utils/core/rocksdb/database.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -25,7 +26,11 @@ const ::rocksdb::ReadOptions& read_options() { } const ::rocksdb::WriteOptions& write_options() { - static const ::rocksdb::WriteOptions options; + static const auto options = [] { + ::rocksdb::WriteOptions wo; + wo.disableWAL = true; + return wo; + }(); return options; } @@ -79,10 +84,8 @@ RocksDatabase& RocksDatabase::operator=(RocksDatabase&& other) noexcept { return *this; } -std::vector RocksDatabase::default_column_families() { - return {"default", "checkpoints", "metadata", "chunk_bloom", - "file_bloom", "chunk_stats", "dimensions", "chunk_dim_stats", - "manifest", "provenance", "archives", "tar_files"}; +const decltype(cf::ALL)& RocksDatabase::default_column_families() { + return cf::ALL; } ::rocksdb::Options RocksDatabase::default_options() { @@ -92,13 +95,40 @@ ::rocksdb::Options RocksDatabase::default_options() { options.allow_concurrent_memtable_write = true; options.enable_pipelined_write = true; options.max_open_files = 
Env::rocksdb_max_open_files(); + options.max_background_jobs = 8; + options.max_subcompactions = 8; + options.write_buffer_size = 256 * 1024 * 1024; + options.max_write_buffer_number = 4; return options; } ::rocksdb::ColumnFamilyOptions RocksDatabase::default_column_family_options() { ::rocksdb::ColumnFamilyOptions options; + + ::rocksdb::BlockBasedTableOptions bbt; + bbt.block_size = 32 * 1024; + bbt.format_version = 5; + bbt.index_block_restart_interval = 16; + options.table_factory.reset(::rocksdb::NewBlockBasedTableFactory(bbt)); + +#ifdef DFTRACER_UTILS_ENABLE_ZSTD + options.compression = ::rocksdb::kZSTD; + options.compression_opts.level = 9; + options.compression_opts.max_dict_bytes = 262144; + options.compression_opts.zstd_max_train_bytes = 1048576; + options.compression_opts.enabled = true; + options.bottommost_compression = ::rocksdb::kZSTD; + options.bottommost_compression_opts.level = 9; + options.bottommost_compression_opts.max_dict_bytes = 262144; + options.bottommost_compression_opts.zstd_max_train_bytes = 1048576; + options.bottommost_compression_opts.enabled = true; +#elif defined(DFTRACER_UTILS_ENABLE_LZ4) options.compression = ::rocksdb::kLZ4Compression; options.bottommost_compression = ::rocksdb::kZlibCompression; +#else + options.compression = ::rocksdb::kZlibCompression; + options.bottommost_compression = ::rocksdb::kZlibCompression; +#endif return options; } @@ -131,14 +161,17 @@ bool RocksDatabase::open(const std::string& db_path, OpenMode open_mode) { "Failed to list RocksDB column families at '" + db_path_ + "': " + list_status.ToString()); } - column_family_names = default_column_families(); + column_family_names.reserve(default_column_families().size()); + for (auto name : default_column_families()) { + column_family_names.emplace_back(name); + } } else { if (open_mode_ == OpenMode::ReadWrite) { for (const auto& name : default_column_families()) { if (std::find(column_family_names.begin(), column_family_names.end(), name) == 
column_family_names.end()) { - column_family_names.push_back(name); + column_family_names.emplace_back(name); } } } @@ -147,16 +180,22 @@ bool RocksDatabase::open(const std::string& db_path, OpenMode open_mode) { std::vector<::rocksdb::ColumnFamilyDescriptor> descriptors; descriptors.reserve(column_family_names.size()); for (const auto& name : column_family_names) { - descriptors.emplace_back(name, cf_options); + auto opts = cf_options; + if (cf_options_override_) { + cf_options_override_(name, opts); + } + descriptors.emplace_back(name, opts); } std::vector<::rocksdb::ColumnFamilyHandle*> handles; - auto status = - open_mode_ == OpenMode::ReadOnly - ? ::rocksdb::DB::OpenForReadOnly(db_options, db_path_, descriptors, - &handles, &db_, false) - : ::rocksdb::DB::Open(db_options, db_path_, descriptors, &handles, - &db_); + ::rocksdb::Status status; + if (open_mode_ == OpenMode::ReadOnly) { + status = ::rocksdb::DB::OpenForReadOnly( + db_options, db_path_, descriptors, &handles, &db_, false); + } else { + status = ::rocksdb::DB::Open(db_options, db_path_, descriptors, + &handles, &db_); + } if (!status.ok()) { cleanup_failed_open(db_, handles); throw std::runtime_error("Failed to open RocksDB at '" + db_path_ + @@ -215,7 +254,7 @@ ::rocksdb::DB* RocksDatabase::get() const noexcept { return db_; } ::rocksdb::ColumnFamilyHandle* RocksDatabase::column_family_handle( std::string_view column_family) const { - const auto name = column_family.empty() ? std::string("default") + const auto name = column_family.empty() ? 
std::string(cf::DEFAULT) : std::string(column_family); const auto it = column_families_.find(name); if (it == column_families_.end() || it->second == nullptr) { @@ -238,12 +277,42 @@ ::rocksdb::Status RocksDatabase::get(std::string_view key, std::string* value, ::rocksdb::Slice(key.data(), key.size()), value); } +::rocksdb::Status RocksDatabase::merge(std::string_view key, + std::string_view value, + std::string_view column_family) { + return db_->Merge(write_options(), column_family_handle(column_family), + ::rocksdb::Slice(key.data(), key.size()), + ::rocksdb::Slice(value.data(), value.size())); +} + +void RocksDatabase::set_cf_options_override(CfOptionsOverride override) { + cf_options_override_ = std::move(override); +} + +::rocksdb::Status RocksDatabase::merge(Batch& batch, + std::string_view column_family, + std::string_view key, + std::string_view value) { + return batch.Merge(column_family_handle(column_family), + ::rocksdb::Slice(key.data(), key.size()), + ::rocksdb::Slice(value.data(), value.size())); +} + ::rocksdb::Status RocksDatabase::del(std::string_view key, std::string_view column_family) { return db_->Delete(write_options(), column_family_handle(column_family), ::rocksdb::Slice(key.data(), key.size())); } +::rocksdb::Status RocksDatabase::delete_range(std::string_view begin_key, + std::string_view end_key, + std::string_view column_family) { + return db_->DeleteRange( + write_options(), column_family_handle(column_family), + ::rocksdb::Slice(begin_key.data(), begin_key.size()), + ::rocksdb::Slice(end_key.data(), end_key.size())); +} + ::rocksdb::Status RocksDatabase::put(Batch& batch, std::string_view column_family, std::string_view key, @@ -272,4 +341,27 @@ std::unique_ptr<::rocksdb::Iterator> RocksDatabase::new_iterator( db_->NewIterator(read_options(), column_family_handle(column_family))); } +::rocksdb::Status RocksDatabase::compact(std::string_view column_family) { + ::rocksdb::CompactRangeOptions opts; + opts.max_subcompactions = 8; + return 
db_->CompactRange(opts, column_family_handle(column_family), nullptr, + nullptr); +} + +::rocksdb::Status RocksDatabase::ingest_external_files( + std::string_view column_family, + const std::vector& external_files, bool ingest_behind) { + if (external_files.empty()) { + return ::rocksdb::Status::OK(); + } + ::rocksdb::IngestExternalFileOptions opts; + opts.move_files = false; + opts.snapshot_consistency = true; + opts.allow_global_seqno = true; + opts.allow_blocking_flush = true; + opts.ingest_behind = ingest_behind; + return db_->IngestExternalFile(column_family_handle(column_family), + external_files, opts); +} + } // namespace dftracer::utils::rocksdb diff --git a/src/dftracer/utils/core/rocksdb/db_manager.cpp b/src/dftracer/utils/core/rocksdb/db_manager.cpp index a9ae5c57..8c28d4ba 100644 --- a/src/dftracer/utils/core/rocksdb/db_manager.cpp +++ b/src/dftracer/utils/core/rocksdb/db_manager.cpp @@ -10,7 +10,8 @@ RocksDBManager& RocksDBManager::instance() { } std::shared_ptr RocksDBManager::get_or_open( - const std::string& db_path, RocksDatabase::OpenMode open_mode) { + const std::string& db_path, RocksDatabase::OpenMode open_mode, + RocksDatabase::CfOptionsOverride cf_override) { for (;;) { bool needs_upgrade = false; bool do_open = false; @@ -67,9 +68,13 @@ std::shared_ptr RocksDBManager::get_or_open( std::shared_ptr database; try { - database = std::make_shared( - db_path, - needs_upgrade ? RocksDatabase::OpenMode::ReadWrite : open_mode); + database = std::make_shared(); + if (cf_override) { + database->set_cf_options_override(std::move(cf_override)); + } + database->open(db_path, needs_upgrade + ? RocksDatabase::OpenMode::ReadWrite + : open_mode); } catch (...) 
{ std::lock_guard lock(mutex_); opening_.erase(db_path); diff --git a/src/dftracer/utils/core/rocksdb/filesystem.cpp b/src/dftracer/utils/core/rocksdb/filesystem.cpp index 1d31f791..d0787ede 100644 --- a/src/dftracer/utils/core/rocksdb/filesystem.cpp +++ b/src/dftracer/utils/core/rocksdb/filesystem.cpp @@ -607,12 +607,12 @@ class DfTracerFileSystem final : public LocalFileSystemWrapper { ~DfTracerFileSystem() override { fallback_pool_.stop(); } - static const char* kClassName() { return "DfTracerFileSystem"; } + static const char* class_name() { return "DfTracerFileSystem"; } - const char* Name() const override { return kClassName(); } + const char* Name() const override { return class_name(); } bool IsInstanceOf(const std::string& name) const override { - return name == kClassName() || + return name == class_name() || LocalFileSystemWrapper::IsInstanceOf(name); } diff --git a/src/dftracer/utils/core/runtime.cpp b/src/dftracer/utils/core/runtime.cpp index b09c113e..1f8b3a01 100644 --- a/src/dftracer/utils/core/runtime.cpp +++ b/src/dftracer/utils/core/runtime.cpp @@ -77,6 +77,14 @@ TaskHandle Runtime::submit(coro::CoroTask task, std::string name) { p->set_value(); }; + // Set the executor on the task's promise so awaitables (e.g. channels) + // that capture `get_root_promise()->get_executor()` can schedule + // resumption. Without this, awaiters end up with executor=nullptr because + // the wrapping `coro::Coro` doesn't extend PromiseBase and the + // root-promise chain stops at the user's CoroTask. + if (task.handle()) { + task.handle().promise().set_executor(executor_.get()); + } auto coro = wrapper(std::move(task), promise, executor_.get(), tid); TaskIndex id = executor_->enqueue_tracked(std::move(coro), name, tid); @@ -142,4 +150,8 @@ void Runtime::shutdown() { std::size_t Runtime::threads() const { return threads_; } +std::size_t Runtime::io_threads() const { + return executor_ ? 
executor_->get_io_pool_size() : 0; +} + } // namespace dftracer::utils diff --git a/src/dftracer/utils/core/utils/timer.cpp b/src/dftracer/utils/core/utils/timer.cpp index cb3559f0..d7e879db 100644 --- a/src/dftracer/utils/core/utils/timer.cpp +++ b/src/dftracer/utils/core/utils/timer.cpp @@ -1,8 +1,11 @@ #include #include +#include #include #include +#include +#include namespace dftracer::utils { @@ -56,4 +59,40 @@ std::int64_t Timer::elapsed() const { } } +void Timer::increment(const std::string& key, std::uint64_t by) { + counters_[key] += by; +} + +void Timer::set_counter(const std::string& key, std::uint64_t value) { + counters_[key] = value; +} + +const std::unordered_map& Timer::counters() const { + return counters_; +} + +void Timer::print_stages(const std::string& prefix) const { + if (counters_.empty()) return; + + std::vector> sorted(counters_.begin(), + counters_.end()); + std::sort(sorted.begin(), sorted.end()); + + std::uint64_t total_ns = 0; + for (const auto& [_, ns] : sorted) total_ns += ns; + + if (!name_.empty()) { + std::printf("%s%s (%.2f ms)\n", prefix.c_str(), name_.c_str(), + static_cast(total_ns) / 1e6); + } + for (std::size_t i = 0; i < sorted.size(); ++i) { + const auto& [key, ns] = sorted[i]; + bool last = (i + 1 == sorted.size()); + double ms = static_cast(ns) / 1e6; + double pct = total_ns > 0 ? 100.0 * ns / total_ns : 0.0; + std::printf("%s%s %-28s %8.2f ms (%5.1f%%)\n", prefix.c_str(), + last ? 
"\\-- " : "|-- ", key.c_str(), ms, pct); + } +} + } // namespace dftracer::utils diff --git a/src/dftracer/utils/core/utils/timer.h b/src/dftracer/utils/core/utils/timer.h index 950dd88f..235612e1 100644 --- a/src/dftracer/utils/core/utils/timer.h +++ b/src/dftracer/utils/core/utils/timer.h @@ -2,7 +2,6 @@ #define DFTRACER_UTILS_CORE_UTILS_TIMER_H #include -#include #include #include #include @@ -18,6 +17,10 @@ class Timer { void start(); void stop(); std::int64_t elapsed() const; + void increment(const std::string& key, std::uint64_t by = 1); + void set_counter(const std::string& key, std::uint64_t value); + const std::unordered_map& counters() const; + void print_stages(const std::string& indent = " ") const; inline const std::string& name() const { return name_; } inline bool is_running() const { return running_; } @@ -48,6 +51,36 @@ class Timer { using Clock = std::chrono::high_resolution_clock; Clock::time_point start_time; Clock::time_point end_time; + std::unordered_map counters_; +}; + +/// Self-contained scoped timer. Each instance captures its own start +/// timestamp and writes elapsed nanoseconds to `timer->set_counter(key)` +/// on destruction, so ScopedTimers can be nested freely. 
+class ScopedTimer { + public: + ScopedTimer(Timer& timer, std::string key) + : timer_(&timer), key_(std::move(key)), start_(Clock::now()) {} + + ScopedTimer(Timer* timer, std::string key) + : timer_(timer), key_(std::move(key)), start_(Clock::now()) {} + + ~ScopedTimer() { + if (!timer_) return; + auto ns = std::chrono::duration_cast( + Clock::now() - start_) + .count(); + timer_->set_counter(key_, static_cast(ns)); + } + + ScopedTimer(const ScopedTimer&) = delete; + ScopedTimer& operator=(const ScopedTimer&) = delete; + + private: + using Clock = std::chrono::high_resolution_clock; + Timer* timer_; + std::string key_; + Clock::time_point start_; }; } // namespace dftracer::utils diff --git a/src/dftracer/utils/python/arrow_helpers.cpp b/src/dftracer/utils/python/arrow_helpers.cpp index 2141dc0b..9e0d1670 100644 --- a/src/dftracer/utils/python/arrow_helpers.cpp +++ b/src/dftracer/utils/python/arrow_helpers.cpp @@ -1,3 +1,4 @@ +#include #ifdef DFTRACER_UTILS_ENABLE_ARROW #define PY_SSIZE_T_CLEAN @@ -47,6 +48,31 @@ PyObject *wrap_arrow_table(PyObject *batch_list) { return table; } +PyObject *wrap_arrow_stream_table(PyObject *stream_obj) { + if (!stream_obj) { + PyErr_SetString(PyExc_RuntimeError, "stream_obj is NULL"); + return NULL; + } + + PyObject *mod = PyImport_ImportModule("dftracer.utils.arrow"); + if (!mod) { + Py_DECREF(stream_obj); + return NULL; + } + + PyObject *cls = PyObject_GetAttrString(mod, "ArrowTable"); + Py_DECREF(mod); + if (!cls) { + Py_DECREF(stream_obj); + return NULL; + } + + PyObject *table = PyObject_CallFunctionObjArgs(cls, stream_obj, NULL); + Py_DECREF(cls); + Py_DECREF(stream_obj); + return table; +} + PyObject *arrow_result_to_table(ArrowExportResult result) { PyObject *capsule = wrap_arrow_result(std::move(result)); if (!capsule) return NULL; diff --git a/src/dftracer/utils/python/arrow_helpers.h b/src/dftracer/utils/python/arrow_helpers.h index b6b7621f..d4957880 100644 --- a/src/dftracer/utils/python/arrow_helpers.h +++ 
b/src/dftracer/utils/python/arrow_helpers.h @@ -1,6 +1,7 @@ #ifndef DFTRACER_UTILS_PYTHON_ARROW_HELPERS_H #define DFTRACER_UTILS_PYTHON_ARROW_HELPERS_H +#include #ifdef DFTRACER_UTILS_ENABLE_ARROW #include @@ -23,6 +24,11 @@ PyObject *wrap_arrow_table(PyObject *batch_list); /// Returns a new reference, or NULL on error. PyObject *arrow_result_to_table(ArrowExportResult result); +/// Wrap an _ArrowBatchStream (or any __arrow_c_stream__ provider) in an +/// ArrowTable. Steals the reference to a non-NULL stream_obj on every path, +/// including errors. Returns a new reference, or NULL on error. +PyObject *wrap_arrow_stream_table(PyObject *stream_obj); + } // namespace dftracer::utils::python #endif // DFTRACER_UTILS_ENABLE_ARROW diff --git a/src/dftracer/utils/python/arrow_parallel_reader.cpp b/src/dftracer/utils/python/arrow_parallel_reader.cpp new file mode 100644 index 00000000..2779672c --- /dev/null +++ b/src/dftracer/utils/python/arrow_parallel_reader.cpp @@ -0,0 +1,212 @@ +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + +#define PY_SSIZE_T_CLEAN +#include +#include +#include +#include +#include + +#include +#include + +namespace dftracer::utils::python { + +using utilities::common::arrow::ArrowExportResult; +using utilities::common::arrow::read_arrow_files_parallel; + +static PyObject* py_read_arrow_files_parallel(PyObject* /*self*/, + PyObject* args, + PyObject* kwargs) { + static const char* kwlist[] = {"paths", "runtime", nullptr}; + PyObject* paths_obj = nullptr; + PyObject* runtime_obj = nullptr; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", + const_cast(kwlist), &paths_obj, + &runtime_obj)) { + return nullptr; + } + + // Convert paths to vector + if (!PyList_Check(paths_obj)) { + PyErr_SetString(PyExc_TypeError, "paths must be a list of strings"); + return nullptr; + } + + Py_ssize_t n = PyList_Size(paths_obj); + std::vector paths; + paths.reserve(n); + + for (Py_ssize_t i = 0; i < n; ++i) { + PyObject* item = PyList_GetItem(paths_obj, i); + if (!PyUnicode_Check(item)) { + 
PyErr_SetString(PyExc_TypeError, "all paths must be strings"); + return nullptr; + } + paths.push_back(PyUnicode_AsUTF8(item)); + } + + // Get runtime + Runtime* runtime = nullptr; + if (runtime_obj && runtime_obj != Py_None) { + if (!PyObject_TypeCheck(runtime_obj, &RuntimeType)) { + PyErr_SetString(PyExc_TypeError, + "runtime must be a Runtime object"); + return nullptr; + } + runtime = ((RuntimeObject*)runtime_obj)->runtime.get(); + } else { + runtime = get_default_runtime(); + } + + // Call C++ parallel reader (releases GIL during file I/O) + utilities::common::arrow::ParallelReadResult result; + bool had_error = false; + std::string error_msg; + + Py_BEGIN_ALLOW_THREADS try { + auto task = read_arrow_files_parallel(std::move(paths)); + result = runtime->submit(std::move(task), "read_arrow_files").get(); + } catch (const std::exception& e) { + had_error = true; + error_msg = e.what(); + } catch (...) { + had_error = true; + error_msg = "Unknown error in read_arrow_files"; + } + Py_END_ALLOW_THREADS + + if (had_error) { + PyErr_SetString(PyExc_RuntimeError, error_msg.c_str()); + return nullptr; + } + + // Build Python result dict + PyObject* file_results_list = PyList_New(result.file_results.size()); + if (!file_results_list) return nullptr; + + for (std::size_t i = 0; i < result.file_results.size(); ++i) { + const auto& fr = result.file_results[i]; + PyObject* fr_dict = PyDict_New(); + if (!fr_dict) { + Py_DECREF(file_results_list); + return nullptr; + } + + // path + PyObject* path_str = PyUnicode_FromString(fr.path.c_str()); + PyDict_SetItemString(fr_dict, "path", path_str); + Py_DECREF(path_str); + + // success + PyDict_SetItemString(fr_dict, "success", + fr.success ? 
Py_True : Py_False); + + // error + if (!fr.error.empty()) { + PyObject* err_str = PyUnicode_FromString(fr.error.c_str()); + PyDict_SetItemString(fr_dict, "error", err_str); + Py_DECREF(err_str); + } else { + Py_INCREF(Py_None); + PyDict_SetItemString(fr_dict, "error", Py_None); + } + + // total_rows + PyObject* rows = PyLong_FromLongLong(fr.total_rows); + PyDict_SetItemString(fr_dict, "total_rows", rows); + Py_DECREF(rows); + + // batches - list of ArrowBatchCapsule objects + PyObject* batches_list = PyList_New(fr.batches->size()); + if (!batches_list) { + Py_DECREF(fr_dict); + Py_DECREF(file_results_list); + return nullptr; + } + + for (std::size_t j = 0; j < fr.batches->size(); ++j) { + ArrowBatchCapsuleObject* capsule = + (ArrowBatchCapsuleObject*)ArrowBatchCapsuleType.tp_alloc( + &ArrowBatchCapsuleType, 0); + if (!capsule) { + Py_DECREF(batches_list); + Py_DECREF(fr_dict); + Py_DECREF(file_results_list); + return nullptr; + } + // Move the batch into the capsule + capsule->result = + new ArrowExportResult(std::move((*fr.batches)[j])); + PyList_SetItem(batches_list, j, (PyObject*)capsule); + } + + PyDict_SetItemString(fr_dict, "batches", batches_list); + Py_DECREF(batches_list); + + PyList_SetItem(file_results_list, i, fr_dict); + } + + // Build final result dict + PyObject* result_dict = PyDict_New(); + if (!result_dict) { + Py_DECREF(file_results_list); + return nullptr; + } + + PyDict_SetItemString(result_dict, "file_results", file_results_list); + Py_DECREF(file_results_list); + + PyObject* total_rows = PyLong_FromLongLong(result.total_rows); + PyDict_SetItemString(result_dict, "total_rows", total_rows); + Py_DECREF(total_rows); + + PyObject* total_batches = PyLong_FromLongLong(result.total_batches); + PyDict_SetItemString(result_dict, "total_batches", total_batches); + Py_DECREF(total_batches); + + PyObject* files_read = PyLong_FromSize_t(result.files_read); + PyDict_SetItemString(result_dict, "files_read", files_read); + Py_DECREF(files_read); + + 
PyObject* files_failed = PyLong_FromSize_t(result.files_failed); + PyDict_SetItemString(result_dict, "files_failed", files_failed); + Py_DECREF(files_failed); + + return result_dict; +} + +static PyMethodDef arrow_parallel_reader_methods[] = { + {"read_arrow_files_parallel", (PyCFunction)py_read_arrow_files_parallel, + METH_VARARGS | METH_KEYWORDS, + "Read multiple Arrow IPC files in parallel using the Runtime.\n\n" + "Args:\n" + " paths: List of file paths to read.\n" + " runtime: Optional Runtime object. Uses default if not provided.\n\n" + "Returns:\n" + " dict with:\n" + " - file_results: List of per-file results, each with:\n" + " - path: File path\n" + " - success: True if read succeeded\n" + " - error: Error message if failed, else None\n" + " - total_rows: Number of rows in file\n" + " - batches: List of ArrowBatch objects\n" + " - total_rows: Total rows across all files\n" + " - total_batches: Total batches across all files\n" + " - files_read: Number of files read successfully\n" + " - files_failed: Number of files that failed"}, + {nullptr, nullptr, 0, nullptr}}; + +int init_arrow_parallel_reader(PyObject* m) { + // Add the function to the module + if (PyModule_AddFunctions(m, arrow_parallel_reader_methods) < 0) { + return -1; + } + return 0; +} + +} // namespace dftracer::utils::python + +#endif // DFTRACER_UTILS_ENABLE_ARROW_IPC diff --git a/src/dftracer/utils/python/arrow_parallel_reader.h b/src/dftracer/utils/python/arrow_parallel_reader.h new file mode 100644 index 00000000..eee0041d --- /dev/null +++ b/src/dftracer/utils/python/arrow_parallel_reader.h @@ -0,0 +1,16 @@ +#ifndef DFTRACER_UTILS_PYTHON_ARROW_PARALLEL_READER_H +#define DFTRACER_UTILS_PYTHON_ARROW_PARALLEL_READER_H + +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + +#include + +namespace dftracer::utils::python { + +int init_arrow_parallel_reader(PyObject* m); + +} // namespace dftracer::utils::python + +#endif // DFTRACER_UTILS_ENABLE_ARROW_IPC +#endif // 
DFTRACER_UTILS_PYTHON_ARROW_PARALLEL_READER_H diff --git a/src/dftracer/utils/python/arrow_stream_capsule.cpp b/src/dftracer/utils/python/arrow_stream_capsule.cpp new file mode 100644 index 00000000..56029e4c --- /dev/null +++ b/src/dftracer/utils/python/arrow_stream_capsule.cpp @@ -0,0 +1,323 @@ +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW + +#define PY_SSIZE_T_CLEAN +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using ArrowExportResult = + dftracer::utils::utilities::common::arrow::ArrowExportResult; + +namespace { + +// Drain until K consecutive batches add no new columns, bounded by MAX. +constexpr int STABLE_BATCHES = 5; +constexpr int MAX_DRAIN = 128; + +struct StreamPrivate { + std::shared_ptr state; + dftracer::utils::python::SchemaReconciler reconciler; + // Drained during discovery, emitted first from get_next. + std::deque pending; + std::string last_error; + bool initialized = false; + // Sticky: once set, all entry points short-circuit to EIO. + bool error_set = false; +}; + +static void mark_error(StreamPrivate *p, std::string msg) { + if (p->last_error.empty()) p->last_error = std::move(msg); + p->error_set = true; + p->initialized = true; +} + +static int initialize_stream(StreamPrivate *p) { + if (p->error_set) return EIO; + if (p->initialized) return 0; + auto *astate = p->state.get(); + + int stable_run = 0; + int drained = 0; + while (stable_run < STABLE_BATCHES && drained < MAX_DRAIN) { + auto batch = astate->channel->blocking_receive(); + if (!batch.has_value()) { + // End-of-stream or producer error before discovery converged. + std::lock_guard lock(astate->error_mtx); + if (astate->error) { + try { + std::rethrow_exception(astate->error); + } catch (const std::exception &e) { + mark_error(p, e.what()); + } catch (...) 
{ + mark_error(p, "unknown error in Arrow stream"); + } + return EIO; + } + break; // clean early EOS; finalize with whatever we have + } + auto dequeued = dftracer::utils::python::byte_size(*batch); + astate->bytes_in_queue.fetch_sub(dequeued, std::memory_order_acq_rel); + + bool added = p->reconciler.merge(batch->get_schema()); + if (!p->reconciler.last_error().empty()) { + mark_error(p, p->reconciler.last_error()); + return EIO; + } + p->pending.push_back(std::move(*batch)); + stable_run = added ? 0 : (stable_run + 1); + ++drained; + } + + if (p->reconciler.finalize() != 0) { + mark_error(p, p->reconciler.last_error().empty() + ? "failed to finalize schema union" + : p->reconciler.last_error()); + return EIO; + } + p->initialized = true; + return 0; +} + +static int stream_get_schema(struct ArrowArrayStream *s, + struct ArrowSchema *out) { + auto *p = static_cast(s->private_data); + int rc = initialize_stream(p); + if (rc != 0) return rc; + if (p->error_set) return EIO; + if (p->reconciler.copy_schema(out) != 0) { + mark_error(p, p->reconciler.last_error().empty() + ? "failed to copy locked schema" + : p->reconciler.last_error()); + return EIO; + } + return 0; +} + +static int stream_get_next(struct ArrowArrayStream *s, struct ArrowArray *out) { + auto *p = static_cast(s->private_data); + if (p->error_set) return EIO; + if (!p->initialized) { + int rc = initialize_stream(p); + if (rc != 0) return rc; + } + + // Drain any discovery-phase batches first, then pull from the channel. + std::optional batch; + if (!p->pending.empty()) { + batch = std::move(p->pending.front()); + p->pending.pop_front(); + } else { + auto *astate = p->state.get(); + batch = astate->channel->blocking_receive(); + if (!batch.has_value()) { + std::lock_guard lock(astate->error_mtx); + if (astate->error) { + try { + std::rethrow_exception(astate->error); + } catch (const std::exception &e) { + mark_error(p, e.what()); + } catch (...) 
{ + mark_error(p, "unknown error in Arrow stream"); + } + return EIO; + } + // End of stream per Arrow C spec: return success with + // out->release == nullptr. + out->release = nullptr; + return 0; + } + auto dequeued = dftracer::utils::python::byte_size(*batch); + astate->bytes_in_queue.fetch_sub(dequeued, std::memory_order_acq_rel); + } + + if (p->reconciler.reconcile(batch->get_schema(), batch->get_array(), out) != + 0) { + mark_error(p, p->reconciler.last_error().empty() + ? "schema reconciliation failed" + : p->reconciler.last_error()); + return EIO; + } + return 0; +} + +static const char *stream_get_last_error(struct ArrowArrayStream *s) { + auto *p = static_cast(s->private_data); + if (!p || p->last_error.empty()) return nullptr; + return p->last_error.c_str(); +} + +static void stream_release(struct ArrowArrayStream *s) { + auto *p = static_cast(s->private_data); + if (p) { + if (p->state) { + p->state->cancelled.store(true, std::memory_order_release); + if (p->state->channel) p->state->channel->close(); + if (p->state->task_future.valid()) { + // Release the GIL if this callback was invoked from a + // Python-holding context (e.g. capsule destructor during + // GC). If the GIL is not held (pyarrow's C reader path), + // PyGILState_Check() returns 0 and we wait + // without touching the Python thread state.
+ if (Py_IsInitialized() && PyGILState_Check()) { + Py_BEGIN_ALLOW_THREADS p->state->task_future.wait(); + Py_END_ALLOW_THREADS + } else { + p->state->task_future.wait(); + } + } + } + delete p; + } + s->private_data = nullptr; + s->release = nullptr; +} + +static void release_stream_capsule(PyObject *capsule) { + auto *stream = static_cast( + PyCapsule_GetPointer(capsule, "arrow_array_stream")); + if (stream && stream->release) { + stream->release(stream); + } + delete stream; +} + +static PyObject *ArrowBatchStream_arrow_c_stream(ArrowBatchStreamObject *self, + PyObject *args) { + PyObject *requested_schema = Py_None; + if (!PyArg_ParseTuple(args, "|O", &requested_schema)) return NULL; + + // Per the PyCapsule protocol, a non-None `requested_schema` means the + // caller wants the stream cast to that schema. We only emit our native + // schema today; reject explicitly so misuse fails loudly instead of + // silently returning arrays that don't match what the caller asked for. + if (requested_schema != Py_None) { + PyErr_SetString(PyExc_NotImplementedError, + "iter_arrow_stream does not support " + "requested_schema casting; pass None to use the " + "native schema."); + return NULL; + } + + if (self->consumed || !self->state) { + PyErr_SetString(PyExc_RuntimeError, + "Arrow stream already exported via " + "__arrow_c_stream__; each stream can be " + "exported only once."); + return NULL; + } + + auto *priv = new StreamPrivate; + priv->state = self->state; + self->consumed = true; + self->state.reset(); + + auto *stream = new ArrowArrayStream; + std::memset(stream, 0, sizeof(*stream)); + stream->get_schema = stream_get_schema; + stream->get_next = stream_get_next; + stream->get_last_error = stream_get_last_error; + stream->release = stream_release; + stream->private_data = priv; + + PyObject *capsule = + PyCapsule_New(stream, "arrow_array_stream", release_stream_capsule); + if (!capsule) { + stream->release(stream); + delete stream; + return NULL; + } + return 
capsule; +} + +static void ArrowBatchStream_dealloc(ArrowBatchStreamObject *self) { + if (self->state) { + self->state->cancelled.store(true, std::memory_order_release); + if (self->state->channel) self->state->channel->close(); + Py_BEGIN_ALLOW_THREADS if (self->state->task_future.valid()) { + self->state->task_future.wait(); + } + Py_END_ALLOW_THREADS + } + self->state.~shared_ptr(); + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static PyMethodDef ArrowBatchStream_methods[] = { + {"__arrow_c_stream__", (PyCFunction)ArrowBatchStream_arrow_c_stream, + METH_VARARGS, "Export as Arrow C Data Interface stream PyCapsule"}, + {NULL}}; + +} // namespace + +PyTypeObject ArrowBatchStreamType = { + PyVarObject_HEAD_INIT(NULL, 0) "dftracer_utils_ext._ArrowBatchStream", + sizeof(ArrowBatchStreamObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)ArrowBatchStream_dealloc, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + Py_TPFLAGS_DEFAULT, + "Zero-iteration Arrow stream backed by a C++ coroutine channel", + 0, + 0, + 0, + 0, + 0, + 0, + ArrowBatchStream_methods, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, +}; + +int init_arrow_batch_stream(PyObject *m) { + if (PyType_Ready(&ArrowBatchStreamType) < 0) return -1; + Py_INCREF(&ArrowBatchStreamType); + if (PyModule_AddObject(m, "_ArrowBatchStream", + (PyObject *)&ArrowBatchStreamType) < 0) { + Py_DECREF(&ArrowBatchStreamType); + return -1; + } + return 0; +} + +PyObject *make_arrow_batch_stream(std::shared_ptr state) { + auto *obj = (ArrowBatchStreamObject *)ArrowBatchStreamType.tp_alloc( + &ArrowBatchStreamType, 0); + if (!obj) return NULL; + new (&obj->state) std::shared_ptr(std::move(state)); + obj->consumed = false; + return (PyObject *)obj; +} + +#endif diff --git a/src/dftracer/utils/python/arrow_stream_capsule.h b/src/dftracer/utils/python/arrow_stream_capsule.h new file mode 100644 index 00000000..4f0d7d03 --- /dev/null +++ b/src/dftracer/utils/python/arrow_stream_capsule.h @@ 
-0,0 +1,25 @@ +#ifndef DFTRACER_UTILS_PYTHON_ARROW_STREAM_CAPSULE_H +#define DFTRACER_UTILS_PYTHON_ARROW_STREAM_CAPSULE_H + +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW + +#define PY_SSIZE_T_CLEAN +#include +#include + +#include + +typedef struct { + PyObject_HEAD std::shared_ptr state; + bool consumed; +} ArrowBatchStreamObject; + +extern PyTypeObject ArrowBatchStreamType; + +int init_arrow_batch_stream(PyObject *m); + +PyObject *make_arrow_batch_stream(std::shared_ptr state); + +#endif +#endif diff --git a/src/dftracer/utils/python/batch_byte_size.h b/src/dftracer/utils/python/batch_byte_size.h new file mode 100644 index 00000000..3921cf41 --- /dev/null +++ b/src/dftracer/utils/python/batch_byte_size.h @@ -0,0 +1,55 @@ +#ifndef DFTRACER_UTILS_PYTHON_BATCH_BYTE_SIZE_H +#define DFTRACER_UTILS_PYTHON_BATCH_BYTE_SIZE_H + +#include +#include +#include + +#include + +#ifdef DFTRACER_UTILS_ENABLE_ARROW +#include +#endif + +namespace dftracer::utils::python { + +inline std::size_t byte_size(const MemoryViewBatchData &b) { + return b.buffer.capacity() + b.offsets.capacity() * sizeof(Py_ssize_t) + + b.lengths.capacity() * sizeof(Py_ssize_t); +} + +inline std::size_t byte_size(const JsonDictBatch &b) { + static constexpr std::size_t ESTIMATED_EVENT_BYTES = 512; + return b.events.capacity() * ESTIMATED_EVENT_BYTES; +} + +#ifdef DFTRACER_UTILS_ENABLE_ARROW + +inline std::size_t arrow_array_byte_size(const ArrowArray *arr) { + if (!arr || !arr->release) return 0; + std::size_t total = 0; + for (int64_t i = 0; i < arr->n_buffers; ++i) { + if (arr->buffers[i]) { + // For variable-length buffers, use length * estimated element size + // For validity/offset buffers, use (length + 7) / 8 or length * 4 + // Conservative estimate: buffer contributes proportionally to + // length + total += static_cast(arr->length) * 8; + } + } + for (int64_t i = 0; i < arr->n_children; ++i) { + total += arrow_array_byte_size(arr->children[i]); + } + return total; +} + +inline std::size_t byte_size( 
+ const dftracer::utils::utilities::common::arrow::ArrowExportResult &b) { + return arrow_array_byte_size(b.get_array()); +} + +#endif // DFTRACER_UTILS_ENABLE_ARROW + +} // namespace dftracer::utils::python + +#endif // DFTRACER_UTILS_PYTHON_BATCH_BYTE_SIZE_H diff --git a/src/dftracer/utils/python/batch_indexer.cpp b/src/dftracer/utils/python/batch_indexer.cpp new file mode 100644 index 00000000..ef7bb8be --- /dev/null +++ b/src/dftracer/utils/python/batch_indexer.cpp @@ -0,0 +1,2554 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DFTRACER_UTILS_ENABLE_ARROW +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +using dftracer::utils::CoroScope; +using dftracer::utils::Runtime; +using dftracer::utils::coro::CoroTask; +using namespace dftracer::utils::utilities::composites::dft::indexing; +using namespace dftracer::utils::utilities::composites::dft::aggregators; + +// --------------------------------------------------------------------------- +// BatchIndexer - directory-level indexer with resolve/build pattern +// --------------------------------------------------------------------------- + +static void Indexer_dealloc(IndexerObject* self) { + Py_XDECREF(self->runtime_obj); + Py_XDECREF(self->directory); + Py_XDECREF(self->files); + Py_XDECREF(self->index_dir); + Py_XDECREF(self->group_keys); + Py_XDECREF(self->custom_metric_fields); + Py_TYPE(self)->tp_free((PyObject*)self); +} + +static PyObject* Indexer_new(PyTypeObject* type, PyObject* args, + PyObject* kwds) { + IndexerObject* self = (IndexerObject*)type->tp_alloc(type, 0); + if (self) { + self->runtime_obj = nullptr; + self->directory = nullptr; + self->files = nullptr; + self->index_dir = nullptr; + self->require_checkpoint = 1; + self->require_bloom = 1; + self->require_manifest = 1; + 
self->require_aggregation = 0; + self->time_interval_ms = 5000.0; + self->group_keys = nullptr; + self->custom_metric_fields = nullptr; + self->compute_percentiles = 0; + self->checkpoint_size = 32 * 1024 * 1024; + self->parallelism = 0; + self->force_rebuild = 0; + } + return (PyObject*)self; +} + +static int Indexer_init(IndexerObject* self, PyObject* args, PyObject* kwds) { + static const char* kwlist[] = {"directory", + "files", + "index_dir", + "require_checkpoint", + "require_bloom", + "require_manifest", + "require_aggregation", + "time_interval_ms", + "group_keys", + "custom_metric_fields", + "compute_percentiles", + "checkpoint_size", + "parallelism", + "force_rebuild", + "runtime", + nullptr}; + + const char* directory = ""; + PyObject* files_obj = Py_None; + const char* index_dir = ""; + int require_checkpoint = 1; + int require_bloom = 1; + int require_manifest = 1; + int require_aggregation = 0; + double time_interval_ms = 5000.0; + PyObject* group_keys_obj = Py_None; + PyObject* custom_metrics_obj = Py_None; + int compute_percentiles = 0; + Py_ssize_t checkpoint_size = 32 * 1024 * 1024; // 32MB default + Py_ssize_t parallelism = 0; + int force_rebuild = 0; + PyObject* runtime_arg = nullptr; + + if (!PyArg_ParseTupleAndKeywords( + args, kwds, "|sOsppppdOOpnnpO", (char**)kwlist, &directory, + &files_obj, &index_dir, &require_checkpoint, &require_bloom, + &require_manifest, &require_aggregation, &time_interval_ms, + &group_keys_obj, &custom_metrics_obj, &compute_percentiles, + &checkpoint_size, &parallelism, &force_rebuild, &runtime_arg)) { + return -1; + } + + // Validate: at least one of directory or files must be provided + bool has_directory = directory && directory[0] != '\0'; + bool has_files = files_obj && files_obj != Py_None && + PyList_Check(files_obj) && PyList_Size(files_obj) > 0; + + if (!has_directory && !has_files) { + PyErr_SetString(PyExc_ValueError, + "At least one of 'directory' or 'files' must be " + "provided"); + return -1; + } + + // 
Store runtime + if (runtime_arg && runtime_arg != Py_None) { + if (PyObject_TypeCheck(runtime_arg, &RuntimeType)) { + Py_INCREF(runtime_arg); + self->runtime_obj = runtime_arg; + } else { + PyObject* native = PyObject_GetAttrString(runtime_arg, "_native"); + if (native && PyObject_TypeCheck(native, &RuntimeType)) { + self->runtime_obj = native; + } else { + Py_XDECREF(native); + PyErr_SetString(PyExc_TypeError, + "runtime must be a Runtime instance or None"); + return -1; + } + } + } + + self->directory = PyUnicode_FromString(directory); + self->index_dir = PyUnicode_FromString(index_dir); + self->require_checkpoint = require_checkpoint; + self->require_bloom = require_bloom; + self->require_manifest = require_manifest; + self->require_aggregation = require_aggregation; + self->time_interval_ms = time_interval_ms; + self->compute_percentiles = compute_percentiles; + self->checkpoint_size = static_cast(checkpoint_size); + self->parallelism = static_cast(parallelism); + self->force_rebuild = force_rebuild; + + // Store files list + if (has_files) { + Py_INCREF(files_obj); + self->files = files_obj; + } else { + self->files = nullptr; + } + + // Store group_keys + if (group_keys_obj && group_keys_obj != Py_None) { + Py_INCREF(group_keys_obj); + self->group_keys = group_keys_obj; + } else { + self->group_keys = nullptr; + } + + // Store custom_metric_fields + if (custom_metrics_obj && custom_metrics_obj != Py_None) { + Py_INCREF(custom_metrics_obj); + self->custom_metric_fields = custom_metrics_obj; + } else { + self->custom_metric_fields = nullptr; + } + + return 0; +} + +static Runtime* get_batch_indexer_runtime(IndexerObject* self) { + if (self->runtime_obj) { + return ((RuntimeObject*)self->runtime_obj)->runtime.get(); + } + return get_default_runtime(); +} + +static std::optional build_aggregation_config( + IndexerObject* self) { + if (!self->require_aggregation) { + return std::nullopt; + } + + AggregationConfig config; + config.time_interval_us = + 
static_cast(self->time_interval_ms * 1000.0); + + if (self->group_keys && PyList_Check(self->group_keys)) { + Py_ssize_t n = PyList_Size(self->group_keys); + for (Py_ssize_t i = 0; i < n; i++) { + const char* s = + PyUnicode_AsUTF8(PyList_GetItem(self->group_keys, i)); + if (s) config.extra_group_keys.emplace_back(s); + } + } + if (self->custom_metric_fields && + PyList_Check(self->custom_metric_fields)) { + Py_ssize_t n = PyList_Size(self->custom_metric_fields); + for (Py_ssize_t i = 0; i < n; i++) { + const char* s = + PyUnicode_AsUTF8(PyList_GetItem(self->custom_metric_fields, i)); + if (s) config.custom_metric_fields.emplace_back(s); + } + } + + config.compute_percentiles = self->compute_percentiles != 0; + return config; +} + +// --------------------------------------------------------------------------- +// resolve() - check what exists vs needs building +// --------------------------------------------------------------------------- + +static PyObject* Indexer_resolve(IndexerObject* self, + PyObject* Py_UNUSED(ignored)) { + const char* directory = PyUnicode_AsUTF8(self->directory); + const char* index_dir = PyUnicode_AsUTF8(self->index_dir); + + ResolverInput input; + input.directory = directory ? directory : ""; + input.index_dir = index_dir ? 
index_dir : ""; + input.require_checkpoints = self->require_checkpoint; + input.require_bloom = self->require_bloom; + input.require_manifest = self->require_manifest; + input.require_aggregation = self->require_aggregation; + input.aggregation_config = build_aggregation_config(self); + + // Add files if provided + if (self->files && PyList_Check(self->files)) { + Py_ssize_t n = PyList_Size(self->files); + for (Py_ssize_t i = 0; i < n; i++) { + const char* s = PyUnicode_AsUTF8(PyList_GetItem(self->files, i)); + if (s) input.files.emplace_back(s); + } + } + + ResolverResult result; + std::string error_msg; + + Py_BEGIN_ALLOW_THREADS try { + Runtime* rt = get_batch_indexer_runtime(self); + rt->submit(run_coro_scope( + rt->executor(), + [](CoroScope& scope, ResolverInput in, + ResolverResult* out) -> CoroTask { + IndexResolverUtility resolver; + // Use scope.spawn(utility, input) which auto-binds + // context for utilities with NeedsContext tag + *out = co_await scope.spawn(resolver, std::move(in)); + }, + std::move(input), &result), + "batch-indexer-resolve") + .get(); + } catch (const std::exception& e) { + error_msg = e.what(); + } + Py_END_ALLOW_THREADS + + if (!error_msg.empty()) { + PyErr_SetString(PyExc_RuntimeError, error_msg.c_str()); + return nullptr; + } + + // Build result dict + PyObject* dict = PyDict_New(); + if (!dict) return nullptr; + + PyDict_SetItemString(dict, "total_files", + PyLong_FromSize_t(result.all_files.size())); + PyDict_SetItemString(dict, "index_path", + PyUnicode_FromString(result.index_path.c_str())); + + // Ready files + PyObject* ready_list = PyList_New(result.cached.size()); + for (std::size_t i = 0; i < result.cached.size(); ++i) { + PyList_SetItem( + ready_list, i, + PyUnicode_FromString(result.cached[i].file_path.c_str())); + } + PyDict_SetItemString(dict, "ready", ready_list); + + // Needs work files (union of all needs_* lists) + std::vector needs_work; + for (const auto& item : result.needs_checkpoint) { + 
needs_work.push_back(item.file_path); + } + for (const auto& item : result.needs_bloom) { + bool found = false; + for (const auto& existing : needs_work) { + if (existing == item.file_path) { + found = true; + break; + } + } + if (!found) needs_work.push_back(item.file_path); + } + for (const auto& item : result.needs_manifest) { + bool found = false; + for (const auto& existing : needs_work) { + if (existing == item.file_path) { + found = true; + break; + } + } + if (!found) needs_work.push_back(item.file_path); + } + for (const auto& item : result.needs_aggregation) { + bool found = false; + for (const auto& existing : needs_work) { + if (existing == item.file_path) { + found = true; + break; + } + } + if (!found) needs_work.push_back(item.file_path); + } + + PyObject* needs_list = PyList_New(needs_work.size()); + for (std::size_t i = 0; i < needs_work.size(); ++i) { + PyList_SetItem(needs_list, i, + PyUnicode_FromString(needs_work[i].c_str())); + } + PyDict_SetItemString(dict, "needs_work", needs_list); + + return dict; +} + +// --------------------------------------------------------------------------- +// build() - build missing index tiers +// --------------------------------------------------------------------------- + +static PyObject* Indexer_build(IndexerObject* self, + PyObject* Py_UNUSED(ignored)) { + const char* directory = PyUnicode_AsUTF8(self->directory); + const char* index_dir = PyUnicode_AsUTF8(self->index_dir); + + ResolveAndBuildInput input; + input.directory = directory ? directory : ""; + input.index_dir = index_dir ? 
index_dir : "";
+    input.require_checkpoints = self->require_checkpoint;
+    input.require_bloom = self->require_bloom;
+    input.require_manifest = self->require_manifest;
+    input.require_aggregation = self->require_aggregation;
+    input.aggregation_config = build_aggregation_config(self);
+    input.checkpoint_size = self->checkpoint_size;
+    input.parallelism = self->parallelism;
+    input.force_rebuild = self->force_rebuild;
+
+    // Add files if provided
+    // NOTE(review): when an entry is not a str, PyUnicode_AsUTF8 returns
+    // NULL with a Python error set; the entry is skipped here but the error
+    // is not cleared.
+    if (self->files && PyList_Check(self->files)) {
+        Py_ssize_t n = PyList_Size(self->files);
+        for (Py_ssize_t i = 0; i < n; i++) {
+            const char* s = PyUnicode_AsUTF8(PyList_GetItem(self->files, i));
+            if (s) input.files.emplace_back(s);
+        }
+    }
+
+    std::string error_msg;
+
+    // GIL released for the duration of the build; errors leave the region
+    // as a string and are turned into RuntimeError afterwards.
+    // NOTE(review): only std::exception is caught here.
+    Py_BEGIN_ALLOW_THREADS try {
+        Runtime* rt = get_batch_indexer_runtime(self);
+        rt->submit(run_coro_scope(
+                       rt->executor(),
+                       [](CoroScope& scope,
+                          ResolveAndBuildInput in) -> CoroTask {
+                           co_await resolve_and_build_index(&scope,
+                                                            std::move(in));
+                       },
+                       std::move(input)),
+                   "batch-indexer-build")
+            .get();
+    } catch (const std::exception& e) {
+        error_msg = e.what();
+    }
+    Py_END_ALLOW_THREADS
+
+    if (!error_msg.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+
+    Py_RETURN_NONE;
+}
+
+// ---------------------------------------------------------------------------
+// ensure_indexed() - resolve + build if needed
+// ---------------------------------------------------------------------------
+
+// Resolves the index state; if the "needs_work" list is non-empty, runs a
+// build and resolves again. Returns the (possibly refreshed) status dict as
+// produced by Indexer_resolve, or nullptr if resolve/build failed.
+static PyObject* Indexer_ensure_indexed(IndexerObject* self,
+                                        PyObject* Py_UNUSED(ignored)) {
+    // First resolve
+    PyObject* status = Indexer_resolve(self, nullptr);
+    if (!status) return nullptr;
+
+    // Check if needs_work is non-empty
+    // (PyDict_GetItemString returns a borrowed reference; it is not used
+    // after `status` is released below.)
+    PyObject* needs_work = PyDict_GetItemString(status, "needs_work");
+    if (needs_work && PyList_Size(needs_work) > 0) {
+        Py_DECREF(status);
+
+        // Build
+        PyObject* result = Indexer_build(self, nullptr);
+        if (!result) return nullptr;
+        Py_DECREF(result);
+
+        // Re-resolve
+        status = Indexer_resolve(self, nullptr);
+    }
+
+    return status;
+}
+
+// ---------------------------------------------------------------------------
+// get_checkpoint_indexer() - get a single-file checkpoint indexer
+// ---------------------------------------------------------------------------
+
+// Python args: (file_path: str). Allocates a CheckpointIndexer object for
+// that file, using this Indexer's index_dir, checkpoint_size and runtime,
+// with bloom/manifest building disabled. Raises RuntimeError when the
+// native handle cannot be created.
+static PyObject* Indexer_get_checkpoint_indexer(IndexerObject* self,
+                                                PyObject* args) {
+    const char* file_path = nullptr;
+    if (!PyArg_ParseTuple(args, "s", &file_path)) {
+        return nullptr;
+    }
+
+    // Determine index path using BatchIndexer's index_dir setting
+    const char* index_dir = PyUnicode_AsUTF8(self->index_dir);
+    std::string index_path = dftracer::utils::utilities::composites::dft::
+        internal::determine_index_path(file_path, index_dir ? index_dir : "");
+
+    // Create IndexerObject
+    CheckpointIndexerObject* indexer =
+        (CheckpointIndexerObject*)CheckpointIndexerType.tp_alloc(
+            &CheckpointIndexerType, 0);
+    if (!indexer) {
+        return nullptr;
+    }
+
+    // handle is set to null first so the object is in a defined state if
+    // it is released before dft_indexer_create succeeds.
+    indexer->handle = nullptr;
+    indexer->gz_path = PyUnicode_FromString(file_path);
+    indexer->index_path = PyUnicode_FromString(index_path.c_str());
+    indexer->checkpoint_size = self->checkpoint_size;
+    indexer->build_bloom = 0;
+    indexer->build_manifest = 0;
+
+    // Share runtime reference
+    if (self->runtime_obj) {
+        Py_INCREF(self->runtime_obj);
+        indexer->runtime_obj = self->runtime_obj;
+    } else {
+        indexer->runtime_obj = nullptr;
+    }
+
+    // Create the native handle
+    indexer->handle = dft_indexer_create(file_path, index_path.c_str(),
+                                         self->checkpoint_size, 0);
+    if (!indexer->handle) {
+        Py_DECREF((PyObject*)indexer);
+        PyErr_SetString(PyExc_RuntimeError,
+                        "Failed to create checkpoint indexer");
+        return nullptr;
+    }
+
+    return (PyObject*)indexer;
+}
+
+// Runs Indexer_resolve and extracts the "index_path" entry as a C++ string.
+// Yields std::nullopt with a Python error set when resolve fails or the path
+// is missing/empty.
+static std::optional resolve_index_path(IndexerObject* self) {
+    PyObject* status = Indexer_resolve(self, nullptr);
+    if (!status) return std::nullopt;
+    PyObject* obj = PyDict_GetItemString(status, "index_path");
+    const char* path = obj ?
PyUnicode_AsUTF8(obj) : nullptr;
+    if (!path || path[0] == '\0') {
+        Py_DECREF(status);
+        PyErr_SetString(PyExc_RuntimeError, "No index path available");
+        return std::nullopt;
+    }
+    // Copy before releasing `status`: `path` points into the dict's value.
+    std::string result(path);
+    Py_DECREF(status);
+    return result;
+}
+
+// Python args: (type: str) where type is 'file', 'host', 'string' or 'proc'.
+// Opens the index database read-only and returns a dict mapping hash -> name
+// for the requested table. Raises ValueError for an unknown type and
+// RuntimeError if the database query throws.
+static PyObject* Indexer_get_hash_table(IndexerObject* self, PyObject* args) {
+    const char* type_str = nullptr;
+    if (!PyArg_ParseTuple(args, "s", &type_str)) {
+        return nullptr;
+    }
+
+    using dftracer::utils::utilities::indexer::IndexDatabase;
+    using HashType = IndexDatabase::HashType;
+
+    HashType type;
+    if (std::strcmp(type_str, "file") == 0) {
+        type = HashType::FILE;
+    } else if (std::strcmp(type_str, "host") == 0) {
+        type = HashType::HOST;
+    } else if (std::strcmp(type_str, "string") == 0) {
+        type = HashType::STRING;
+    } else if (std::strcmp(type_str, "proc") == 0) {
+        type = HashType::PROC;
+    } else {
+        PyErr_SetString(PyExc_ValueError,
+                        "type must be 'file', 'host', 'string', or 'proc'");
+        return nullptr;
+    }
+
+    auto idx_opt = resolve_index_path(self);
+    if (!idx_opt) return nullptr;
+    std::string index_path = std::move(*idx_opt);
+
+    std::unordered_map hash_map;
+    std::string error_msg;
+
+    // Database access happens with the GIL released.
+    Py_BEGIN_ALLOW_THREADS try {
+        IndexDatabase db(
+            index_path,
+            dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+        hash_map = db.query_hash_table(type);
+    } catch (const std::exception& e) {
+        error_msg = e.what();
+    }
+    Py_END_ALLOW_THREADS
+
+    if (!error_msg.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+
+    PyObject* dict = PyDict_New();
+    if (!dict) return nullptr;
+
+    // NOTE(review): key/val are not checked for NULL before PyDict_SetItem
+    // and Py_DECREF, and the PyDict_SetItem result is ignored.
+    for (const auto& [hash, name] : hash_map) {
+        PyObject* key = PyUnicode_FromStringAndSize(hash.data(), hash.size());
+        PyObject* val = PyUnicode_FromStringAndSize(name.data(), name.size());
+        PyDict_SetItem(dict, key, val);
+        Py_DECREF(key);
+        Py_DECREF(val);
+    }
+
+    return dict;
+}
+
+// Python args: (file_id: int). Returns a set of the pids the index database
+// reports for that file id; RuntimeError if the query throws.
+static PyObject* Indexer_query_file_pids(IndexerObject* self, PyObject* args) {
+    int file_id;
+    if (!PyArg_ParseTuple(args, "i", &file_id)) {
+        return nullptr;
+    }
+
+    using dftracer::utils::utilities::indexer::IndexDatabase;
+
+    auto idx_opt = resolve_index_path(self);
+    if (!idx_opt) return nullptr;
+    std::string index_path = std::move(*idx_opt);
+
+    std::unordered_set pids;
+    std::string error_msg;
+
+    Py_BEGIN_ALLOW_THREADS try {
+        IndexDatabase db(
+            index_path,
+            dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+        pids = db.query_file_pids(file_id);
+    } catch (const std::exception& e) {
+        error_msg = e.what();
+    }
+    Py_END_ALLOW_THREADS
+
+    if (!error_msg.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+
+    PyObject* set = PySet_New(nullptr);
+    if (!set) return nullptr;
+
+    // PySet_Add does not steal `val`, hence the explicit release.
+    for (auto pid : pids) {
+        PyObject* val = PyLong_FromUnsignedLongLong(pid);
+        PySet_Add(set, val);
+        Py_DECREF(val);
+    }
+
+    return set;
+}
+
+// Returns a dict mapping file id -> set of pids for every file the index
+// database knows about; RuntimeError if the query throws.
+static PyObject* Indexer_query_all_file_pids(IndexerObject* self,
+                                             PyObject* Py_UNUSED(ignored)) {
+    using dftracer::utils::utilities::indexer::IndexDatabase;
+
+    auto idx_opt = resolve_index_path(self);
+    if (!idx_opt) return nullptr;
+    std::string index_path = std::move(*idx_opt);
+
+    std::unordered_map> all_pids;
+    std::string error_msg;
+
+    Py_BEGIN_ALLOW_THREADS try {
+        IndexDatabase db(
+            index_path,
+            dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+        all_pids = db.query_all_file_pids();
+    } catch (const std::exception& e) {
+        error_msg = e.what();
+    }
+    Py_END_ALLOW_THREADS
+
+    if (!error_msg.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+
+    PyObject* dict = PyDict_New();
+    if (!dict) return nullptr;
+
+    // NOTE(review): allocation results (key, set, val) are used unchecked.
+    for (const auto& [file_id, pids] : all_pids) {
+        PyObject* key = PyLong_FromLong(file_id);
+        PyObject* set = PySet_New(nullptr);
+        for (auto pid : pids) {
+            PyObject* val = PyLong_FromUnsignedLongLong(pid);
+            PySet_Add(set, val);
+            Py_DECREF(val);
+        }
+        PyDict_SetItem(dict, key, set);
+        Py_DECREF(key);
+        Py_DECREF(set);
+    }
+
+    return dict;
+}
+
+// Returns a 2-tuple (id_to_path, pid_dict):
+//   id_to_path : dict file id -> path, formed by joining each logical name
+//                onto the parent directory of the canonicalised index path
+//   pid_dict   : dict file id -> set of pids
+// Both tables are read in a single read-only database session.
+static PyObject* Indexer_query_file_info(IndexerObject* self,
+                                         PyObject* Py_UNUSED(ignored)) {
+    using dftracer::utils::utilities::indexer::IndexDatabase;
+
+    auto idx_opt = resolve_index_path(self);
+    if (!idx_opt) return nullptr;
+    std::string index_path = std::move(*idx_opt);
+
+    std::unordered_map file_ids;
+    std::unordered_map> all_pids;
+    std::string error_msg;
+
+    Py_BEGIN_ALLOW_THREADS try {
+        IndexDatabase db(
+            index_path,
+            dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+        file_ids = db.query_all_file_info_ids();
+        all_pids = db.query_all_file_pids();
+    } catch (const std::exception& e) {
+        error_msg = e.what();
+    }
+    Py_END_ALLOW_THREADS
+
+    if (!error_msg.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+
+    auto data_dir = fs::weakly_canonical(fs::path(index_path)).parent_path();
+
+    PyObject* id_to_path = PyDict_New();
+    if (!id_to_path) return nullptr;
+    for (const auto& [logical_name, fid] : file_ids) {
+        auto resolved = (data_dir / logical_name).string();
+        PyObject* key = PyLong_FromLong(fid);
+        PyObject* val = PyUnicode_FromStringAndSize(
+            resolved.data(), static_cast(resolved.size()));
+        PyDict_SetItem(id_to_path, key, val);
+        Py_DECREF(key);
+        Py_DECREF(val);
+    }
+
+    PyObject* pid_dict = PyDict_New();
+    if (!pid_dict) {
+        Py_DECREF(id_to_path);
+        return nullptr;
+    }
+    for (const auto& [file_id, pids] : all_pids) {
+        PyObject* key = PyLong_FromLong(file_id);
+        PyObject* set = PySet_New(nullptr);
+        for (auto pid : pids) {
+            PyObject* val = PyLong_FromUnsignedLongLong(pid);
+            PySet_Add(set, val);
+            Py_DECREF(val);
+        }
+        PyDict_SetItem(pid_dict, key, set);
+        Py_DECREF(key);
+        Py_DECREF(set);
+    }
+
+    // PyTuple_Pack takes its own references, so ours are dropped afterwards.
+    PyObject* result = PyTuple_Pack(2, id_to_path, pid_dict);
+    Py_DECREF(id_to_path);
+    Py_DECREF(pid_dict);
+    return result;
+}
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+#include
+#include
+
+// Wraps an ArrowExportResult in a new ArrowBatchCapsule Python object.
+static PyObject* create_arrow_batch_capsule( +
dftracer::utils::utilities::common::arrow::ArrowExportResult&& result) {
+    auto* obj = (ArrowBatchCapsuleObject*)ArrowBatchCapsuleType.tp_alloc(
+        &ArrowBatchCapsuleType, 0);
+    if (!obj) return nullptr;
+    // The capsule object owns the heap copy through obj->result.
+    obj->result =
+        new dftracer::utils::utilities::common::arrow::ArrowExportResult(
+            std::move(result));
+    return (PyObject*)obj;
+}
+
+namespace {
+
+using dftracer::utils::utilities::common::arrow::ArrowExportResult;
+using dftracer::utils::utilities::common::arrow::ColumnSpec;
+using dftracer::utils::utilities::common::arrow::ColumnType;
+using dftracer::utils::utilities::common::arrow::RecordBatchBuilder;
+
+// Maps 'events' / 'profiles' / 'system' onto AggMapType. Returns false with
+// ValueError set for any other string.
+static bool parse_agg_type_str(const char* type_str, AggMapType& out) {
+    if (strcmp(type_str, "events") == 0) {
+        out = AggMapType::EVENT;
+        return true;
+    }
+    if (strcmp(type_str, "profiles") == 0) {
+        out = AggMapType::PROFILE;
+        return true;
+    }
+    if (strcmp(type_str, "system") == 0) {
+        out = AggMapType::SYSTEM;
+        return true;
+    }
+    PyErr_SetString(PyExc_ValueError,
+                    "type must be 'events', 'profiles', or 'system'");
+    return false;
+}
+
+// Keeps the database handle and the aggregator built on it together.
+struct AggDbHandle {
+    std::shared_ptr db;
+    std::unique_ptr agg;
+};
+
+// Opens the aggregation database at `index_path` and builds an aggregator
+// from the stored global config. If opening with the merge operator throws,
+// it resets the manager entry and reopens read-only. On failure it returns
+// nullptr and describes the problem in `error_msg`; no Python error is set
+// here.
+static std::unique_ptr open_agg_db(const std::string& index_path,
+                                   std::string& error_msg) {
+    std::shared_ptr db;
+    try {
+        db = EventAggregator::open_with_merge_operator(index_path);
+    } catch (...) {
+        auto& mgr = dftracer::utils::rocksdb::RocksDBManager::instance();
+        mgr.reset(index_path);
+        db = mgr.get_or_open(
+            index_path,
+            dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+        if (db && db->is_open()) {
+            load_intern_dictionary(*db);
+        }
+    }
+    if (!db || !db->is_open()) {
+        error_msg = "Failed to open aggregation database";
+        return nullptr;
+    }
+    std::string config_val;
+    // sizeof - 1 drops the terminating NUL of the key literal.
+    auto key = std::string_view(AGG_GLOBAL_CONFIG_KEY,
+                                sizeof(AGG_GLOBAL_CONFIG_KEY) - 1);
+    if (!db->get(key, &config_val, dftracer::utils::rocksdb::cf::AGGREGATION)
+             .ok()) {
+        error_msg = "No aggregation config found - was aggregation enabled?";
+        return nullptr;
+    }
+    auto cfg = deserialize_agg_global_config(config_val);
+    auto handle = std::make_unique();
+    handle->db = db;
+    handle->agg = std::make_unique(db, cfg.config_hash);
+    return handle;
+}
+
+// Parses an optional query string. A null/empty string yields std::nullopt
+// without an error; a parse failure yields std::nullopt with ValueError set.
+static std::optional
+parse_query_arg(const char* query_str) {
+    if (!query_str || query_str[0] == '\0') return std::nullopt;
+    auto result = dftracer::utils::utilities::common::query::Query::from_string(
+        query_str);
+    if (!result) {
+        PyErr_SetString(PyExc_ValueError, result.error().message.c_str());
+        return std::nullopt;
+    }
+    return std::move(*result);
+}
+
+constexpr std::uint16_t DFT_NUM_SHARDS = 4096;
+
+// Splits the shard interval [outer_begin, outer_end) into contiguous
+// sub-ranges, runs `scan_fn(shard_begin, shard_end)` for each one as a
+// spawned task on `rt`, and appends the task results to `outputs` in
+// sub-range order. Blocks until every task has finished. An empty interval
+// is a no-op.
+template
+void parallel_shard_scan_range(Runtime* rt, std::uint16_t outer_begin,
+                               std::uint16_t outer_end, ScanFn&& scan_fn,
+                               std::vector& outputs) {
+    if (outer_end <= outer_begin) return;
+    const std::size_t span = static_cast<std::size_t>(outer_end - outer_begin);
+    // Clamp to at least one worker so a runtime that reports zero threads
+    // cannot cause a division by zero below.
+    const std::size_t max_tasks = std::max<std::size_t>(
+        1, std::min<std::size_t>(rt->threads(), span));
+    const std::size_t shards_per_task = (span + max_tasks - 1) / max_tasks;
+    // Derive the task count from the rounded-up stride. Using max_tasks
+    // directly can hand trailing tasks a start past outer_end (e.g. span 5
+    // over 4 workers: stride 2, fourth task would start at 6).
+    const std::size_t num_tasks =
+        (span + shards_per_task - 1) / shards_per_task;
+    rt->submit(run_coro_scope(
+                   rt->executor(),
+                   [&](CoroScope& scope) -> CoroTask {
+                       std::vector>
+                           futures;
+                       futures.reserve(num_tasks);
+                       for (std::size_t t = 0; t < num_tasks; ++t) {
+                           auto shard_begin = static_cast(
+                               outer_begin + t * shards_per_task);
+                           auto shard_end =
+                               static_cast(std::min(
+                                   outer_begin + (t + 1) *
shards_per_task,
+                                   outer_end));
+                           // NOTE(review): the task lambda captures scan_fn
+                           // by reference; confirm scope.spawn keeps the
+                           // closure alive until the task completes.
+                           futures.push_back(
+                               scope.spawn([&scan_fn, shard_begin, shard_end](
+                                               CoroScope&) -> CoroTask {
+                                   co_return scan_fn(shard_begin, shard_end);
+                               }));
+                       }
+                       // Results are collected in spawn order.
+                       outputs.reserve(num_tasks);
+                       for (auto& f : futures) {
+                           outputs.push_back(co_await f);
+                       }
+                   }),
+               "parallel-shard-scan-range")
+        .get();
+}
+
+// Convenience wrapper: scans the full shard space [0, DFT_NUM_SHARDS).
+template
+void parallel_shard_scan(Runtime* rt, ScanFn&& scan_fn,
+                         std::vector& outputs) {
+    parallel_shard_scan_range(rt, 0, DFT_NUM_SHARDS,
+                              std::forward(scan_fn), outputs);
+}
+
+// Wraps each result in an Arrow batch capsule and appends it to `list`.
+// NOTE(review): a failed capsule allocation or PyList_Append is skipped
+// silently; any Python error it raised stays pending.
+static void append_results_to_list(PyObject* list,
+                                   std::vector& results) {
+    for (auto& r : results) {
+        PyObject* capsule = create_arrow_batch_capsule(std::move(r));
+        if (capsule) {
+            // PyList_Append takes its own reference.
+            PyList_Append(list, capsule);
+            Py_DECREF(capsule);
+        }
+    }
+}
+
+// Parameters for one scan_aggregation_shard_range call.
+struct AggScanInput {
+    const EventAggregator* agg;       // aggregator to scan
+    AggMapType target_type;           // only rows of this map type are kept
+    AggregationBatchType batch_type;  // written into the batch_type column
+    Py_ssize_t batch_size;            // rows per emitted record batch
+    std::uint16_t shard_begin;
+    std::uint16_t shard_end;
+};
+
+struct AggScanOutput {
+    std::vector results;
+};
+
+// Scans the shards given by input.shard_begin / input.shard_end and converts
+// every row whose map type matches input.target_type into Arrow record
+// batches of at most input.batch_size rows. Rows whose key or value fail to
+// parse are skipped.
+// NOTE(review): with batch_size <= 1 the flush condition below is true after
+// every row, giving one batch per row - confirm callers validate it.
+AggScanOutput scan_aggregation_shard_range(AggScanInput input) {
+    AggScanOutput output;
+
+    static const std::vector schema = {
+        {"batch_type", ColumnType::INT64},  {"cat", ColumnType::DICT_STRING},
+        {"name", ColumnType::DICT_STRING},  {"pid", ColumnType::UINT64},
+        {"tid", ColumnType::UINT64},        {"hhash", ColumnType::DICT_STRING},
+        {"fhash", ColumnType::DICT_STRING}, {"time_bucket", ColumnType::UINT64},
+        {"count", ColumnType::UINT64},      {"dur_total", ColumnType::UINT64},
+        {"dur_min", ColumnType::UINT64},    {"dur_max", ColumnType::UINT64},
+        {"dur_mean", ColumnType::DOUBLE},   {"dur_std", ColumnType::DOUBLE},
+        {"size_total", ColumnType::UINT64}, {"size_min", ColumnType::UINT64},
+        {"size_max", ColumnType::UINT64},   {"size_mean", ColumnType::DOUBLE},
+        {"size_std", ColumnType::DOUBLE},   {"ts", ColumnType::UINT64},
+        {"te", ColumnType::UINT64},
+    };
+
+    RecordBatchBuilder builder;
+    builder.declare_schema(schema);
+    builder.reserve(static_cast(input.batch_size));
+
+    std::size_t row_count = 0;
+
+    // The callback always returns true, i.e. it never asks the scan to stop.
+    input.agg->scan_shard_range_raw(
+        input.shard_begin, input.shard_end,
+        [&](std::string_view key_bytes, std::string_view val_bytes) -> bool {
+            AggKeyView kv;
+            if (!parse_agg_key_view(key_bytes, kv)) return true;
+            if (kv.map_type != input.target_type) return true;
+
+            AggMetricsFullView mv;
+            if (!parse_agg_value_full_view(val_bytes, mv)) return true;
+
+            // Column order below must match `schema` above.
+            std::size_t ci = 0;
+            builder.append_int64(ci++,
+                                 static_cast(input.batch_type));
+            builder.append_dict_string(ci++, kv.cat);
+            builder.append_dict_string(ci++, kv.name);
+            builder.append_uint64(ci++, kv.pid);
+            builder.append_uint64(ci++, kv.tid);
+            builder.append_dict_string(ci++, kv.hhash);
+            builder.append_dict_string(ci++, kv.fhash);
+            builder.append_uint64(ci++, kv.time_bucket);
+            builder.append_uint64(ci++, mv.count);
+            builder.append_uint64(ci++, mv.dur_total);
+            // Minimums are reported as 0 when no samples were recorded.
+            builder.append_uint64(ci++, mv.count > 0 ? mv.dur_min : 0);
+            builder.append_uint64(ci++, mv.dur_max);
+            builder.append_double(ci++, mv.dur_mean);
+            builder.append_double(ci++, mv.dur_stddev());
+            builder.append_uint64(ci++, mv.size_total);
+            builder.append_uint64(ci++, mv.count > 0 ? mv.size_min : 0);
+            builder.append_uint64(ci++, mv.size_max);
+            builder.append_double(ci++, mv.size_mean);
+            builder.append_double(ci++, mv.size_stddev());
+            builder.append_uint64(ci++, mv.ts);
+            builder.append_uint64(ci++, mv.te);
+            builder.end_row();
+
+            row_count++;
+            // Emit a batch once it is full and start a fresh one.
+            if (static_cast(row_count) >= input.batch_size) {
+                auto arrow = builder.finish();
+                if (arrow.valid()) {
+                    output.results.push_back(std::move(arrow));
+                }
+                builder.reset(true);
+                builder.reserve(static_cast(input.batch_size));
+                row_count = 0;
+            }
+            return true;
+        });
+
+    // Flush the final, partially filled batch.
+    if (row_count > 0) {
+        auto arrow = builder.finish();
+        if (arrow.valid()) {
+            output.results.push_back(std::move(arrow));
+        }
+    }
+
+    return output;
+}
+
+// Coarse classification of an I/O call by function name.
+enum class IOCategory : std::int8_t {
+    READ = 1,
+    WRITE = 2,
+    METADATA = 3,
+    PCTL = 4,
+    IPC = 5,
+    OTHER = 6,
+    SYNC = 7,
+};
+
+// Classifies `func_name` by exact string match against fixed name lists;
+// names that match none of them map to IOCategory::OTHER.
+inline IOCategory get_io_category(std::string_view func_name) {
+    if (func_name == "read" || func_name == "pread" || func_name == "readv" ||
+        func_name == "preadv" || func_name == "fread") {
+        return IOCategory::READ;
+    }
+    if (func_name == "write" || func_name == "pwrite" ||
+        func_name == "writev" || func_name == "pwritev" ||
+        func_name == "fwrite") {
+        return IOCategory::WRITE;
+    }
+    if (func_name == "fsync" || func_name == "fdatasync" ||
+        func_name == "msync" || func_name == "sync") {
+        return IOCategory::SYNC;
+    }
+    if (func_name == "open" || func_name == "open64" || func_name == "close" ||
+        func_name == "fopen" || func_name == "fopen64" ||
+        func_name == "fclose" || func_name == "stat" || func_name == "fstat" ||
+        func_name == "lstat" || func_name == "fstatat" ||
+        func_name == "__xstat" || func_name == "__xstat64" ||
+        func_name == "__lxstat" || func_name == "__lxstat64" ||
+        func_name == "__fxstat" || func_name == "__fxstat64" ||
+        func_name == "access" || func_name == "lseek" ||
+        func_name == "lseek64" || func_name == "fseek" ||
+        func_name == "ftell" || func_name == "seek" || func_name == "fcntl" ||
+        func_name == "ftruncate" ||
func_name == "mkdir" ||
+        func_name == "rmdir" || func_name == "unlink" ||
+        func_name == "remove" || func_name == "rename" || func_name == "link" ||
+        func_name == "readlink" || func_name == "opendir" ||
+        func_name == "closedir" || func_name == "readdir") {
+        return IOCategory::METADATA;
+    }
+    return IOCategory::OTHER;
+}
+
+// Writes the decimal digits of `val` into `buf` and returns a pointer one
+// past the last digit. No terminating NUL is written and the buffer size is
+// not checked; the caller must provide enough room.
+inline char* fast_itoa(std::uint64_t val, char* buf) {
+    char* p = buf;
+    // Digits come out least-significant first; do/while emits "0" for 0.
+    do {
+        *p++ = '0' + (val % 10);
+        val /= 10;
+    } while (val);
+    std::reverse(buf, p);
+    return p;
+}
+
+// Resolves file/host hashes to names using lookup tables built once at
+// construction. Every string is routed through a StringIntern instance and
+// the tables are keyed and valued by the views it returns.
+class HashResolver {
+   public:
+    // Either table pointer may be null, in which case that kind of hash
+    // always resolves to itself.
+    HashResolver(
+        const std::unordered_map* file_hashes,
+        const std::unordered_map* host_hashes)
+        : file_hashes_(file_hashes), host_hashes_(host_hashes) {
+        if (file_hashes_) {
+            for (const auto& [hash, name] : *file_hashes_) {
+                auto hash_sv = intern_.intern(hash);
+                auto name_sv = intern_.intern(name);
+                file_map_[hash_sv] = name_sv;
+            }
+        }
+        if (host_hashes_) {
+            for (const auto& [hash, name] : *host_hashes_) {
+                auto hash_sv = intern_.intern(hash);
+                auto name_sv = intern_.intern(name);
+                host_map_[hash_sv] = name_sv;
+            }
+        }
+    }
+
+    // Returns the file name mapped to `hash`, or the interned hash itself
+    // when it is unknown. An empty hash is returned unchanged.
+    std::string_view resolve_file(std::string_view hash) {
+        if (hash.empty()) return hash;
+        auto interned = intern_.intern(hash);
+        auto it = file_map_.find(interned);
+        return it != file_map_.end() ? it->second : interned;
+    }
+
+    // Host counterpart of resolve_file.
+    std::string_view resolve_host(std::string_view hash) {
+        if (hash.empty()) return hash;
+        auto interned = intern_.intern(hash);
+        auto it = host_map_.find(interned);
+        return it != host_map_.end() ? it->second : interned;
+    }
+
+    // Interns an arbitrary string through the resolver's pool.
+    std::string_view intern(std::string_view sv) { return intern_.intern(sv); }
+
+   private:
+    const std::unordered_map* file_hashes_;
+    const std::unordered_map* host_hashes_;
+    dftracer::utils::StringIntern intern_;
+    std::unordered_map file_map_;
+    std::unordered_map host_map_;
+};
+
+// Identifies a thread of a process on a host. `hhash` is a non-owning view.
+struct ProcKey {
+    std::string_view hhash;
+    std::uint64_t pid;
+    std::uint64_t tid;
+    bool operator==(const ProcKey& o) const {
+        return hhash == o.hhash && pid == o.pid && tid == o.tid;
+    }
+};
+
+// Hash functor for ProcKey: XOR of the member hashes, with the pid and tid
+// hashes shifted left by 1 and 2 bits.
+struct ProcKeyHash {
+    std::size_t operator()(const ProcKey& k) const {
+        return std::hash{}(k.hhash) ^
+               (std::hash{}(k.pid) << 1) ^
+               (std::hash{}(k.tid) << 2);
+    }
+};
+
+// Column layout of the full-granularity dfanalyzer record batches.
+static const std::vector DFANALYZER_SCHEMA = {
+    {"cat", ColumnType::DICT_STRING},
+    {"func_name", ColumnType::DICT_STRING},
+    {"pid", ColumnType::INT64},
+    {"tid", ColumnType::INT64},
+    {"file_hash", ColumnType::DICT_STRING},
+    {"host_hash", ColumnType::DICT_STRING},
+    {"file_name", ColumnType::DICT_STRING},
+    {"host_name", ColumnType::DICT_STRING},
+    {"proc_name", ColumnType::DICT_STRING},
+    {"io_cat", ColumnType::INT64},
+    {"acc_pat", ColumnType::INT64},
+    {"count", ColumnType::INT64},
+    {"time", ColumnType::DOUBLE},
+    {"size", ColumnType::INT64},
+    {"time_min", ColumnType::DOUBLE},
+    {"time_max", ColumnType::DOUBLE},
+    {"size_min", ColumnType::INT64},
+    {"size_max", ColumnType::INT64},
+    {"time_range", ColumnType::INT64},
+    {"time_start", ColumnType::INT64},
+    {"time_end", ColumnType::INT64},
+};
+
+// Bit flags naming the columns a coarse aggregation can group by; each
+// enumerator occupies a distinct bit so they can be OR-ed into a mask.
+enum GroupByField : std::uint32_t {
+    GB_CAT = 1u << 0,
+    GB_FUNC_NAME = 1u << 1,
+    GB_PID = 1u << 2,
+    GB_TID = 1u << 3,
+    GB_FILE_HASH = 1u << 4,
+    GB_HOST_HASH = 1u << 5,
+    GB_FILE_NAME = 1u << 6,
+    GB_HOST_NAME = 1u << 7,
+    GB_PROC_NAME = 1u << 8,
+    GB_IO_CAT = 1u << 9,
+    GB_ACC_PAT = 1u << 10,
+    GB_TIME_RANGE = 1u << 11,
+};
+
+// Requested grouping: `mask` is the OR of the selected fields, `order` and
+// `names` list them index-aligned (names is documented as matching order).
+struct GroupByConfig {
+    std::uint32_t mask = 0;
+    std::vector order;
+    std::vector names;  // matches `order`, used for schema
+};
+
+// Maps a group-by column name onto its GroupByField flag; std::nullopt for
+// an unrecognised name.
+inline std::optional
parse_group_by_name(std::string_view name) {
+    if (name == "cat") return GB_CAT;
+    if (name == "func_name") return GB_FUNC_NAME;
+    if (name == "pid") return GB_PID;
+    if (name == "tid") return GB_TID;
+    if (name == "file_hash") return GB_FILE_HASH;
+    if (name == "host_hash") return GB_HOST_HASH;
+    if (name == "file_name") return GB_FILE_NAME;
+    if (name == "host_name") return GB_HOST_NAME;
+    if (name == "proc_name") return GB_PROC_NAME;
+    if (name == "io_cat") return GB_IO_CAT;
+    if (name == "acc_pat") return GB_ACC_PAT;
+    if (name == "time_range") return GB_TIME_RANGE;
+    return std::nullopt;
+}
+
+// Grouping key for coarse aggregation, one member per GroupByField. The
+// string members are non-owning views; equality compares all twelve members.
+struct CoarseKey {
+    std::string_view cat;
+    std::string_view func_name;
+    std::uint64_t pid = 0;
+    std::uint64_t tid = 0;
+    std::string_view file_hash;
+    std::string_view host_hash;
+    std::string_view file_name;
+    std::string_view host_name;
+    std::string_view proc_name;
+    std::int64_t io_cat = 0;
+    std::int64_t acc_pat = 0;
+    std::int64_t time_range = 0;
+
+    bool operator==(const CoarseKey& o) const {
+        return cat == o.cat && func_name == o.func_name && pid == o.pid &&
+               tid == o.tid && file_hash == o.file_hash &&
+               host_hash == o.host_hash && file_name == o.file_name &&
+               host_name == o.host_name && proc_name == o.proc_name &&
+               io_cat == o.io_cat && acc_pat == o.acc_pat &&
+               time_range == o.time_range;
+    }
+};
+
+// Hash functor for CoarseKey: folds the hash of every member, in declaration
+// order, through the `combine` mixing step below.
+struct CoarseKeyHash {
+    std::size_t operator()(const CoarseKey& k) const {
+        auto combine = [](std::size_t h, std::size_t v) {
+            return h ^ (v + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2));
+        };
+        std::size_t h = std::hash{}(k.cat);
+        h = combine(h, std::hash{}(k.func_name));
+        h = combine(h, std::hash{}(k.pid));
+        h = combine(h, std::hash{}(k.tid));
+        h = combine(h, std::hash{}(k.file_hash));
+        h = combine(h, std::hash{}(k.host_hash));
+        h = combine(h, std::hash{}(k.file_name));
+        h = combine(h, std::hash{}(k.host_name));
+        h = combine(h, std::hash{}(k.proc_name));
+        h = combine(h, std::hash{}(k.io_cat));
+        h = combine(h, std::hash{}(k.acc_pat));
+        h = combine(h, std::hash{}(k.time_range));
+        return h;
+    }
+};
+
+// Running totals kept per CoarseKey. Min fields start at +infinity / max()
+// and max fields at -infinity / 0, so the first sample always replaces them.
+struct CoarseMetrics {
+    std::uint64_t count = 0;
+    double time_sum = 0.0;
+    double time_sq_sum = 0.0;
+    double time_min_val = std::numeric_limits::infinity();
+    double time_max_val = -std::numeric_limits::infinity();
+    double time_call_min_val = std::numeric_limits::infinity();
+    double time_call_max_val = -std::numeric_limits::infinity();
+    std::uint64_t size_sum = 0;
+    double size_sq_sum = 0.0;
+    std::uint64_t size_min_val = std::numeric_limits::max();
+    std::uint64_t size_max_val = 0;
+    std::uint64_t size_call_min_val = std::numeric_limits::max();
+    std::uint64_t size_call_max_val = 0;
+    bool has_size = false;
+    std::uint64_t time_start_val = std::numeric_limits::max();
+    std::uint64_t time_end_val = 0;
+    bool has_time_bounds = false;
+};
+
+// Builds the column schema for coarse output: one column per requested
+// group-by field (in cfg.order, named from cfg.names), followed by the fixed
+// metric columns. Indexes cfg.names with the same index as cfg.order, so the
+// two vectors must have equal length.
+inline std::vector make_coarse_schema(const GroupByConfig& cfg) {
+    std::vector specs;
+    specs.reserve(cfg.order.size() + 16);
+    for (std::size_t i = 0; i < cfg.order.size(); ++i) {
+        GroupByField f = cfg.order[i];
+        const std::string& name = cfg.names[i];
+        switch (f) {
+            // String-valued keys are dictionary encoded.
+            case GB_CAT:
+            case GB_FUNC_NAME:
+            case GB_FILE_HASH:
+            case GB_HOST_HASH:
+            case GB_FILE_NAME:
+            case GB_HOST_NAME:
+            case GB_PROC_NAME:
+                specs.push_back({name, ColumnType::DICT_STRING});
+                break;
+            // Numeric keys are stored as signed 64-bit integers.
+            case GB_PID:
+            case GB_TID:
+            case GB_IO_CAT:
+            case GB_ACC_PAT:
+            case GB_TIME_RANGE:
+                specs.push_back({name, ColumnType::INT64});
+                break;
+        }
+    }
+    specs.push_back({"count", ColumnType::INT64});
+    specs.push_back({"time", ColumnType::DOUBLE});
+    specs.push_back({"size", ColumnType::INT64});
+    specs.push_back({"time_sq", ColumnType::DOUBLE});
+    specs.push_back({"size_sq", ColumnType::DOUBLE});
+    specs.push_back({"time_min", ColumnType::DOUBLE});
+    specs.push_back({"time_max", ColumnType::DOUBLE});
+    specs.push_back({"size_min", ColumnType::INT64});
+    specs.push_back({"size_max", ColumnType::INT64});
+    specs.push_back({"time_call_min", ColumnType::DOUBLE});
+    specs.push_back({"time_call_max", ColumnType::DOUBLE});
+    specs.push_back({"size_call_min", ColumnType::INT64});
+    specs.push_back({"size_call_max", ColumnType::INT64});
+    specs.push_back({"time_start", ColumnType::INT64});
+    specs.push_back({"time_end", ColumnType::INT64});
+    return specs;
+}
+
+// Parameters for one scan_dfanalyzer_shards call.
+struct DfanalyzerScanInput {
+    const EventAggregator* agg;
+    const DfanalyzerContext* ctx;
+    std::optional type_filter;  // unset = all three map types
+    Py_ssize_t batch_size;
+    std::uint16_t shard_begin;
+    std::uint16_t shard_end;
+    const GroupByConfig* group_by = nullptr;  // null = full granularity
+};
+
+// Record batches produced by a scan, one vector per map type.
+struct DfanalyzerScanOutput {
+    std::vector events;
+    std::vector profiles;
+    std::vector system;
+};
+
+DfanalyzerScanOutput scan_dfanalyzer_shards(DfanalyzerScanInput input) {
+    DfanalyzerScanOutput output;
+
+    const bool coarse = input.group_by != nullptr;
+    const std::vector coarse_schema =
+        coarse ? make_coarse_schema(*input.group_by)
+               : std::vector{};
+
+    auto make_builder = [&]() {
+        RecordBatchBuilder b;
+        if (coarse) {
+            b.declare_schema(coarse_schema);
+        } else {
+            b.declare_schema(DFANALYZER_SCHEMA);
+        }
+        b.reserve(static_cast(input.batch_size));
+        return b;
+    };
+
+    RecordBatchBuilder event_builder, profile_builder, system_builder;
+    bool use_events =
+        !input.type_filter || *input.type_filter == AggMapType::EVENT;
+    bool use_profiles =
+        !input.type_filter || *input.type_filter == AggMapType::PROFILE;
+    bool use_system =
+        !input.type_filter || *input.type_filter == AggMapType::SYSTEM;
+
+    if (use_events) event_builder = make_builder();
+    if (use_profiles) profile_builder = make_builder();
+    if (use_system) system_builder = make_builder();
+
+    auto bucket_width_us = static_cast(
+        input.ctx->time_granularity * input.ctx->time_resolution);
+    std::size_t event_count = 0, profile_count = 0, system_count = 0;
+
+    HashResolver resolver(input.ctx->file_hashes, input.ctx->host_hashes);
+    std::unordered_map proc_name_cache;
+    std::unordered_map io_cat_cache;
+
+    std::unordered_map event_coarse,
+        profile_coarse,
system_coarse; + + auto flush_builder = [&](RecordBatchBuilder& builder, std::size_t& count, + std::vector& results) { + if (count > 0) { + auto arrow = builder.finish(); + if (arrow.valid()) { + results.push_back(std::move(arrow)); + } + builder.reset(true); + builder.reserve(static_cast(input.batch_size)); + count = 0; + } + }; + + auto append_row = [&](RecordBatchBuilder& builder, std::size_t& count, + std::vector& results, + const AggKeyView& kv, const AggMetricsView& mv, + std::string_view file_name, + std::string_view host_name, + std::string_view proc_name, IOCategory io_cat) { + std::size_t ci = 0; + builder.append_dict_string(ci++, kv.cat); + builder.append_dict_string(ci++, kv.name); + builder.append_int64(ci++, static_cast(kv.pid)); + builder.append_int64(ci++, static_cast(kv.tid)); + builder.append_dict_string(ci++, kv.fhash); + builder.append_dict_string(ci++, kv.hhash); + builder.append_dict_string(ci++, file_name); + builder.append_dict_string(ci++, host_name); + builder.append_dict_string(ci++, proc_name); + builder.append_int64(ci++, static_cast(io_cat)); + builder.append_int64(ci++, 0); + + builder.append_int64(ci++, static_cast(mv.count)); + builder.append_double(ci++, static_cast(mv.dur_total) / + input.ctx->time_resolution); + + if (mv.size_total > 0) { + builder.append_int64(ci++, + static_cast(mv.size_total)); + } else { + builder.append_null(ci++); + } + + builder.append_double(ci++, mv.count > 0 + ? static_cast(mv.dur_min) / + input.ctx->time_resolution + : 0.0); + builder.append_double(ci++, mv.count > 0 + ? static_cast(mv.dur_max) / + input.ctx->time_resolution + : 0.0); + + if (mv.size_total > 0 && mv.count > 0) { + builder.append_int64(ci++, static_cast(mv.size_min)); + builder.append_int64(ci++, static_cast(mv.size_max)); + } else { + builder.append_null(ci++); + builder.append_null(ci++); + } + + auto time_range = bucket_width_us > 0 + ? 
static_cast( + (kv.time_bucket - input.ctx->time_origin) / + bucket_width_us) + : 0; + builder.append_int64(ci++, time_range); + builder.append_int64( + ci++, static_cast(mv.ts - input.ctx->time_origin)); + builder.append_int64( + ci++, static_cast(mv.te - input.ctx->time_origin)); + builder.end_row(); + + count++; + if (static_cast(count) >= input.batch_size) { + flush_builder(builder, count, results); + } + }; + + auto accumulate_coarse = + [&](std::unordered_map& map, + const AggKeyView& kv, const AggMetricsView& mv, + std::string_view file_name, std::string_view host_name, + std::string_view proc_name, IOCategory io_cat) { + const auto& cfg = *input.group_by; + // Probe with non-interned views; hash/equality compare by content, + // so string_view lifetime doesn't matter for lookup. We only copy + // (intern) on first insert. + CoarseKey probe; + if (cfg.mask & GB_CAT) probe.cat = kv.cat; + if (cfg.mask & GB_FUNC_NAME) probe.func_name = kv.name; + if (cfg.mask & GB_PID) probe.pid = kv.pid; + if (cfg.mask & GB_TID) probe.tid = kv.tid; + if (cfg.mask & GB_FILE_HASH) probe.file_hash = kv.fhash; + if (cfg.mask & GB_HOST_HASH) probe.host_hash = kv.hhash; + if (cfg.mask & GB_FILE_NAME) probe.file_name = file_name; + if (cfg.mask & GB_HOST_NAME) probe.host_name = host_name; + if (cfg.mask & GB_PROC_NAME) probe.proc_name = proc_name; + if (cfg.mask & GB_IO_CAT) + probe.io_cat = static_cast(io_cat); + if (cfg.mask & GB_TIME_RANGE) { + probe.time_range = + bucket_width_us > 0 + ? static_cast( + (kv.time_bucket - input.ctx->time_origin) / + bucket_width_us) + : 0; + } + // acc_pat is always 0 today; included for completeness. + + auto it = map.find(probe); + if (it == map.end()) { + // First sighting: promote views referencing unstable DB buffers + // to interned copies. file_name/host_name come from the + // resolver's intern pool, and proc_name from proc_name_cache; + // both already stable across iterations, no copy needed. 
+ CoarseKey stable = probe; + if (cfg.mask & GB_CAT) stable.cat = resolver.intern(kv.cat); + if (cfg.mask & GB_FUNC_NAME) + stable.func_name = resolver.intern(kv.name); + if (cfg.mask & GB_FILE_HASH) + stable.file_hash = resolver.intern(kv.fhash); + if (cfg.mask & GB_HOST_HASH) + stable.host_hash = resolver.intern(kv.hhash); + auto [nit, _] = map.emplace(std::move(stable), CoarseMetrics{}); + it = nit; + } + CoarseMetrics& m = it->second; + m.count += mv.count; + double time_val = + static_cast(mv.dur_total) / input.ctx->time_resolution; + m.time_sum += time_val; + m.time_sq_sum += time_val * time_val; + if (time_val < m.time_call_min_val) m.time_call_min_val = time_val; + if (time_val > m.time_call_max_val) m.time_call_max_val = time_val; + if (mv.count > 0) { + double dur_min_v = static_cast(mv.dur_min) / + input.ctx->time_resolution; + double dur_max_v = static_cast(mv.dur_max) / + input.ctx->time_resolution; + if (dur_min_v < m.time_min_val) m.time_min_val = dur_min_v; + if (dur_max_v > m.time_max_val) m.time_max_val = dur_max_v; + } + if (mv.size_total > 0) { + m.has_size = true; + m.size_sum += mv.size_total; + double sz = static_cast(mv.size_total); + m.size_sq_sum += sz * sz; + if (mv.size_total < m.size_call_min_val) + m.size_call_min_val = mv.size_total; + if (mv.size_total > m.size_call_max_val) + m.size_call_max_val = mv.size_total; + if (mv.count > 0) { + if (mv.size_min < m.size_min_val) + m.size_min_val = mv.size_min; + if (mv.size_max > m.size_max_val) + m.size_max_val = mv.size_max; + } + } + if (mv.ts >= input.ctx->time_origin) { + m.has_time_bounds = true; + auto ts_off = mv.ts - input.ctx->time_origin; + auto te_off = mv.te - input.ctx->time_origin; + if (ts_off < m.time_start_val) m.time_start_val = ts_off; + if (te_off > m.time_end_val) m.time_end_val = te_off; + } + }; + + input.agg->scan_shard_range_raw( + input.shard_begin, input.shard_end, + [&](std::string_view key_bytes, std::string_view val_bytes) -> bool { + AggKeyView kv; + if 
(!parse_agg_key_view(key_bytes, kv)) return true; + + if (input.type_filter && kv.map_type != *input.type_filter) + return true; + + if (input.ctx->query_filter) { + auto& q = *input.ctx->query_filter; + dftracer::utils::utilities::common::query::ValueMap fields; + if (q.references("cat")) fields["cat"] = std::string(kv.cat); + if (q.references("name")) fields["name"] = std::string(kv.name); + if (q.references("pid")) fields["pid"] = kv.pid; + if (q.references("tid")) fields["tid"] = kv.tid; + if (q.references("hhash")) + fields["hhash"] = std::string(kv.hhash); + if (q.references("fhash")) + fields["fhash"] = std::string(kv.fhash); + if (q.references("time_bucket")) + fields["time_bucket"] = kv.time_bucket; + if (!q.evaluate(fields)) return true; + } + + AggMetricsView mv; + if (!parse_agg_value_view(val_bytes, mv)) return true; + + auto file_name = resolver.resolve_file(kv.fhash); + auto host_name = resolver.resolve_host(kv.hhash); + + ProcKey pk{kv.hhash, kv.pid, kv.tid}; + auto proc_it = proc_name_cache.find(pk); + std::string_view proc_name; + if (proc_it != proc_name_cache.end()) { + proc_name = proc_it->second; + } else { + std::string pn = "app#"; + if (!host_name.empty()) { + pn.append(host_name); + } else if (!kv.hhash.empty()) { + pn.append(kv.hhash); + } else { + pn.append("unknown"); + } + pn.push_back('#'); + pn.append(std::to_string(kv.pid)); + pn.push_back('#'); + pn.append(std::to_string(kv.tid)); + ProcKey stable_pk{resolver.intern(kv.hhash), kv.pid, kv.tid}; + auto [it, _] = + proc_name_cache.emplace(stable_pk, std::move(pn)); + proc_name = it->second; + } + + auto io_it = io_cat_cache.find(kv.name); + IOCategory io_cat; + if (io_it != io_cat_cache.end()) { + io_cat = io_it->second; + } else { + io_cat = get_io_category(kv.name); + io_cat_cache[resolver.intern(kv.name)] = io_cat; + } + + if (coarse) { + switch (kv.map_type) { + case AggMapType::EVENT: + if (use_events) + accumulate_coarse(event_coarse, kv, mv, file_name, + host_name, proc_name, 
io_cat); + break; + case AggMapType::PROFILE: + if (use_profiles) + accumulate_coarse(profile_coarse, kv, mv, file_name, + host_name, proc_name, io_cat); + break; + case AggMapType::SYSTEM: + if (use_system) + accumulate_coarse(system_coarse, kv, mv, file_name, + host_name, proc_name, io_cat); + break; + } + } else { + switch (kv.map_type) { + case AggMapType::EVENT: + append_row(event_builder, event_count, output.events, + kv, mv, file_name, host_name, proc_name, + io_cat); + break; + case AggMapType::PROFILE: + append_row(profile_builder, profile_count, + output.profiles, kv, mv, file_name, + host_name, proc_name, io_cat); + break; + case AggMapType::SYSTEM: + append_row(system_builder, system_count, output.system, + kv, mv, file_name, host_name, proc_name, + io_cat); + break; + } + } + return true; + }); + + if (coarse) { + const auto& cfg = *input.group_by; + auto flush_coarse = [&](std::unordered_map& map, + RecordBatchBuilder& builder, std::size_t& count, + std::vector& results) { + for (auto& [key, m] : map) { + std::size_t ci = 0; + for (std::size_t i = 0; i < cfg.order.size(); ++i) { + switch (cfg.order[i]) { + case GB_CAT: + builder.append_dict_string(ci++, key.cat); + break; + case GB_FUNC_NAME: + builder.append_dict_string(ci++, key.func_name); + break; + case GB_PID: + builder.append_int64( + ci++, static_cast(key.pid)); + break; + case GB_TID: + builder.append_int64( + ci++, static_cast(key.tid)); + break; + case GB_FILE_HASH: + builder.append_dict_string(ci++, key.file_hash); + break; + case GB_HOST_HASH: + builder.append_dict_string(ci++, key.host_hash); + break; + case GB_FILE_NAME: + builder.append_dict_string(ci++, key.file_name); + break; + case GB_HOST_NAME: + builder.append_dict_string(ci++, key.host_name); + break; + case GB_PROC_NAME: + builder.append_dict_string(ci++, key.proc_name); + break; + case GB_IO_CAT: + builder.append_int64(ci++, key.io_cat); + break; + case GB_ACC_PAT: + builder.append_int64(ci++, key.acc_pat); + break; + case 
GB_TIME_RANGE: + builder.append_int64(ci++, key.time_range); + break; + } + } + builder.append_int64(ci++, static_cast(m.count)); + builder.append_double(ci++, m.time_sum); + if (m.has_size) { + builder.append_int64(ci++, + static_cast(m.size_sum)); + } else { + builder.append_null(ci++); + } + builder.append_double(ci++, m.time_sq_sum); + if (m.has_size) { + builder.append_double(ci++, m.size_sq_sum); + } else { + builder.append_null(ci++); + } + builder.append_double(ci++, m.count > 0 ? m.time_min_val : 0.0); + builder.append_double(ci++, m.count > 0 ? m.time_max_val : 0.0); + if (m.has_size) { + builder.append_int64( + ci++, static_cast(m.size_min_val)); + builder.append_int64( + ci++, static_cast(m.size_max_val)); + } else { + builder.append_null(ci++); + builder.append_null(ci++); + } + builder.append_double(ci++, + m.count > 0 ? m.time_call_min_val : 0.0); + builder.append_double(ci++, + m.count > 0 ? m.time_call_max_val : 0.0); + if (m.has_size) { + builder.append_int64( + ci++, static_cast(m.size_call_min_val)); + builder.append_int64( + ci++, static_cast(m.size_call_max_val)); + } else { + builder.append_null(ci++); + builder.append_null(ci++); + } + builder.append_int64( + ci++, m.has_time_bounds + ? static_cast(m.time_start_val) + : 0); + builder.append_int64( + ci++, m.has_time_bounds + ? 
static_cast(m.time_end_val) + : 0); + builder.end_row(); + ++count; + if (static_cast(count) >= input.batch_size) { + flush_builder(builder, count, results); + } + } + flush_builder(builder, count, results); + }; + if (use_events) + flush_coarse(event_coarse, event_builder, event_count, + output.events); + if (use_profiles) + flush_coarse(profile_coarse, profile_builder, profile_count, + output.profiles); + if (use_system) + flush_coarse(system_coarse, system_builder, system_count, + output.system); + } else { + if (use_events) + flush_builder(event_builder, event_count, output.events); + if (use_profiles) + flush_builder(profile_builder, profile_count, output.profiles); + if (use_system) + flush_builder(system_builder, system_count, output.system); + } + + return output; +} + +} // namespace + +static PyObject* Indexer_iter_aggregation(IndexerObject* self, PyObject* args, + PyObject* kwds) { + static const char* kwlist[] = {"type", "batch_size", nullptr}; + const char* type_str = "events"; + Py_ssize_t batch_size = 10000; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|sn", (char**)kwlist, + &type_str, &batch_size)) { + return nullptr; + } + + AggMapType target_type; + if (!parse_agg_type_str(type_str, target_type)) return nullptr; + + AggregationBatchType batch_type; + if (target_type == AggMapType::EVENT) + batch_type = AggregationBatchType::EVENT; + else if (target_type == AggMapType::PROFILE) + batch_type = AggregationBatchType::PROFILE; + else + batch_type = AggregationBatchType::SYSTEM; + + auto idx_opt = resolve_index_path(self); + if (!idx_opt) return nullptr; + std::string index_path = std::move(*idx_opt); + + PyObject* batch_list = PyList_New(0); + if (!batch_list) return nullptr; + + std::string error_msg; + std::vector + results; + + Py_BEGIN_ALLOW_THREADS try { + auto handle = open_agg_db(index_path, error_msg); + if (handle) { + Runtime* rt = get_batch_indexer_runtime(self); + std::vector outputs; + parallel_shard_scan( + rt, + [&](std::uint16_t 
shard_begin, std::uint16_t shard_end) { + AggScanInput input; + input.agg = handle->agg.get(); + input.target_type = target_type; + input.batch_type = batch_type; + input.batch_size = batch_size; + input.shard_begin = shard_begin; + input.shard_end = shard_end; + return scan_aggregation_shard_range(input); + }, + outputs); + + for (auto& out : outputs) { + for (auto& r : out.results) { + results.push_back(std::move(r)); + } + } + } + } catch (const std::exception& e) { + error_msg = e.what(); + } + Py_END_ALLOW_THREADS + + if (!error_msg.empty()) { + Py_DECREF(batch_list); + PyErr_SetString(PyExc_RuntimeError, error_msg.c_str()); + return nullptr; + } + + append_results_to_list(batch_list, results); + + PyObject* iter = PyObject_GetIter(batch_list); + Py_DECREF(batch_list); + return iter; +} + +static PyObject* Indexer_iter_arrow_dfanalyzer(IndexerObject* self, + PyObject* args, PyObject* kwds) { + static const char* kwlist[] = { + "type", "batch_size", "time_granularity", "time_resolution", + "query", nullptr}; + const char* type_str = "events"; + Py_ssize_t batch_size = 10000; + double time_granularity = 1.0; + double time_resolution = 1000000.0; + const char* query_str = nullptr; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|snddz", (char**)kwlist, + &type_str, &batch_size, &time_granularity, + &time_resolution, &query_str)) { + return nullptr; + } + + AggMapType target_type; + if (!parse_agg_type_str(type_str, target_type)) return nullptr; + + auto query_opt = parse_query_arg(query_str); + if (!query_opt && PyErr_Occurred()) return nullptr; + + auto idx_opt = resolve_index_path(self); + if (!idx_opt) return nullptr; + std::string index_path = std::move(*idx_opt); + + PyObject* batch_list = PyList_New(0); + if (!batch_list) return nullptr; + + std::string error_msg; + std::vector results; + + Py_BEGIN_ALLOW_THREADS try { + auto handle = open_agg_db(index_path, error_msg); + if (handle) { + dftracer::utils::utilities::indexer::IndexDatabase idx_db( + 
index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + auto file_hashes = + idx_db.query_hash_table(dftracer::utils::utilities::indexer:: + IndexDatabase::HashType::FILE); + auto host_hashes = + idx_db.query_hash_table(dftracer::utils::utilities::indexer:: + IndexDatabase::HashType::HOST); + + auto time_bounds = handle->agg->query_time_bounds(); + std::uint64_t time_origin = + time_bounds.valid ? time_bounds.min_time_bucket : 0; + + DfanalyzerContext ctx; + ctx.file_hashes = &file_hashes; + ctx.host_hashes = &host_hashes; + ctx.query_filter = query_opt ? &*query_opt : nullptr; + ctx.time_origin = time_origin; + ctx.time_resolution = time_resolution; + ctx.time_granularity = time_granularity; + + Runtime* rt = get_batch_indexer_runtime(self); + std::vector outputs; + parallel_shard_scan( + rt, + [&](std::uint16_t shard_begin, std::uint16_t shard_end) { + DfanalyzerScanInput input; + input.agg = handle->agg.get(); + input.ctx = &ctx; + input.type_filter = target_type; + input.batch_size = batch_size; + input.shard_begin = shard_begin; + input.shard_end = shard_end; + return scan_dfanalyzer_shards(input); + }, + outputs); + + for (auto& out : outputs) { + for (auto& r : out.events) results.push_back(std::move(r)); + for (auto& r : out.profiles) results.push_back(std::move(r)); + for (auto& r : out.system) results.push_back(std::move(r)); + } + } + } catch (const std::exception& e) { + error_msg = e.what(); + } + Py_END_ALLOW_THREADS + + if (!error_msg.empty()) { + Py_DECREF(batch_list); + PyErr_SetString(PyExc_RuntimeError, error_msg.c_str()); + return nullptr; + } + + append_results_to_list(batch_list, results); + + PyObject* iter = PyObject_GetIter(batch_list); + Py_DECREF(batch_list); + return iter; +} + +// Parses the optional Python `group_by` argument into `out`. +// +// A null pointer or None leaves `out` untouched and succeeds. Otherwise `obj` +// must be a sequence whose entries are all str; each entry is mapped through +// parse_group_by_name() and recorded in out.mask / out.order / out.names. +// A field that is already set in out.mask is skipped, so the first occurrence +// fixes its position in the output order. +// +// Returns true on success. On failure returns false with a Python exception +// set (TypeError for a non-sequence or non-str entry, ValueError for an +// unrecognised field name, or whatever the sequence protocol raised). +static bool parse_group_by_arg(PyObject* obj, GroupByConfig& out) { + if (!obj || obj == Py_None) return true; + if (!PySequence_Check(obj)) { + PyErr_SetString(PyExc_TypeError, + "group_by must be a sequence of strings or None"); + return 
false; + } + Py_ssize_t n = PySequence_Length(obj); + // PySequence_Length reports failure as -1 with an exception set; propagate + // it rather than skipping the loop and returning success with an error + // still pending. + if (n < 0) return false; + for (Py_ssize_t i = 0; i < n; ++i) { + PyObject* item = PySequence_GetItem(obj, i); + if (!item) return false; + if (!PyUnicode_Check(item)) { + Py_DECREF(item); + PyErr_SetString(PyExc_TypeError, + "group_by entries must be strings"); + return false; + } + Py_ssize_t sz = 0; + const char* s = PyUnicode_AsUTF8AndSize(item, &sz); + if (!s) { + Py_DECREF(item); + return false; + } + std::string_view sv(s, static_cast(sz)); + auto field = parse_group_by_name(sv); + if (!field) { + std::string msg = "unsupported group_by field: "; + msg.append(sv); + Py_DECREF(item); + PyErr_SetString(PyExc_ValueError, msg.c_str()); + return false; + } + if (!(out.mask & *field)) { + out.mask |= *field; + out.order.push_back(*field); + out.names.emplace_back(sv); + } + Py_DECREF(item); + } + return true; +} + +static PyObject* Indexer_iter_arrow_dfanalyzer_all(IndexerObject* self, + PyObject* args, + PyObject* kwds) { + static const char* kwlist[] = {"batch_size", "time_granularity", + "time_resolution", "query", + "group_by", nullptr}; + Py_ssize_t batch_size = 10000; + double time_granularity = 1.0; + double time_resolution = 1000000.0; + const char* query_str = nullptr; + PyObject* group_by_obj = nullptr; + + if (!PyArg_ParseTupleAndKeywords( + args, kwds, "|nddzO", (char**)kwlist, &batch_size, + &time_granularity, &time_resolution, &query_str, &group_by_obj)) { + return nullptr; + } + + auto query_opt = parse_query_arg(query_str); + if (!query_opt && PyErr_Occurred()) return nullptr; + + GroupByConfig group_by_cfg; + if (!parse_group_by_arg(group_by_obj, group_by_cfg)) return nullptr; + const GroupByConfig* group_by_ptr = + group_by_cfg.mask != 0 ? 
&group_by_cfg : nullptr; + + auto idx_opt = resolve_index_path(self); + if (!idx_opt) return nullptr; + std::string index_path = std::move(*idx_opt); + + PyObject* result_dict = PyDict_New(); + if (!result_dict) return nullptr; + + PyObject* events_list = PyList_New(0); + PyObject* profiles_list = PyList_New(0); + PyObject* system_list = PyList_New(0); + if (!events_list || !profiles_list || !system_list) { + Py_XDECREF(events_list); + Py_XDECREF(profiles_list); + Py_XDECREF(system_list); + Py_DECREF(result_dict); + return nullptr; + } + + std::string error_msg; + std::vector events_results, profiles_results, + system_results; + + Py_BEGIN_ALLOW_THREADS try { + auto handle = open_agg_db(index_path, error_msg); + if (handle) { + dftracer::utils::utilities::indexer::IndexDatabase idx_db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + auto file_hashes = + idx_db.query_hash_table(dftracer::utils::utilities::indexer:: + IndexDatabase::HashType::FILE); + auto host_hashes = + idx_db.query_hash_table(dftracer::utils::utilities::indexer:: + IndexDatabase::HashType::HOST); + + auto time_bounds = handle->agg->query_time_bounds(); + std::uint64_t time_origin = + time_bounds.valid ? time_bounds.min_time_bucket : 0; + + DfanalyzerContext ctx; + ctx.file_hashes = &file_hashes; + ctx.host_hashes = &host_hashes; + ctx.query_filter = query_opt ? 
&*query_opt : nullptr; + ctx.time_origin = time_origin; + ctx.time_resolution = time_resolution; + ctx.time_granularity = time_granularity; + + Runtime* rt = get_batch_indexer_runtime(self); + std::vector outputs; + parallel_shard_scan( + rt, + [&](std::uint16_t shard_begin, std::uint16_t shard_end) { + DfanalyzerScanInput input; + input.agg = handle->agg.get(); + input.ctx = &ctx; + input.type_filter = std::nullopt; + input.batch_size = batch_size; + input.shard_begin = shard_begin; + input.shard_end = shard_end; + input.group_by = group_by_ptr; + return scan_dfanalyzer_shards(input); + }, + outputs); + + for (auto& out : outputs) { + for (auto& r : out.events) + events_results.push_back(std::move(r)); + for (auto& r : out.profiles) + profiles_results.push_back(std::move(r)); + for (auto& r : out.system) + system_results.push_back(std::move(r)); + } + } + } catch (const std::exception& e) { + error_msg = e.what(); + } + Py_END_ALLOW_THREADS + + if (!error_msg.empty()) { + Py_DECREF(events_list); + Py_DECREF(profiles_list); + Py_DECREF(system_list); + Py_DECREF(result_dict); + PyErr_SetString(PyExc_RuntimeError, error_msg.c_str()); + return nullptr; + } + + append_results_to_list(events_list, events_results); + append_results_to_list(profiles_list, profiles_results); + append_results_to_list(system_list, system_results); + + PyDict_SetItemString(result_dict, "events", events_list); + PyDict_SetItemString(result_dict, "profiles", profiles_list); + PyDict_SetItemString(result_dict, "system", system_list); + Py_DECREF(events_list); + Py_DECREF(profiles_list); + Py_DECREF(system_list); + + return result_dict; +} + +// --------------------------------------------------------------------------- +// scan_aggregation_manifest — module-level entry point for analyze_trace. +// +// Each Dask worker calls this with its slice of the agg manifest +// (agg_ssts + sys_ssts) and optionally a [shard_begin, shard_end) range. 
+// The function opens a scratch IndexDatabase at `scratch_dir`, ingests the +// SSTs into its AGGREGATION/SYSTEM_METRICS CFs (nearly free when SSTs live +// on the same filesystem as `scratch_dir` — RocksDB hard-links them), then +// runs the same parallel shard scan that `iter_arrow_dfanalyzer_all` uses. +// +// AGG_GLOBAL_CONFIG_KEY is not written by worker SSTs, so we construct the +// EventAggregator with config_hash=0 directly instead of going through +// `open_agg_db` (which requires the config key). The config hash is used +// by the aggregator only for write-time validation, not for reads. +// +// The scratch DB is NOT cleaned up here — the Python caller owns +// `scratch_dir` lifetime and should remove it after gathering results. +// --------------------------------------------------------------------------- + +static bool collect_string_list(PyObject* obj, const char* name, + std::vector& out) { + if (!obj || obj == Py_None) return true; + PyObject* seq = PySequence_Fast(obj, name); + if (!seq) return false; + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); + out.reserve(static_cast(n)); + for (Py_ssize_t i = 0; i < n; ++i) { + PyObject* item = PySequence_Fast_GET_ITEM(seq, i); + if (!PyUnicode_Check(item)) { + Py_DECREF(seq); + PyErr_Format(PyExc_TypeError, "%s items must be str", name); + return false; + } + const char* s = PyUnicode_AsUTF8(item); + if (!s) { + Py_DECREF(seq); + return false; + } + out.emplace_back(s); + } + Py_DECREF(seq); + return true; +} + +static bool collect_string_string_dict( + PyObject* obj, const char* name, + std::unordered_map& out) { + if (!obj || obj == Py_None) return true; + if (!PyDict_Check(obj)) { + PyErr_Format(PyExc_TypeError, "%s must be a dict[str, str] or None", + name); + return false; + } + PyObject *k, *v; + Py_ssize_t pos = 0; + while (PyDict_Next(obj, &pos, &k, &v)) { + if (!PyUnicode_Check(k) || !PyUnicode_Check(v)) { + PyErr_Format(PyExc_TypeError, "%s must map str -> str", name); + return false; + } + 
const char* ks = PyUnicode_AsUTF8(k); + const char* vs = PyUnicode_AsUTF8(v); + if (!ks || !vs) return false; + out.emplace(ks, vs); + } + return true; +} + +static PyObject* scan_aggregation_manifest_fn(PyObject* /*self*/, + PyObject* args, PyObject* kwds) { + static const char* kwlist[] = { + "agg_ssts", "sys_ssts", "scratch_dir", + "meta_index_path", "batch_size", "time_granularity", + "time_resolution", "query", "group_by", + "shard_begin", "shard_end", "runtime", + "file_hashes", "host_hashes", nullptr}; + + PyObject* agg_ssts_obj = nullptr; + PyObject* sys_ssts_obj = nullptr; + const char* scratch_dir = nullptr; + const char* meta_index_path = nullptr; + Py_ssize_t batch_size = 10000; + double time_granularity = 1.0; + double time_resolution = 1000000.0; + const char* query_str = nullptr; + PyObject* group_by_obj = nullptr; + int shard_begin_i = 0; + int shard_end_i = DFT_NUM_SHARDS; + PyObject* runtime_obj = nullptr; + PyObject* file_hashes_obj = nullptr; + PyObject* host_hashes_obj = nullptr; + + if (!PyArg_ParseTupleAndKeywords( + args, kwds, "OOss|nddzOiiOOO", (char**)kwlist, &agg_ssts_obj, + &sys_ssts_obj, &scratch_dir, &meta_index_path, &batch_size, + &time_granularity, &time_resolution, &query_str, &group_by_obj, + &shard_begin_i, &shard_end_i, &runtime_obj, &file_hashes_obj, + &host_hashes_obj)) { + return nullptr; + } + + if (shard_begin_i < 0 || shard_end_i > DFT_NUM_SHARDS || + shard_begin_i >= shard_end_i) { + PyErr_Format(PyExc_ValueError, + "shard range [%d, %d) invalid (must be within [0, %d))", + shard_begin_i, shard_end_i, (int)DFT_NUM_SHARDS); + return nullptr; + } + + std::vector agg_ssts; + std::vector sys_ssts; + if (!collect_string_list(agg_ssts_obj, "agg_ssts", agg_ssts)) + return nullptr; + if (!collect_string_list(sys_ssts_obj, "sys_ssts", sys_ssts)) + return nullptr; + + std::unordered_map preloaded_file_hashes; + std::unordered_map preloaded_host_hashes; + const bool hashes_preloaded = + (file_hashes_obj && file_hashes_obj != 
Py_None) || + (host_hashes_obj && host_hashes_obj != Py_None); + if (!collect_string_string_dict(file_hashes_obj, "file_hashes", + preloaded_file_hashes)) + return nullptr; + if (!collect_string_string_dict(host_hashes_obj, "host_hashes", + preloaded_host_hashes)) + return nullptr; + + auto query_opt = parse_query_arg(query_str); + if (!query_opt && PyErr_Occurred()) return nullptr; + + GroupByConfig group_by_cfg; + if (!parse_group_by_arg(group_by_obj, group_by_cfg)) return nullptr; + const GroupByConfig* group_by_ptr = + group_by_cfg.mask != 0 ? &group_by_cfg : nullptr; + + Runtime* rt = nullptr; + if (runtime_obj && runtime_obj != Py_None) { + if (!PyObject_TypeCheck(runtime_obj, &RuntimeType)) { + PyErr_SetString(PyExc_TypeError, + "runtime must be a Runtime instance or None"); + return nullptr; + } + rt = ((RuntimeObject*)runtime_obj)->runtime.get(); + } else { + rt = get_default_runtime(); + } + + PyObject* result_dict = PyDict_New(); + if (!result_dict) return nullptr; + PyObject* events_list = PyList_New(0); + PyObject* profiles_list = PyList_New(0); + PyObject* system_list = PyList_New(0); + if (!events_list || !profiles_list || !system_list) { + Py_XDECREF(events_list); + Py_XDECREF(profiles_list); + Py_XDECREF(system_list); + Py_DECREF(result_dict); + return nullptr; + } + + std::string error_msg; + std::vector events_results, profiles_results, + system_results; + std::string scratch_index_path = std::string(scratch_dir) + "/.dftindex"; + std::string meta_index_path_str(meta_index_path); + + Py_BEGIN_ALLOW_THREADS try { + namespace rcf = dftracer::utils::rocksdb::cf; + using clock = std::chrono::steady_clock; + auto ms = [](clock::time_point a, clock::time_point b) -> long long { + return std::chrono::duration_cast(b - a) + .count(); + }; + + auto t_start = clock::now(); + dftracer::utils::utilities::indexer::IndexDatabase scratch_db( + scratch_index_path); + auto t_scratch_open = clock::now(); + + auto raw_db = scratch_db.db(); + for (const auto& p : 
agg_ssts) { + auto st = raw_db->ingest_external_files(rcf::AGGREGATION, {p}, + /*ingest_behind=*/false); + if (!st.ok()) { + error_msg = + "ingest AGGREGATION sst '" + p + "': " + st.ToString(); + break; + } + } + if (error_msg.empty()) { + for (const auto& p : sys_ssts) { + auto st = raw_db->ingest_external_files( + rcf::SYSTEM_METRICS, {p}, /*ingest_behind=*/false); + if (!st.ok()) { + error_msg = "ingest SYSTEM_METRICS sst '" + p + + "': " + st.ToString(); + break; + } + } + } + auto t_ingest = clock::now(); + + if (error_msg.empty()) { + auto agg = + std::make_unique(raw_db, /*cfg_hash=*/0); + + // If the caller passed pre-loaded hash tables, skip opening + // the meta DB on lustre. When many dask workers run + // scan_aggregation_manifest in parallel, loading the hash + // tables N times from the same file is significant lustre + // metadata pressure; loading once on the coordinator and + // passing them in eliminates the redundant reads. + std::unordered_map loaded_file_hashes; + std::unordered_map loaded_host_hashes; + std::unique_ptr + meta_db; + if (!hashes_preloaded) { + meta_db = std::make_unique< + dftracer::utils::utilities::indexer::IndexDatabase>( + meta_index_path_str, dftracer::utils::rocksdb:: + RocksDatabase::OpenMode::ReadOnly); + loaded_file_hashes = meta_db->query_hash_table( + dftracer::utils::utilities::indexer::IndexDatabase:: + HashType::FILE); + loaded_host_hashes = meta_db->query_hash_table( + dftracer::utils::utilities::indexer::IndexDatabase:: + HashType::HOST); + } + const auto& file_hashes = + hashes_preloaded ? preloaded_file_hashes : loaded_file_hashes; + const auto& host_hashes = + hashes_preloaded ? preloaded_host_hashes : loaded_host_hashes; + auto t_hash_tables = clock::now(); + + auto time_bounds = agg->query_time_bounds(); + std::uint64_t time_origin = + time_bounds.valid ? 
time_bounds.min_time_bucket : 0; + + DfanalyzerContext ctx; + ctx.file_hashes = &file_hashes; + ctx.host_hashes = &host_hashes; + ctx.query_filter = query_opt ? &*query_opt : nullptr; + ctx.time_origin = time_origin; + ctx.time_resolution = time_resolution; + ctx.time_granularity = time_granularity; + + std::vector outputs; + parallel_shard_scan_range( + rt, static_cast(shard_begin_i), + static_cast(shard_end_i), + [&](std::uint16_t sb, std::uint16_t se) { + DfanalyzerScanInput input; + input.agg = agg.get(); + input.ctx = &ctx; + input.type_filter = std::nullopt; + input.batch_size = batch_size; + input.shard_begin = sb; + input.shard_end = se; + input.group_by = group_by_ptr; + return scan_dfanalyzer_shards(input); + }, + outputs); + auto t_scan = clock::now(); + + for (auto& out : outputs) { + for (auto& r : out.events) + events_results.push_back(std::move(r)); + for (auto& r : out.profiles) + profiles_results.push_back(std::move(r)); + for (auto& r : out.system) + system_results.push_back(std::move(r)); + } + + std::fprintf( + stderr, + "[scan_aggregation_manifest] n_agg=%zu n_sys=%zu " + "scratch_open=%lldms ingest=%lldms hash_tables=%lldms " + "scan=%lldms\n", + agg_ssts.size(), sys_ssts.size(), ms(t_start, t_scratch_open), + ms(t_scratch_open, t_ingest), ms(t_ingest, t_hash_tables), + ms(t_hash_tables, t_scan)); + std::fflush(stderr); + } + } catch (const std::exception& e) { + error_msg = e.what(); + } + Py_END_ALLOW_THREADS + + if (!error_msg.empty()) { + Py_DECREF(events_list); + Py_DECREF(profiles_list); + Py_DECREF(system_list); + Py_DECREF(result_dict); + PyErr_SetString(PyExc_RuntimeError, error_msg.c_str()); + return nullptr; + } + + append_results_to_list(events_list, events_results); + append_results_to_list(profiles_list, profiles_results); + append_results_to_list(system_list, system_results); + + PyDict_SetItemString(result_dict, "events", events_list); + PyDict_SetItemString(result_dict, "profiles", profiles_list); + 
// Module-level (not per-type) function table. init_indexer() attaches it to
// the extension module with PyModule_AddFunctions when
// DFTRACER_UTILS_ENABLE_ARROW is defined. The handler is registered with
// METH_VARARGS | METH_KEYWORDS, hence the PyCFunction cast. The table ends
// with an all-null sentinel entry.
static PyMethodDef BatchIndexerModuleMethods[] = {
    {"scan_aggregation_manifest", (PyCFunction)scan_aggregation_manifest_fn,
     METH_VARARGS | METH_KEYWORDS,
     "scan_aggregation_manifest(agg_ssts, sys_ssts, scratch_dir, "
     "meta_index_path, batch_size=10000, time_granularity=1.0, "
     "time_resolution=1e6, query=None, group_by=None, shard_begin=0, "
     "shard_end=4096, runtime=None) -> dict\n"
     "--\n\n"
     "Scan a worker's slice of the distributed aggregation manifest.\n\n"
     "Ingests agg_ssts + sys_ssts into a scratch IndexDatabase at "
     "scratch_dir (caller owns the directory lifecycle) and runs the "
     "dfanalyzer aggregation scan over [shard_begin, shard_end). "
     "meta_index_path is the unified .dftindex used to resolve file / "
     "host hashes. Returns the same dict shape as "
     "Indexer.iter_arrow_dfanalyzer_all."},
    {nullptr, nullptr, 0, nullptr}};
{"get_hash_table", (PyCFunction)Indexer_get_hash_table, METH_VARARGS, + "get_hash_table(type)\n" + "--\n\n" + "Query hash table mappings.\n\n" + "Args:\n" + " type: 'file', 'host', 'string', or 'proc'\n\n" + "Returns:\n" + " dict mapping hash values to resolved names.\n"}, + {"query_file_pids", (PyCFunction)Indexer_query_file_pids, METH_VARARGS, + "query_file_pids(file_id)\n" + "--\n\n" + "Query PIDs observed in a specific file.\n\n" + "Args:\n" + " file_id: Integer file ID from index.\n\n" + "Returns:\n" + " set of PIDs.\n"}, + {"query_all_file_pids", (PyCFunction)Indexer_query_all_file_pids, + METH_NOARGS, + "query_all_file_pids()\n" + "--\n\n" + "Query PIDs for all indexed files.\n\n" + "Returns:\n" + " dict mapping file_id to set of PIDs.\n"}, + {"query_file_info", (PyCFunction)Indexer_query_file_info, METH_NOARGS, + "query_file_info()\n" + "--\n\n" + "Query file ID to path mapping and per-file PIDs in one call.\n\n" + "Returns:\n" + " tuple of (dict[int, str], dict[int, set[int]]).\n"}, +#ifdef DFTRACER_UTILS_ENABLE_ARROW + {"iter_aggregation", (PyCFunction)Indexer_iter_aggregation, + METH_VARARGS | METH_KEYWORDS, + "iter_aggregation(type='events', batch_size=10000)\n" + "--\n\n" + "Iterate over aggregation data as Arrow batches.\n\n" + "Args:\n" + " type: 'events', 'profiles', or 'system'\n" + " batch_size: Number of entries per batch (default 10000)\n\n" + "Returns:\n" + " Iterator over Arrow batch capsules.\n"}, + {"iter_arrow_dfanalyzer", (PyCFunction)Indexer_iter_arrow_dfanalyzer, + METH_VARARGS | METH_KEYWORDS, + "iter_arrow_dfanalyzer(type='events', batch_size=10000, " + "time_granularity=1.0, time_resolution=1e6, query=None)\n" + "--\n\n" + "Iterate over aggregation data as dfanalyzer-compatible Arrow batches.\n\n" + "Output schema matches dfanalyzer expectations with resolved hashes,\n" + "normalized time_range, and computed columns (proc_name, io_cat).\n\n" + "Args:\n" + " type: 'events', 'profiles', or 'system'\n" + " batch_size: Number of entries 
per batch (default 10000)\n" + " time_granularity: Bucket width in seconds (default 1.0)\n" + " time_resolution: Microseconds per output time unit (default 1e6)\n" + " query: Optional query filter string (e.g., \"pid == 1234\")\n\n" + "Returns:\n" + " Iterator over Arrow batch capsules.\n"}, + {"iter_arrow_dfanalyzer_all", + (PyCFunction)Indexer_iter_arrow_dfanalyzer_all, + METH_VARARGS | METH_KEYWORDS, + "iter_arrow_dfanalyzer_all(batch_size=10000, time_granularity=1.0, " + "time_resolution=1e6, query=None, group_by=None)\n" + "--\n\n" + "Iterate over all aggregation types in a single scan.\n\n" + "Returns a dict with 'events', 'profiles', 'system' keys, each " + "containing\n" + "a list of Arrow batch capsules. This is ~3x faster than calling\n" + "iter_arrow_dfanalyzer separately for each type.\n\n" + "When group_by is provided, the scan collapses dimensions during " + "aggregation\n" + "and emits a reduced schema containing only the requested columns plus\n" + "aggregated metrics (count, time, size, time_sq, size_sq, time_min,\n" + "time_max, size_min, size_max, time_call_min, time_call_max, " + "size_call_min,\n" + "size_call_max, time_start, time_end). 
Supported group_by columns: " + "cat,\n" + "func_name, pid, tid, file_hash, host_hash, file_name, host_name, " + "proc_name,\n" + "io_cat, acc_pat, time_range.\n\n" + "Args:\n" + " batch_size: Number of entries per batch (default 10000)\n" + " time_granularity: Bucket width in seconds (default 1.0)\n" + " time_resolution: Microseconds per output time unit (default 1e6)\n" + " query: Optional query filter string\n" + " group_by: Optional list of columns to group by; enables coarse\n" + " in-scan aggregation (default None = full granularity)\n\n" + "Returns:\n" + " dict with 'events', 'profiles', 'system' lists of Arrow capsules.\n"}, +#endif + {nullptr}}; + +static PyGetSetDef Indexer_getsetters[] = {{nullptr}}; + +PyTypeObject IndexerType = { + PyVarObject_HEAD_INIT(nullptr, 0) "dftracer_utils_ext.Indexer", + sizeof(IndexerObject), + 0, + (destructor)Indexer_dealloc, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, + "BatchIndexer(directory='', files=None, index_dir='',\n" + " require_checkpoint=True, require_bloom=True,\n" + " require_manifest=True, require_aggregation=False,\n" + " time_interval_ms=5000.0, group_keys=None,\n" + " custom_metric_fields=None, compute_percentiles=False,\n" + " parallelism=0, force_rebuild=False, runtime=None)\n" + "--\n\n" + "Indexer with tiered index building.\n\n" + "At least one of 'directory' or 'files' must be provided.\n" + "- directory: scan for .pfw/.pfw.gz files\n" + "- files: list of specific file paths\n\n" + "Supports:\n" + "- Tier 1: Checkpoints (require_checkpoint)\n" + "- Tier 2: Bloom filters (require_bloom), Manifests (require_manifest)\n" + "- Tier 3: Aggregation (require_aggregation + config params)\n", + 0, + 0, + 0, + 0, + 0, + 0, + Indexer_methods, + 0, + Indexer_getsetters, + 0, + 0, + 0, + 0, + 0, + (initproc)Indexer_init, + 0, + Indexer_new, +}; + +int init_indexer(PyObject* m) { + if (PyType_Ready(&IndexerType) < 0) return -1; + + 
Py_INCREF(&IndexerType); + if (PyModule_AddObject(m, "Indexer", (PyObject*)&IndexerType) < 0) { + Py_DECREF(&IndexerType); + return -1; + } + +#ifdef DFTRACER_UTILS_ENABLE_ARROW + if (PyModule_AddFunctions(m, BatchIndexerModuleMethods) < 0) return -1; +#endif + + return 0; +} diff --git a/src/dftracer/utils/python/batch_indexer.h b/src/dftracer/utils/python/batch_indexer.h new file mode 100644 index 00000000..d7dd9aa6 --- /dev/null +++ b/src/dftracer/utils/python/batch_indexer.h @@ -0,0 +1,38 @@ +#ifndef DFTRACER_UTILS_PYTHON_BATCH_INDEXER_H +#define DFTRACER_UTILS_PYTHON_BATCH_INDEXER_H + +#include + +#include +#include + +struct IndexerObject { + PyObject_HEAD + + PyObject* runtime_obj; + PyObject* directory; + PyObject* files; // Python list of file paths or None + PyObject* index_dir; + + // Tier requirements + int require_checkpoint; + int require_bloom; + int require_manifest; + int require_aggregation; + + // Aggregation config (stored for rebuild) + double time_interval_ms; + PyObject* group_keys; // Python list or None + PyObject* custom_metric_fields; // Python list or None + int compute_percentiles; + + std::size_t checkpoint_size; + std::size_t parallelism; + int force_rebuild; +}; + +extern PyTypeObject IndexerType; + +int init_indexer(PyObject* m); + +#endif // DFTRACER_UTILS_PYTHON_BATCH_INDEXER_H diff --git a/src/dftracer/utils/python/dftracer_utils_ext.cpp b/src/dftracer/utils/python/dftracer_utils_ext.cpp index 77d7b528..9ae169a8 100644 --- a/src/dftracer/utils/python/dftracer_utils_ext.cpp +++ b/src/dftracer/utils/python/dftracer_utils_ext.cpp @@ -1,9 +1,14 @@ #define PY_SSIZE_T_CLEAN #include +#include +#include +#include #include #include #include +#include #include +#include #include #include #include @@ -14,11 +19,18 @@ #include #include #include +#ifdef DFTRACER_UTILS_ENABLE_ARROW +#include +#include +#endif +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC +#include +#endif static PyModuleDef dftracer_utils_module = { PyModuleDef_HEAD_INIT, 
"dftracer_utils_ext", /* m_name */ - "DFTracer utils module with indexer, reader, lazy JSON, " + "DFTracer utils module with indexer, reader, " "and utility bindings", /* m_doc */ -1, /* m_size */ NULL, /* m_methods */ @@ -33,11 +45,21 @@ PyMODINIT_FUNC PyInit_dftracer_utils_ext(void) { m = PyModule_Create(&dftracer_utils_module); if (m == NULL) return NULL; if (init_indexer_checkpoint(m) < 0) return NULL; - if (init_json(m) < 0) return NULL; + if (init_checkpoint_indexer(m) < 0) return NULL; if (init_indexer(m) < 0) return NULL; if (init_task_handle(m) < 0) return NULL; if (init_runtime(m) < 0) return NULL; + if (dftracer::utils::python::init_memoryview_batch(m) < 0) return NULL; + if (init_json_dict_value(m) < 0) return NULL; if (init_trace_reader_iterator(m) < 0) return NULL; +#ifdef DFTRACER_UTILS_ENABLE_ARROW + if (dftracer::utils::python::init_arrow_streaming_iterator(m) < 0) + return NULL; + if (init_arrow_batch_stream(m) < 0) return NULL; +#endif +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + if (dftracer::utils::python::init_arrow_parallel_reader(m) < 0) return NULL; +#endif if (init_trace_reader(m) < 0) return NULL; if (init_statistics_query(m) < 0) return NULL; if (init_statistics_aggregator(m) < 0) return NULL; @@ -46,5 +68,7 @@ PyMODINIT_FUNC PyInit_dftracer_utils_ext(void) { if (init_reconstruction_planner(m) < 0) return NULL; if (init_aggregator(m) < 0) return NULL; if (init_comparator(m) < 0) return NULL; + if (init_index_database(m) < 0) return NULL; + if (init_sst_distribution(m) < 0) return NULL; return m; } diff --git a/src/dftracer/utils/python/index_database.cpp b/src/dftracer/utils/python/index_database.cpp new file mode 100644 index 00000000..9f29b18a --- /dev/null +++ b/src/dftracer/utils/python/index_database.cpp @@ -0,0 +1,363 @@ +#include +#include +#include +#include + +#include +#include +#include +#include + +using dftracer::utils::utilities::indexer::IndexDatabase; +using dftracer::utils::utilities::indexer::SstArtifactRegistry; + +static 
void IndexDatabase_dealloc(IndexDatabaseObject *self) { + self->db.~shared_ptr(); + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static PyObject *IndexDatabase_new(PyTypeObject *type, PyObject * /*args*/, + PyObject * /*kwds*/) { + auto *self = (IndexDatabaseObject *)type->tp_alloc(type, 0); + if (!self) return NULL; + new (&self->db) std::shared_ptr(); + return (PyObject *)self; +} + +static int IndexDatabase_init(IndexDatabaseObject *self, PyObject *args, + PyObject *kwds) { + static const char *kwlist[] = {"index_path", NULL}; + const char *index_path; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "s", (char **)kwlist, + &index_path)) { + return -1; + } + try { + self->db = std::make_shared(index_path); + } catch (const std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return -1; + } + return 0; +} + +static PyObject *IndexDatabase_init_schema(IndexDatabaseObject *self, + PyObject * /*ignored*/) { + if (!self->db) { + PyErr_SetString(PyExc_RuntimeError, "IndexDatabase not initialised"); + return NULL; + } + try { + Py_BEGIN_ALLOW_THREADS self->db->init_schema(); + Py_END_ALLOW_THREADS + } catch (const std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return NULL; + } + Py_RETURN_NONE; +} + +static PyObject *IndexDatabase_register_files(IndexDatabaseObject *self, + PyObject *args, PyObject *kwds) { + static const char *kwlist[] = {"paths", "build_manifest", NULL}; + PyObject *paths_obj; + int build_manifest = 0; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|p", (char **)kwlist, + &paths_obj, &build_manifest)) { + return NULL; + } + std::vector paths; + PyObject *seq = PySequence_Fast(paths_obj, "paths must be a sequence"); + if (!seq) return NULL; + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); + paths.reserve(n); + for (Py_ssize_t i = 0; i < n; ++i) { + PyObject *item = PySequence_Fast_GET_ITEM(seq, i); + const char *s = PyUnicode_AsUTF8(item); + if (!s) { + Py_DECREF(seq); + return NULL; + } + 
paths.emplace_back(s); + } + Py_DECREF(seq); + + std::vector ids; + try { + Py_BEGIN_ALLOW_THREADS ids = + self->db->register_files(paths, build_manifest != 0); + Py_END_ALLOW_THREADS + } catch (const std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return NULL; + } + + PyObject *out = PyList_New(static_cast(ids.size())); + if (!out) return NULL; + for (Py_ssize_t i = 0; i < static_cast(ids.size()); ++i) { + PyList_SET_ITEM(out, i, PyLong_FromLong(ids[i])); + } + return out; +} + +static PyObject *IndexDatabase_reserve_file_id_range(IndexDatabaseObject *self, + PyObject *args) { + Py_ssize_t count; + if (!PyArg_ParseTuple(args, "n", &count)) return NULL; + if (count < 0) { + PyErr_SetString(PyExc_ValueError, "count must be >= 0"); + return NULL; + } + int first; + try { + Py_BEGIN_ALLOW_THREADS first = + self->db->reserve_file_id_range(static_cast(count)); + Py_END_ALLOW_THREADS + } catch (const std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return NULL; + } + return PyLong_FromLong(first); +} + +static PyObject *IndexDatabase_bulk_ingest(IndexDatabaseObject *self, + PyObject *args, PyObject *kwds) { + static const char *kwlist[] = {"registry", "skip_cfs", NULL}; + PyObject *registry_obj; + PyObject *skip_cfs_obj = NULL; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O", (char **)kwlist, + ®istry_obj, &skip_cfs_obj)) { + return NULL; + } + + SstArtifactRegistry *registry = sst_artifact_registry_get(registry_obj); + if (!registry) { + PyErr_SetString(PyExc_TypeError, + "expected an SstArtifactRegistry instance"); + return NULL; + } + + std::unordered_set skip_cfs; + if (skip_cfs_obj && skip_cfs_obj != Py_None) { + PyObject *seq = + PySequence_Fast(skip_cfs_obj, "skip_cfs must be an iterable"); + if (!seq) return NULL; + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); + for (Py_ssize_t i = 0; i < n; ++i) { + PyObject *item = PySequence_Fast_GET_ITEM(seq, i); + const char *s = PyUnicode_AsUTF8(item); + if (!s) { + 
Py_DECREF(seq); + return NULL; + } + skip_cfs.emplace(s); + } + Py_DECREF(seq); + } + + try { + Py_BEGIN_ALLOW_THREADS self->db->bulk_ingest(*registry, skip_cfs); + Py_END_ALLOW_THREADS + } catch (const std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return NULL; + } + Py_RETURN_NONE; +} + +static PyObject *IndexDatabase_write_agg_file_markers(IndexDatabaseObject *self, + PyObject *args) { + PyObject *ids_obj; + if (!PyArg_ParseTuple(args, "O", &ids_obj)) return NULL; + + PyObject *seq = PySequence_Fast(ids_obj, "file_ids must be an iterable"); + if (!seq) return NULL; + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); + std::vector file_ids; + file_ids.reserve(static_cast(n)); + for (Py_ssize_t i = 0; i < n; ++i) { + PyObject *item = PySequence_Fast_GET_ITEM(seq, i); + long v = PyLong_AsLong(item); + if (v == -1 && PyErr_Occurred()) { + Py_DECREF(seq); + return NULL; + } + file_ids.push_back(static_cast(v)); + } + Py_DECREF(seq); + + try { + Py_BEGIN_ALLOW_THREADS self->db->write_agg_file_markers(file_ids); + Py_END_ALLOW_THREADS + } catch (const std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return NULL; + } + Py_RETURN_NONE; +} + +static PyObject *IndexDatabase_write_agg_global_config( + IndexDatabaseObject *self, PyObject *args, PyObject *kwds) { + static const char *kwlist[] = {"time_interval_us", "config_hash", NULL}; + unsigned long long time_interval_us = 0; + unsigned int config_hash = 0; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "K|I", (char **)kwlist, + &time_interval_us, &config_hash)) { + return NULL; + } + try { + Py_BEGIN_ALLOW_THREADS self->db->write_agg_global_config( + static_cast(time_interval_us), + static_cast(config_hash)); + Py_END_ALLOW_THREADS + } catch (const std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return NULL; + } + Py_RETURN_NONE; +} + +static PyObject *IndexDatabase_write_aggregation_tracker( + IndexDatabaseObject *self, PyObject *args) { + PyObject 
*blobs_obj; + if (!PyArg_ParseTuple(args, "O", &blobs_obj)) return NULL; + PyObject *seq = PySequence_Fast(blobs_obj, "blobs must be an iterable"); + if (!seq) return NULL; + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); + std::vector blobs; + blobs.reserve(static_cast(n)); + for (Py_ssize_t i = 0; i < n; ++i) { + PyObject *item = PySequence_Fast_GET_ITEM(seq, i); + if (item == Py_None) continue; + char *buf = nullptr; + Py_ssize_t len = 0; + if (PyBytes_Check(item)) { + if (PyBytes_AsStringAndSize(item, &buf, &len) < 0) { + Py_DECREF(seq); + return NULL; + } + } else { + Py_DECREF(seq); + PyErr_SetString(PyExc_TypeError, + "blobs entries must be bytes or None"); + return NULL; + } + if (len > 0) blobs.emplace_back(buf, static_cast(len)); + } + Py_DECREF(seq); + try { + Py_BEGIN_ALLOW_THREADS self->db->write_aggregation_tracker(blobs); + Py_END_ALLOW_THREADS + } catch (const std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return NULL; + } + Py_RETURN_NONE; +} + +static PyObject *IndexDatabase_rebuild_root_summaries(IndexDatabaseObject *self, + PyObject * /*ignored*/) { + try { + Py_BEGIN_ALLOW_THREADS self->db->rebuild_root_summaries(); + Py_END_ALLOW_THREADS + } catch (const std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return NULL; + } + Py_RETURN_NONE; +} + +static PyMethodDef IndexDatabase_methods[] = { + {"init_schema", (PyCFunction)IndexDatabase_init_schema, METH_NOARGS, + "Idempotently initialise the schema version key."}, + {"register_files", (PyCFunction)IndexDatabase_register_files, + METH_VARARGS | METH_KEYWORDS, + "register_files(paths, build_manifest=False) -> list[int]\n" + "Register each path in the DEFAULT-CF file registry and return the " + "assigned file_ids. 
Idempotent for files with matching hash."}, + {"reserve_file_id_range", (PyCFunction)IndexDatabase_reserve_file_id_range, + METH_VARARGS, + "reserve_file_id_range(count) -> int\n" + "Atomically reserve `count` contiguous file_ids, return the first."}, + {"bulk_ingest", (PyCFunction)IndexDatabase_bulk_ingest, + METH_VARARGS | METH_KEYWORDS, + "bulk_ingest(registry, skip_cfs=None) -> None\n" + "Ingest all SSTs collected in the SstArtifactRegistry.\n" + "skip_cfs is an optional iterable of CF names whose SSTs are left " + "outside the unified DB (used by distributed builds to keep " + "AGGREGATION/SYSTEM_METRICS SSTs addressable by manifest)."}, + {"rebuild_root_summaries", + (PyCFunction)IndexDatabase_rebuild_root_summaries, METH_NOARGS, + "Recompute ROOT_* summary column families from per-file CFs."}, + {"write_agg_global_config", + (PyCFunction)IndexDatabase_write_agg_global_config, + METH_VARARGS | METH_KEYWORDS, + "write_agg_global_config(time_interval_us, config_hash=0) -> None\n" + "Write the AGG_GLOBAL_CONFIG_KEY marker into the AGGREGATION CF. " + "Required for `iter_arrow_dfanalyzer_all` on distributed builds " + "(which never materialise the key via worker SSTs) or " + "post-consolidate indices."}, + {"write_agg_file_markers", + (PyCFunction)IndexDatabase_write_agg_file_markers, METH_VARARGS, + "write_agg_file_markers(file_ids) -> None\n" + "Write per-file aggregation completion markers (\\xFF\\xFF + file_id) " + "into the AGGREGATION CF. 
Required after distributed_index otherwise " + "`ensure_indexed()` concludes aggregation is incomplete and re-runs " + "the entire build."}, + {"write_aggregation_tracker", + (PyCFunction)IndexDatabase_write_aggregation_tracker, METH_VARARGS, + "write_aggregation_tracker(blobs) -> None\n" + "Merge a list of serialized AssociationTracker bytes and write the " + "result to the AGGREGATION CF under the `__tracker__` key."}, + {NULL}}; + +PyTypeObject IndexDatabaseType = { + PyVarObject_HEAD_INIT(NULL, 0) "dftracer_utils_ext.IndexDatabase", + sizeof(IndexDatabaseObject), + 0, + (destructor)IndexDatabase_dealloc, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + Py_TPFLAGS_DEFAULT, + "Handle to a .dftindex RocksDB store.", + 0, + 0, + 0, + 0, + 0, + 0, + IndexDatabase_methods, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + (initproc)IndexDatabase_init, + 0, + IndexDatabase_new, +}; + +int init_index_database(PyObject *m) { + if (PyType_Ready(&IndexDatabaseType) < 0) return -1; + Py_INCREF(&IndexDatabaseType); + if (PyModule_AddObject(m, "IndexDatabase", (PyObject *)&IndexDatabaseType) < + 0) { + Py_DECREF(&IndexDatabaseType); + return -1; + } + return 0; +} diff --git a/src/dftracer/utils/python/index_database.h b/src/dftracer/utils/python/index_database.h new file mode 100644 index 00000000..66c408fb --- /dev/null +++ b/src/dftracer/utils/python/index_database.h @@ -0,0 +1,23 @@ +#ifndef DFTRACER_UTILS_PYTHON_INDEX_DATABASE_H +#define DFTRACER_UTILS_PYTHON_INDEX_DATABASE_H + +#include + +#include + +namespace dftracer::utils::utilities::indexer { +class IndexDatabase; +class SstArtifactRegistry; +} // namespace dftracer::utils::utilities::indexer + +typedef struct { + PyObject_HEAD + std::shared_ptr + db; +} IndexDatabaseObject; + +extern PyTypeObject IndexDatabaseType; + +int init_index_database(PyObject *m); + +#endif // DFTRACER_UTILS_PYTHON_INDEX_DATABASE_H diff --git a/src/dftracer/utils/python/indexer.cpp b/src/dftracer/utils/python/indexer.cpp index 
071a6986..80b95390 100644 --- a/src/dftracer/utils/python/indexer.cpp +++ b/src/dftracer/utils/python/indexer.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -9,8 +10,9 @@ #include #include +#include -static void Indexer_dealloc(IndexerObject *self) { +static void CheckpointIndexer_dealloc(CheckpointIndexerObject *self) { if (self->handle) { // The Python wrapper owns only the native indexer handle. The // underlying RocksDB instance remains manager-owned and may continue to @@ -24,7 +26,7 @@ static void Indexer_dealloc(IndexerObject *self) { Py_TYPE(self)->tp_free((PyObject *)self); } -static void Indexer_release_handle(IndexerObject *self) { +static void CheckpointIndexer_release_handle(CheckpointIndexerObject *self) { if (self->handle) { // Releasing the handle drops this wrapper's native indexer state only. // Shared RocksDB lifetime is managed separately by RocksDBManager. @@ -33,10 +35,10 @@ static void Indexer_release_handle(IndexerObject *self) { } } -static PyObject *Indexer_new(PyTypeObject *type, PyObject *args, - PyObject *kwds) { - IndexerObject *self; - self = (IndexerObject *)type->tp_alloc(type, 0); +static PyObject *CheckpointIndexer_new(PyTypeObject *type, PyObject *args, + PyObject *kwds) { + CheckpointIndexerObject *self; + self = (CheckpointIndexerObject *)type->tp_alloc(type, 0); if (self != NULL) { self->handle = NULL; self->gz_path = NULL; @@ -44,18 +46,16 @@ static PyObject *Indexer_new(PyTypeObject *type, PyObject *args, self->checkpoint_size = 0; self->build_bloom = 0; self->build_manifest = 0; - self->index_threshold = - dftracer::utils::constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD; self->runtime_obj = NULL; } return (PyObject *)self; } -static int Indexer_init(IndexerObject *self, PyObject *args, PyObject *kwds) { +static int CheckpointIndexer_init(CheckpointIndexerObject *self, PyObject *args, + PyObject *kwds) { static const char *kwlist[] = { - "gz_path", "index_path", "checkpoint_size", - "force_rebuild", 
"build_bloom", "build_manifest", - "index_threshold", "runtime", NULL}; + "gz_path", "index_path", "checkpoint_size", "force_rebuild", + "build_bloom", "build_manifest", "runtime", NULL}; const char *gz_path; const char *index_path = NULL; std::uint64_t checkpoint_size = @@ -63,14 +63,12 @@ static int Indexer_init(IndexerObject *self, PyObject *args, PyObject *kwds) { int force_rebuild = 0; int build_bloom = 0; int build_manifest = 0; - std::uint64_t index_threshold = - dftracer::utils::constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD; PyObject *runtime_arg = NULL; - if (!PyArg_ParseTupleAndKeywords( - args, kwds, "s|snpppnO", (char **)kwlist, &gz_path, &index_path, - &checkpoint_size, &force_rebuild, &build_bloom, &build_manifest, - &index_threshold, &runtime_arg)) { + if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|snpppO", (char **)kwlist, + &gz_path, &index_path, &checkpoint_size, + &force_rebuild, &build_bloom, + &build_manifest, &runtime_arg)) { return -1; } @@ -112,7 +110,6 @@ static int Indexer_init(IndexerObject *self, PyObject *args, PyObject *kwds) { self->checkpoint_size = checkpoint_size; self->build_bloom = build_bloom; self->build_manifest = build_manifest; - self->index_threshold = index_threshold; const char *index_path_str = PyUnicode_AsUTF8(self->index_path); if (!index_path_str) { @@ -129,72 +126,97 @@ static int Indexer_init(IndexerObject *self, PyObject *args, PyObject *kwds) { return 0; } -static dftracer::utils::Runtime *get_indexer_runtime(IndexerObject *self) { +static dftracer::utils::Runtime *get_indexer_runtime( + CheckpointIndexerObject *self) { if (self->runtime_obj) { return ((RuntimeObject *)self->runtime_obj)->runtime.get(); } return get_default_runtime(); } -static PyObject *Indexer_build(IndexerObject *self, - PyObject *Py_UNUSED(ignored)) { +static PyObject *CheckpointIndexer_build(CheckpointIndexerObject *self, + PyObject *Py_UNUSED(ignored)) { if (!self->handle) { PyErr_SetString(PyExc_RuntimeError, "Indexer not initialized"); 
return NULL; } - using namespace dftracer::utils; - using namespace dftracer::utils::utilities::indexer; + // Use IndexBatchBuilderUtility when bloom or manifest is requested. + // Otherwise, use the simpler dft_indexer_build which only creates + // checkpoints. + if (self->build_bloom || self->build_manifest) { + using namespace dftracer::utils; + using namespace dftracer::utils::utilities::indexer; - const char *gz = PyUnicode_AsUTF8(self->gz_path); - const char *idx = PyUnicode_AsUTF8(self->index_path); - if (!gz || !idx) { - return NULL; - } + const char *gz = PyUnicode_AsUTF8(self->gz_path); + const char *idx = PyUnicode_AsUTF8(self->index_path); + if (!gz || !idx) { + return NULL; + } - auto config = IndexBuildConfig::for_file(gz) - .with_checkpoint_size( - static_cast(self->checkpoint_size)) - .with_bloom(self->build_bloom != 0) - .with_manifest(self->build_manifest != 0) - .with_index_threshold( - static_cast(self->index_threshold)); - - std::string idx_str(idx); - auto pos = idx_str.find_last_of('/'); - if (pos != std::string::npos) { - config.with_index_dir(idx_str.substr(0, pos)); - } + auto batch_config = std::make_shared(); + batch_config->file_paths.emplace_back(gz); + batch_config->checkpoint_size = + static_cast(self->checkpoint_size); + batch_config->build_manifest = self->build_manifest != 0; + batch_config->parallelism = 1; + batch_config->use_batch_write = true; + batch_config->rebuild_root_summaries = true; + + std::string idx_str(idx); + auto pos = idx_str.find_last_of('/'); + if (pos != std::string::npos) { + batch_config->index_dir = idx_str.substr(0, pos); + } - Runtime *rt = get_indexer_runtime(self); - IndexBuildResult build_result; + Runtime *rt = get_indexer_runtime(self); + IndexBuildBatchResult batch_result; + + try { + Py_BEGIN_ALLOW_THREADS rt + ->submit( + run_coro_scope( + rt->executor(), + [](CoroScope &scope, + std::shared_ptr cfg, + IndexBuildBatchResult *out) -> coro::CoroTask { + *out = co_await 
IndexBatchBuilderUtility::process( + &scope, std::move(cfg)); + }, + batch_config, &batch_result), + "indexer-build") + .get(); + Py_END_ALLOW_THREADS + } catch (const std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return NULL; + } - try { - auto build_coro = - [](IndexBuildConfig cfg) -> coro::CoroTask { - IndexBuilderUtility builder; - co_return co_await builder.process(cfg); - }; - - Py_BEGIN_ALLOW_THREADS auto handle = - rt->submit(build_coro(config), "indexer-build"); - build_result = handle.get(); + if (batch_result.failed > 0 && !batch_result.results.empty()) { + const auto &result = batch_result.results[0]; + if (!result.success) { + PyErr_SetString(PyExc_RuntimeError, + result.error_message.c_str()); + return NULL; + } + } + } else { + // Simple checkpoint-only build + int result; + Py_BEGIN_ALLOW_THREADS result = dft_indexer_build(self->handle); Py_END_ALLOW_THREADS - } catch (const std::exception &e) { - PyErr_SetString(PyExc_RuntimeError, e.what()); - return NULL; - } - if (!build_result.success) { - PyErr_SetString(PyExc_RuntimeError, build_result.error_message.c_str()); - return NULL; + if (result < 0) { + PyErr_SetString(PyExc_RuntimeError, "Failed to build index"); + return NULL; + } } Py_RETURN_NONE; } -static PyObject *Indexer_need_rebuild(IndexerObject *self, - PyObject *Py_UNUSED(ignored)) { +static PyObject *CheckpointIndexer_need_rebuild(CheckpointIndexerObject *self, + PyObject *Py_UNUSED(ignored)) { if (!self->handle) { PyErr_SetString(PyExc_RuntimeError, "Indexer not initialized"); return NULL; @@ -204,8 +226,8 @@ static PyObject *Indexer_need_rebuild(IndexerObject *self, return PyBool_FromLong(result); } -static PyObject *Indexer_exists(IndexerObject *self, - PyObject *Py_UNUSED(ignored)) { +static PyObject *CheckpointIndexer_exists(CheckpointIndexerObject *self, + PyObject *Py_UNUSED(ignored)) { if (!self->handle) { PyErr_SetString(PyExc_RuntimeError, "Indexer not initialized"); return NULL; @@ -215,8 +237,8 @@ 
static PyObject *Indexer_exists(IndexerObject *self, return PyBool_FromLong(result); } -static PyObject *Indexer_get_max_bytes(IndexerObject *self, - PyObject *Py_UNUSED(ignored)) { +static PyObject *CheckpointIndexer_get_max_bytes(CheckpointIndexerObject *self, + PyObject *Py_UNUSED(ignored)) { if (!self->handle) { PyErr_SetString(PyExc_RuntimeError, "Indexer not initialized"); return NULL; @@ -226,8 +248,8 @@ static PyObject *Indexer_get_max_bytes(IndexerObject *self, return PyLong_FromUnsignedLongLong(result); } -static PyObject *Indexer_get_num_lines(IndexerObject *self, - PyObject *Py_UNUSED(ignored)) { +static PyObject *CheckpointIndexer_get_num_lines(CheckpointIndexerObject *self, + PyObject *Py_UNUSED(ignored)) { if (!self->handle) { PyErr_SetString(PyExc_RuntimeError, "Indexer not initialized"); return NULL; @@ -237,7 +259,8 @@ static PyObject *Indexer_get_num_lines(IndexerObject *self, return PyLong_FromUnsignedLongLong(result); } -static PyObject *Indexer_find_checkpoint(IndexerObject *self, PyObject *args) { +static PyObject *CheckpointIndexer_find_checkpoint( + CheckpointIndexerObject *self, PyObject *args) { if (!self->handle) { PyErr_SetString(PyExc_RuntimeError, "Indexer not initialized"); return NULL; @@ -268,8 +291,8 @@ static PyObject *Indexer_find_checkpoint(IndexerObject *self, PyObject *args) { return (PyObject *)cp_obj; } -static PyObject *Indexer_get_checkpoints(IndexerObject *self, - PyObject *Py_UNUSED(ignored)) { +static PyObject *CheckpointIndexer_get_checkpoints( + CheckpointIndexerObject *self, PyObject *Py_UNUSED(ignored)) { if (!self->handle) { PyErr_SetString(PyExc_RuntimeError, "Indexer not initialized"); return NULL; @@ -309,7 +332,8 @@ static PyObject *Indexer_get_checkpoints(IndexerObject *self, return list; } -static PyObject *Indexer_has_bloom(IndexerObject *self, void *closure) { +static PyObject *CheckpointIndexer_has_bloom(CheckpointIndexerObject *self, + void *closure) { const char *idx = 
PyUnicode_AsUTF8(self->index_path); const char *gz = PyUnicode_AsUTF8(self->gz_path); if (!idx || !gz) { @@ -330,7 +354,8 @@ static PyObject *Indexer_has_bloom(IndexerObject *self, void *closure) { Py_RETURN_FALSE; } -static PyObject *Indexer_has_manifest(IndexerObject *self, void *closure) { +static PyObject *CheckpointIndexer_has_manifest(CheckpointIndexerObject *self, + void *closure) { const char *idx = PyUnicode_AsUTF8(self->index_path); const char *gz = PyUnicode_AsUTF8(self->gz_path); if (!idx || !gz) { @@ -351,66 +376,71 @@ static PyObject *Indexer_has_manifest(IndexerObject *self, void *closure) { Py_RETURN_FALSE; } -static PyObject *Indexer_gz_path(IndexerObject *self, void *closure) { +static PyObject *CheckpointIndexer_gz_path(CheckpointIndexerObject *self, + void *closure) { Py_INCREF(self->gz_path); return self->gz_path; } -static PyObject *Indexer_index_path(IndexerObject *self, void *closure) { +static PyObject *CheckpointIndexer_index_path(CheckpointIndexerObject *self, + void *closure) { Py_INCREF(self->index_path); return self->index_path; } -static PyObject *Indexer_checkpoint_size(IndexerObject *self, void *closure) { +static PyObject *CheckpointIndexer_checkpoint_size( + CheckpointIndexerObject *self, void *closure) { return PyLong_FromUnsignedLongLong(self->checkpoint_size); } -static PyObject *Indexer_enter(IndexerObject *self, - PyObject *Py_UNUSED(ignored)) { +static PyObject *CheckpointIndexer_enter(CheckpointIndexerObject *self, + PyObject *Py_UNUSED(ignored)) { Py_INCREF(self); return (PyObject *)self; } -static PyObject *Indexer_close(IndexerObject *self, - PyObject *Py_UNUSED(ignored)) { - Indexer_release_handle(self); +static PyObject *CheckpointIndexer_close(CheckpointIndexerObject *self, + PyObject *Py_UNUSED(ignored)) { + CheckpointIndexer_release_handle(self); Py_RETURN_NONE; } -static PyObject *Indexer_exit(IndexerObject *self, PyObject *args) { - Indexer_release_handle(self); +static PyObject 
*CheckpointIndexer_exit(CheckpointIndexerObject *self, + PyObject *args) { + CheckpointIndexer_release_handle(self); Py_RETURN_NONE; } -static PyMethodDef Indexer_methods[] = { - {"build", (PyCFunction)Indexer_build, METH_NOARGS, +static PyMethodDef CheckpointIndexer_methods[] = { + {"build", (PyCFunction)CheckpointIndexer_build, METH_NOARGS, "build()\n" "--\n" "\n" "Build or rebuild the index.\n"}, - {"need_rebuild", (PyCFunction)Indexer_need_rebuild, METH_NOARGS, + {"need_rebuild", (PyCFunction)CheckpointIndexer_need_rebuild, METH_NOARGS, "Check if a rebuild is needed."}, - {"exists", (PyCFunction)Indexer_exists, METH_NOARGS, + {"exists", (PyCFunction)CheckpointIndexer_exists, METH_NOARGS, "Check if the .dftindex store exists."}, - {"get_max_bytes", (PyCFunction)Indexer_get_max_bytes, METH_NOARGS, + {"get_max_bytes", (PyCFunction)CheckpointIndexer_get_max_bytes, METH_NOARGS, "Get the maximum uncompressed bytes in the indexed file."}, - {"get_num_lines", (PyCFunction)Indexer_get_num_lines, METH_NOARGS, + {"get_num_lines", (PyCFunction)CheckpointIndexer_get_num_lines, METH_NOARGS, "Get the total number of lines in the indexed file."}, - {"find_checkpoint", (PyCFunction)Indexer_find_checkpoint, METH_VARARGS, + {"find_checkpoint", (PyCFunction)CheckpointIndexer_find_checkpoint, + METH_VARARGS, "Find the best checkpoint for a given uncompressed offset.\n" "\n" "Args:\n" " offset (int): Uncompressed byte offset.\n"}, - {"get_checkpoints", (PyCFunction)Indexer_get_checkpoints, METH_NOARGS, - "Get all checkpoints for this file as a list."}, - {"close", (PyCFunction)Indexer_close, METH_NOARGS, + {"get_checkpoints", (PyCFunction)CheckpointIndexer_get_checkpoints, + METH_NOARGS, "Get all checkpoints for this file as a list."}, + {"close", (PyCFunction)CheckpointIndexer_close, METH_NOARGS, "Release this Python wrapper's native indexer handle.\n" "\n" "The shared RocksDB instance for the same .dftindex path remains managed\n" "by the native RocksDBManager cache."}, - 
{"__enter__", (PyCFunction)Indexer_enter, METH_NOARGS, + {"__enter__", (PyCFunction)CheckpointIndexer_enter, METH_NOARGS, "Enter the runtime context for the with statement."}, - {"__exit__", (PyCFunction)Indexer_exit, METH_VARARGS, + {"__exit__", (PyCFunction)CheckpointIndexer_exit, METH_VARARGS, "Release this Python wrapper on context exit.\n" "\n" "This does not force-close the shared RocksDB instance for the same\n" @@ -418,48 +448,48 @@ static PyMethodDef Indexer_methods[] = { {NULL} /* Sentinel */ }; -static PyGetSetDef Indexer_getsetters[] = { - {"gz_path", (getter)Indexer_gz_path, NULL, "Path to the gzip file", NULL}, - {"index_path", (getter)Indexer_index_path, NULL, +static PyGetSetDef CheckpointIndexer_getsetters[] = { + {"gz_path", (getter)CheckpointIndexer_gz_path, NULL, + "Path to the gzip file", NULL}, + {"index_path", (getter)CheckpointIndexer_index_path, NULL, "Path to the .dftindex store", NULL}, - {"checkpoint_size", (getter)Indexer_checkpoint_size, NULL, + {"checkpoint_size", (getter)CheckpointIndexer_checkpoint_size, NULL, "Checkpoint size in bytes", NULL}, - {"has_bloom", (getter)Indexer_has_bloom, NULL, + {"has_bloom", (getter)CheckpointIndexer_has_bloom, NULL, "Whether bloom data exists in index", NULL}, - {"has_manifest", (getter)Indexer_has_manifest, NULL, + {"has_manifest", (getter)CheckpointIndexer_has_manifest, NULL, "Whether manifest data exists in index", NULL}, {NULL} /* Sentinel */ }; -PyTypeObject IndexerType = { - PyVarObject_HEAD_INIT(NULL, 0) "indexer.Indexer", /* tp_name */ - sizeof(IndexerObject), /* tp_basicsize */ - 0, /* tp_itemsize */ - (destructor)Indexer_dealloc, /* tp_dealloc */ - 0, /* tp_vectorcall_offset */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_as_async */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT | 
Py_TPFLAGS_BASETYPE, /* tp_flags */ - "Indexer(gz_path: str, index_path: str | None = None,\n" - " checkpoint_size: int = 1048576,\n" - " force_rebuild: bool = False, build_bloom: bool = False,\n" - " build_manifest: bool = False,\n" - " index_threshold: int = 1048576,\n" - " runtime: Runtime | None = None)\n" +PyTypeObject CheckpointIndexerType = { + PyVarObject_HEAD_INIT( + NULL, 0) "dftracer_utils_ext.CheckpointIndexer", /* tp_name */ + sizeof(CheckpointIndexerObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)CheckpointIndexer_dealloc, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_as_async */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "CheckpointIndexer(gz_path, index_path=None, checkpoint_size=1048576, " + "force_rebuild=False, build_bloom=False, build_manifest=False, " + "runtime=None)\n" "--\n" "\n" - "Indexer for creating and managing gzip trace index stores.\n" + "Checkpoint indexer for single-file checkpoint-level operations on a " + "gzip trace.\n" "\n" "Args:\n" " gz_path (str): Path to the gzip trace file.\n" @@ -470,38 +500,37 @@ PyTypeObject IndexerType = { " force_rebuild (bool): If True, rebuild the index even if it\n" " exists.\n" " build_bloom (bool): If True, build bloom filter data in the\n" - " store.\n" + " index.\n" " build_manifest (bool): If True, build manifest data in the\n" " store.\n" - " index_threshold (int): Skip indexing for files smaller than\n" - " this (default 1 MB).\n" " runtime (Runtime or None): Runtime instance for thread pool\n" " control. 
If None, uses the default global Runtime.\n", /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ - Indexer_methods, /* tp_methods */ - 0, /* tp_members */ - Indexer_getsetters, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - (initproc)Indexer_init, /* tp_init */ - 0, /* tp_alloc */ - Indexer_new, /* tp_new */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + CheckpointIndexer_methods, /* tp_methods */ + 0, /* tp_members */ + CheckpointIndexer_getsetters, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)CheckpointIndexer_init, /* tp_init */ + 0, /* tp_alloc */ + CheckpointIndexer_new, /* tp_new */ }; -int init_indexer(PyObject *m) { - if (PyType_Ready(&IndexerType) < 0) return -1; +int init_checkpoint_indexer(PyObject *m) { + if (PyType_Ready(&CheckpointIndexerType) < 0) return -1; - Py_INCREF(&IndexerType); - if (PyModule_AddObject(m, "Indexer", (PyObject *)&IndexerType) < 0) { - Py_DECREF(&IndexerType); + Py_INCREF(&CheckpointIndexerType); + if (PyModule_AddObject(m, "CheckpointIndexer", + (PyObject *)&CheckpointIndexerType) < 0) { + Py_DECREF(&CheckpointIndexerType); Py_DECREF(m); return -1; } diff --git a/src/dftracer/utils/python/indexer.h b/src/dftracer/utils/python/indexer.h index d31d0ccf..a2dfd203 100644 --- a/src/dftracer/utils/python/indexer.h +++ b/src/dftracer/utils/python/indexer.h @@ -13,12 +13,11 @@ typedef struct { std::uint64_t checkpoint_size; int build_bloom; int build_manifest; - std::uint64_t index_threshold; PyObject *runtime_obj; // RuntimeObject* or NULL (uses default) -} IndexerObject; +} CheckpointIndexerObject; -extern PyTypeObject IndexerType; +extern PyTypeObject 
CheckpointIndexerType; -int init_indexer(PyObject *m); +int init_checkpoint_indexer(PyObject *m); #endif diff --git a/src/dftracer/utils/python/json.cpp b/src/dftracer/utils/python/json.cpp index 3c827b8a..3f31c602 100644 --- a/src/dftracer/utils/python/json.cpp +++ b/src/dftracer/utils/python/json.cpp @@ -1,727 +1,364 @@ +#define PY_SSIZE_T_CLEAN +#include #include -#include -#include - -static void JSON_dealloc(JSONObject* self) { - if (self->doc && self->owns_doc) { - yyjson_doc_free(self->doc); - } - Py_TYPE(self)->tp_free((PyObject*)self); +using dftracer::utils::utilities::composites::dft::ArgsValueProxy; + +PyObject *args_value_to_pyobject(const ArgsValue &v) { + return std::visit( + [](const auto &val) -> PyObject * { + using T = std::decay_t; + if constexpr (std::is_same_v) { + Py_RETURN_NONE; + } else if constexpr (std::is_same_v) { + return PyUnicode_FromStringAndSize(val.data(), val.size()); + } else if constexpr (std::is_same_v) { + return PyLong_FromUnsignedLongLong(val); + } else if constexpr (std::is_same_v) { + return PyLong_FromLongLong(val); + } else if constexpr (std::is_same_v) { + return PyFloat_FromDouble(val); + } else if constexpr (std::is_same_v) { + return PyBool_FromLong(val ? 1 : 0); + } else { + Py_RETURN_NONE; + } + }, + v); } -static PyObject* JSON_new(PyTypeObject* type, PyObject* args, PyObject* kwds) { - JSONObject* self; - self = (JSONObject*)type->tp_alloc(type, 0); - if (self != NULL) { - self->doc = nullptr; - self->root = nullptr; - self->parsed = false; - self->json_length = 0; - self->owns_doc = false; - } - return (PyObject*)self; +static const ArgsMap &get_map(JsonDictValueObject *self) { + auto &ev = self->batch->events[self->event_index]; + return self->is_args ? 
ev.args : ev.top; } -static int JSON_init(JSONObject* self, PyObject* args, PyObject* kwds) { - const char* json_str; - if (!PyArg_ParseTuple(args, "s", &json_str)) { - return -1; - } - - self->json_length = strlen(json_str); - if (self->json_length > 0) { - std::memcpy(self->json_data, json_str, self->json_length); - } - self->doc = nullptr; - self->root = nullptr; - self->parsed = false; - self->owns_doc = true; - return 0; +static void JsonDictValue_dealloc(JsonDictValueObject *self) { + self->batch.reset(); + Py_TYPE(self)->tp_free((PyObject *)self); } -static bool JSON_ensure_parsed(JSONObject* self) { - if (self->root != nullptr) { - return true; - } - - if (self->parsed && self->doc != nullptr) { - return true; - } - - if (!self->parsed && self->json_length > 0) { - // Use YYJSON_READ_INSITU for large-scale processing - // (zero-copy, in-place modification) - yyjson_read_err err; - self->doc = yyjson_read_opts(self->json_data, self->json_length, - YYJSON_READ_INSITU, NULL, &err); - if (!self->doc) { - char err_msg[256]; - std::snprintf(err_msg, sizeof(err_msg), - "Failed to parse JSON at position %zu: %s (code %u, " - "string: %.*s)", - err.pos, err.msg, err.code, (int)self->json_length, - self->json_data); - PyErr_SetString(PyExc_ValueError, err_msg); - return false; - } - self->parsed = true; - return true; +static Py_ssize_t JsonDictValue_length(JsonDictValueObject *self) { + const auto &map = get_map(self); + Py_ssize_t count = 0; + map.for_each_member([&](std::string_view, ArgsValueProxy) { ++count; }); + if (!self->is_args && get_map(self).exists()) { + auto &ev = self->batch->events[self->event_index]; + if (ev.args.exists()) ++count; } - - // If we get here, there's no data to parse - return false; + return count; } -// Get the root yyjson_val for this JSON object (handles both top-level and -// subtrees) -static yyjson_val* JSON_get_root(JSONObject* self) { - if (self->root != nullptr) { - // This is a subtree wrapper - return the wrapped value 
directly - return self->root; - } - // This is a top-level document - get the root from the doc - return yyjson_doc_get_root(self->doc); -} +static PyObject *JsonDictValue_subscript(JsonDictValueObject *self, + PyObject *key) { + const char *key_str = PyUnicode_AsUTF8(key); + if (!key_str) return NULL; -static PyObject* JSON_contains(JSONObject* self, PyObject* key) { - if (!JSON_ensure_parsed(self)) { - return NULL; - } + std::string_view k(key_str); - if (!PyUnicode_Check(key)) { - PyErr_SetString(PyExc_TypeError, "Key must be a string"); + if (!self->is_args && k == "args") { + auto &ev = self->batch->events[self->event_index]; + if (!ev.args.exists()) { + Py_RETURN_NONE; + } + JsonDictValueObject *obj = + (JsonDictValueObject *)JsonDictValueType.tp_alloc( + &JsonDictValueType, 0); + if (!obj) return NULL; + new (&obj->batch) std::shared_ptr(self->batch); + obj->event_index = self->event_index; + obj->is_args = true; + return (PyObject *)obj; + } + + const auto &map = get_map(self); + auto proxy = map[k]; + if (!proxy.exists()) { + PyErr_SetObject(PyExc_KeyError, key); return NULL; } - const char* key_str = PyUnicode_AsUTF8(key); - if (!key_str) { + const auto &raw = map.raw(); + auto it = raw.find(k); + if (it == raw.end()) { + PyErr_SetObject(PyExc_KeyError, key); return NULL; } - - yyjson_val* root = JSON_get_root(self); - if (!yyjson_is_obj(root)) { - Py_RETURN_FALSE; - } - - yyjson_val* val = yyjson_obj_get(root, key_str); - if (val) { - Py_RETURN_TRUE; - } else { - Py_RETURN_FALSE; - } + return args_value_to_pyobject(it->second); } -static int JSON_contains_sq(PyObject* self_obj, PyObject* key) { - JSONObject* self = (JSONObject*)self_obj; - PyObject* result = JSON_contains(self, key); - if (!result) { - return -1; - } - - int is_true = PyObject_IsTrue(result); - Py_DECREF(result); - return is_true; -} - -static PyObject* yyjson_val_to_python(yyjson_val* val) { - if (yyjson_is_null(val)) { - Py_RETURN_NONE; - } else if (yyjson_is_bool(val)) { - if 
(yyjson_get_bool(val)) { - Py_RETURN_TRUE; - } else { - Py_RETURN_FALSE; +static PyObject *JsonDictValue_keys(JsonDictValueObject *self, + PyObject *Py_UNUSED(ignored)) { + PyObject *list = PyList_New(0); + if (!list) return NULL; + + const auto &map = get_map(self); + map.for_each_member([&](std::string_view k, ArgsValueProxy) { + PyObject *key = PyUnicode_FromStringAndSize(k.data(), k.size()); + if (key) { + PyList_Append(list, key); + Py_DECREF(key); } - } else if (yyjson_is_uint(val)) { - return PyLong_FromUnsignedLongLong(yyjson_get_uint(val)); - } else if (yyjson_is_int(val)) { - return PyLong_FromLongLong(yyjson_get_int(val)); - } else if (yyjson_is_real(val)) { - return PyFloat_FromDouble(yyjson_get_real(val)); - } else if (yyjson_is_str(val)) { - return PyUnicode_FromString(yyjson_get_str(val)); - } else if (yyjson_is_arr(val)) { - std::size_t idx, max; - yyjson_val* item; - PyObject* list = PyList_New(0); - if (!list) return NULL; - - yyjson_arr_foreach(val, idx, max, item) { - PyObject* py_item = yyjson_val_to_python(item); - if (!py_item) { - Py_DECREF(list); - return NULL; + }); + + if (!self->is_args) { + auto &ev = self->batch->events[self->event_index]; + if (ev.args.exists()) { + PyObject *args_key = PyUnicode_InternFromString("args"); + if (args_key) { + PyList_Append(list, args_key); + Py_DECREF(args_key); } - if (PyList_Append(list, py_item) < 0) { - Py_DECREF(py_item); - Py_DECREF(list); - return NULL; - } - Py_DECREF(py_item); } - return list; - } else if (yyjson_is_obj(val)) { - std::size_t idx, max; - yyjson_val *key_val, *val_val; - PyObject* dict = PyDict_New(); - if (!dict) return NULL; - - yyjson_obj_foreach(val, idx, max, key_val, val_val) { - const char* key_str = yyjson_get_str(key_val); - PyObject* py_key = PyUnicode_FromString(key_str); - PyObject* py_val = yyjson_val_to_python(val_val); - - if (!py_key || !py_val) { - Py_XDECREF(py_key); - Py_XDECREF(py_val); - Py_DECREF(dict); - return NULL; - } - - if (PyDict_SetItem(dict, 
py_key, py_val) < 0) { - Py_DECREF(py_key); - Py_DECREF(py_val); - Py_DECREF(dict); - return NULL; - } - - Py_DECREF(py_key); - Py_DECREF(py_val); - } - return dict; - } - - Py_RETURN_NONE; -} - -static PyObject* JSON_getitem(JSONObject* self, PyObject* key) { - if (!JSON_ensure_parsed(self)) { - return NULL; - } - - if (!PyUnicode_Check(key)) { - PyErr_SetString(PyExc_TypeError, "Key must be a string"); - return NULL; - } - - const char* key_str = PyUnicode_AsUTF8(key); - if (!key_str) { - return NULL; - } - - yyjson_val* root = JSON_get_root(self); - if (!yyjson_is_obj(root)) { - PyErr_SetString(PyExc_TypeError, "JSON root is not an object"); - return NULL; - } - - yyjson_val* val = yyjson_obj_get(root, key_str); - if (!val) { - PyErr_SetString(PyExc_KeyError, key_str); - return NULL; - } - - // If the value is an object or array, return a lazy wrapper - if (yyjson_is_obj(val) || yyjson_is_arr(val)) { - return JSON_from_yyjson_val(self->doc, val); } - return yyjson_val_to_python(val); + return list; } -static PyObject* JSON_keys(JSONObject* self, PyObject* Py_UNUSED(ignored)) { - if (!JSON_ensure_parsed(self)) { - return NULL; - } - - yyjson_val* root = JSON_get_root(self); - if (!yyjson_is_obj(root)) { - return PyList_New(0); +static PyObject *JsonDictValue_values(JsonDictValueObject *self, + PyObject *Py_UNUSED(ignored)) { + PyObject *list = PyList_New(0); + if (!list) return NULL; + + const auto &map = get_map(self); + for (const auto &[k, v] : map.raw()) { + PyObject *val = args_value_to_pyobject(v); + if (val) { + PyList_Append(list, val); + Py_DECREF(val); + } } - PyObject* keys = PyList_New(0); - if (!keys) return NULL; - - std::size_t idx, max; - yyjson_val *key_val, *val_val; - yyjson_obj_foreach(root, idx, max, key_val, val_val) { - const char* key_str = yyjson_get_str(key_val); - PyObject* py_key = PyUnicode_FromString(key_str); - if (!py_key) { - Py_DECREF(keys); - return NULL; - } - if (PyList_Append(keys, py_key) < 0) { - Py_DECREF(py_key); - 
Py_DECREF(keys); - return NULL; + if (!self->is_args) { + auto &ev = self->batch->events[self->event_index]; + if (ev.args.exists()) { + JsonDictValueObject *args_obj = + (JsonDictValueObject *)JsonDictValueType.tp_alloc( + &JsonDictValueType, 0); + if (args_obj) { + new (&args_obj->batch) + std::shared_ptr(self->batch); + args_obj->event_index = self->event_index; + args_obj->is_args = true; + PyList_Append(list, (PyObject *)args_obj); + Py_DECREF(args_obj); + } } - Py_DECREF(py_key); } - return keys; + return list; } -static PyObject* JSON_values(JSONObject* self, PyObject* Py_UNUSED(ignored)) { - if (!JSON_ensure_parsed(self)) { - return NULL; - } - - yyjson_val* root = JSON_get_root(self); - if (!yyjson_is_obj(root)) { - return PyList_New(0); - } - - PyObject* values = PyList_New(0); - if (!values) return NULL; - - std::size_t idx, max; - yyjson_val *key_val, *val_val; - yyjson_obj_foreach(root, idx, max, key_val, val_val) { - PyObject* py_val; - // If the value is an object or array, return a lazy wrapper - if (yyjson_is_obj(val_val) || yyjson_is_arr(val_val)) { - py_val = JSON_from_yyjson_val(self->doc, val_val); - } else { - py_val = yyjson_val_to_python(val_val); - } - - if (!py_val) { - Py_DECREF(values); - return NULL; +static PyObject *JsonDictValue_items(JsonDictValueObject *self, + PyObject *Py_UNUSED(ignored)) { + PyObject *list = PyList_New(0); + if (!list) return NULL; + + const auto &map = get_map(self); + for (const auto &[k, v] : map.raw()) { + PyObject *key = PyUnicode_FromStringAndSize(k.data(), k.size()); + PyObject *val = args_value_to_pyobject(v); + if (key && val) { + PyObject *tuple = PyTuple_Pack(2, key, val); + if (tuple) { + PyList_Append(list, tuple); + Py_DECREF(tuple); + } } - - if (PyList_Append(values, py_val) < 0) { - Py_DECREF(py_val); - Py_DECREF(values); - return NULL; + Py_XDECREF(key); + Py_XDECREF(val); + } + + if (!self->is_args) { + auto &ev = self->batch->events[self->event_index]; + if (ev.args.exists()) { + PyObject 
*args_key = PyUnicode_InternFromString("args"); + JsonDictValueObject *args_obj = + (JsonDictValueObject *)JsonDictValueType.tp_alloc( + &JsonDictValueType, 0); + if (args_key && args_obj) { + new (&args_obj->batch) + std::shared_ptr(self->batch); + args_obj->event_index = self->event_index; + args_obj->is_args = true; + PyObject *tuple = + PyTuple_Pack(2, args_key, (PyObject *)args_obj); + if (tuple) { + PyList_Append(list, tuple); + Py_DECREF(tuple); + } + } + Py_XDECREF(args_key); + Py_XDECREF((PyObject *)args_obj); } - Py_DECREF(py_val); } - return values; + return list; } -static PyObject* JSON_items(JSONObject* self, PyObject* Py_UNUSED(ignored)) { - if (!JSON_ensure_parsed(self)) { - return NULL; - } - - yyjson_val* root = JSON_get_root(self); - if (!yyjson_is_obj(root)) { - return PyList_New(0); - } - - PyObject* items = PyList_New(0); - if (!items) return NULL; - - std::size_t idx, max; - yyjson_val *key_val, *val_val; - yyjson_obj_foreach(root, idx, max, key_val, val_val) { - const char* key_str = yyjson_get_str(key_val); - PyObject* py_key = PyUnicode_FromString(key_str); - if (!py_key) { - Py_DECREF(items); - return NULL; - } - - PyObject* py_val; - // If the value is an object or array, return a lazy wrapper - if (yyjson_is_obj(val_val) || yyjson_is_arr(val_val)) { - py_val = JSON_from_yyjson_val(self->doc, val_val); - } else { - py_val = yyjson_val_to_python(val_val); - } - - if (!py_val) { - Py_DECREF(py_key); - Py_DECREF(items); - return NULL; - } +static PyObject *JsonDictValue_get(JsonDictValueObject *self, PyObject *args) { + PyObject *key; + PyObject *default_val = Py_None; + if (!PyArg_ParseTuple(args, "O|O", &key, &default_val)) return NULL; - PyObject* tuple = PyTuple_Pack(2, py_key, py_val); - Py_DECREF(py_key); - Py_DECREF(py_val); + const char *key_str = PyUnicode_AsUTF8(key); + if (!key_str) return NULL; - if (!tuple) { - Py_DECREF(items); - return NULL; - } + std::string_view k(key_str); - if (PyList_Append(items, tuple) < 0) { - 
Py_DECREF(tuple); - Py_DECREF(items); - return NULL; + if (!self->is_args && k == "args") { + auto &ev = self->batch->events[self->event_index]; + if (!ev.args.exists()) { + Py_INCREF(default_val); + return default_val; } - Py_DECREF(tuple); - } - - return items; + JsonDictValueObject *obj = + (JsonDictValueObject *)JsonDictValueType.tp_alloc( + &JsonDictValueType, 0); + if (!obj) return NULL; + new (&obj->batch) std::shared_ptr(self->batch); + obj->event_index = self->event_index; + obj->is_args = true; + return (PyObject *)obj; + } + + const auto &map = get_map(self); + auto it = map.raw().find(k); + if (it == map.raw().end()) { + Py_INCREF(default_val); + return default_val; + } + return args_value_to_pyobject(it->second); } -static PyObject* JSON_get(JSONObject* self, PyObject* args) { - PyObject* key; - PyObject* default_value = Py_None; - - if (!PyArg_ParseTuple(args, "O|O", &key, &default_value)) { - return NULL; - } - - if (!JSON_ensure_parsed(self)) { - return NULL; - } - - if (!PyUnicode_Check(key)) { - PyErr_SetString(PyExc_TypeError, "Key must be a string"); - return NULL; - } - - const char* key_str = PyUnicode_AsUTF8(key); - if (!key_str) { - return NULL; - } - - yyjson_val* root = JSON_get_root(self); - if (!yyjson_is_obj(root)) { - Py_INCREF(default_value); - return default_value; - } +static int JsonDictValue_contains(JsonDictValueObject *self, PyObject *key) { + const char *key_str = PyUnicode_AsUTF8(key); + if (!key_str) return -1; - yyjson_val* val = yyjson_obj_get(root, key_str); - if (!val) { - Py_INCREF(default_value); - return default_value; - } + std::string_view k(key_str); - // If the value is an object or array, return a lazy wrapper - if (yyjson_is_obj(val) || yyjson_is_arr(val)) { - return JSON_from_yyjson_val(self->doc, val); + if (!self->is_args && k == "args") { + auto &ev = self->batch->events[self->event_index]; + return ev.args.exists() ? 
1 : 0; } - return yyjson_val_to_python(val); + const auto &map = get_map(self); + return map[k].exists() ? 1 : 0; } -// Helper function to recursively convert yyjson_val to Python dict/list -static PyObject* yyjson_val_to_python_deep(yyjson_val* val) { - if (yyjson_is_null(val)) { - Py_RETURN_NONE; - } else if (yyjson_is_bool(val)) { - if (yyjson_get_bool(val)) { - Py_RETURN_TRUE; - } else { - Py_RETURN_FALSE; - } - } else if (yyjson_is_uint(val)) { - return PyLong_FromUnsignedLongLong(yyjson_get_uint(val)); - } else if (yyjson_is_int(val)) { - return PyLong_FromLongLong(yyjson_get_int(val)); - } else if (yyjson_is_real(val)) { - return PyFloat_FromDouble(yyjson_get_real(val)); - } else if (yyjson_is_str(val)) { - return PyUnicode_FromString(yyjson_get_str(val)); - } else if (yyjson_is_arr(val)) { - std::size_t idx, max; - yyjson_val* item; - PyObject* list = PyList_New(0); - if (!list) return NULL; - - yyjson_arr_foreach(val, idx, max, item) { - PyObject* py_item = yyjson_val_to_python_deep(item); - if (!py_item) { - Py_DECREF(list); - return NULL; - } - if (PyList_Append(list, py_item) < 0) { - Py_DECREF(py_item); - Py_DECREF(list); - return NULL; - } - Py_DECREF(py_item); +static PyObject *JsonDictValue_to_dict(JsonDictValueObject *self, + PyObject *Py_UNUSED(ignored)) { + PyObject *dict = PyDict_New(); + if (!dict) return NULL; + + const auto &map = get_map(self); + for (const auto &[k, v] : map.raw()) { + PyObject *key = PyUnicode_FromStringAndSize(k.data(), k.size()); + PyObject *val = args_value_to_pyobject(v); + if (!key || !val) { + Py_XDECREF(key); + Py_XDECREF(val); + Py_DECREF(dict); + return NULL; } - return list; - } else if (yyjson_is_obj(val)) { - std::size_t idx, max; - yyjson_val *key_val, *val_val; - PyObject* dict = PyDict_New(); - if (!dict) return NULL; - - yyjson_obj_foreach(val, idx, max, key_val, val_val) { - const char* key_str = yyjson_get_str(key_val); - PyObject* py_key = PyUnicode_FromString(key_str); - PyObject* py_val = 
yyjson_val_to_python_deep(val_val); - - if (!py_key || !py_val) { - Py_XDECREF(py_key); - Py_XDECREF(py_val); - Py_DECREF(dict); - return NULL; - } + PyDict_SetItem(dict, key, val); + Py_DECREF(key); + Py_DECREF(val); + } - if (PyDict_SetItem(dict, py_key, py_val) < 0) { - Py_DECREF(py_key); - Py_DECREF(py_val); + if (!self->is_args) { + auto &ev = self->batch->events[self->event_index]; + if (ev.args.exists()) { + PyObject *args_dict = PyDict_New(); + if (!args_dict) { Py_DECREF(dict); return NULL; } - - Py_DECREF(py_key); - Py_DECREF(py_val); - } - return dict; - } - - Py_RETURN_NONE; -} - -static PyObject* JSON_unwrap(JSONObject* self, PyObject* Py_UNUSED(ignored)) { - if (!JSON_ensure_parsed(self)) { - return NULL; - } - - yyjson_val* root = JSON_get_root(self); - return yyjson_val_to_python_deep(root); -} - -static PyObject* JSON_copy(JSONObject* self, PyObject* Py_UNUSED(ignored)) { - if (!JSON_ensure_parsed(self)) { - return NULL; - } - - // If this is a subtree wrapper, create a new wrapper pointing to the same - // subtree - if (self->root != nullptr) { - return JSON_from_yyjson_val(self->doc, self->root); - } - - // For top-level documents, we need to serialize and re-parse since - // the original json_data was modified in-place by YYJSON_READ_INSITU - yyjson_val* root = JSON_get_root(self); - if (root) { - char* json_str = yyjson_val_write(root, 0, NULL); - if (!json_str) { - PyErr_SetString(PyExc_RuntimeError, - "Failed to serialize JSON for copy"); - return NULL; - } - - size_t len = strlen(json_str); - PyObject* result = JSON_from_data(json_str, len); - free(json_str); - return result; - } - - // Empty object - return JSON_from_data("{}", 2); -} - -static PyObject* JSON_iter(JSONObject* self) { - if (!JSON_ensure_parsed(self)) { - return NULL; - } - - yyjson_val* root = yyjson_doc_get_root(self->doc); - if (!yyjson_is_obj(root)) { - return PyObject_GetIter(PyList_New(0)); - } - - return PyObject_GetIter(JSON_keys(self, NULL)); -} - -static PyObject* 
JSON_str(JSONObject* self) { - if (self->root != nullptr) { - char* json_str = yyjson_val_write(self->root, 0, NULL); - if (!json_str) { - return PyUnicode_FromString("{}"); + for (const auto &[k, v] : ev.args.raw()) { + PyObject *key = PyUnicode_FromStringAndSize(k.data(), k.size()); + PyObject *val = args_value_to_pyobject(v); + if (!key || !val) { + Py_XDECREF(key); + Py_XDECREF(val); + Py_DECREF(args_dict); + Py_DECREF(dict); + return NULL; + } + PyDict_SetItem(args_dict, key, val); + Py_DECREF(key); + Py_DECREF(val); + } + PyDict_SetItemString(dict, "args", args_dict); + Py_DECREF(args_dict); } - PyObject* result = PyUnicode_FromString(json_str); - free(json_str); - return result; - } - if (self->json_length > 0) { - return PyUnicode_FromStringAndSize(self->json_data, self->json_length); - } - return PyUnicode_FromString("{}"); -} - -static PyObject* JSON_repr(JSONObject* self) { - PyObject* str_obj = JSON_str(self); - if (!str_obj) return NULL; - PyObject* result = PyUnicode_FromFormat("JSON(%U)", str_obj); - Py_DECREF(str_obj); - return result; -} - -static Py_ssize_t JSON_length(JSONObject* self) { - if (!JSON_ensure_parsed(self)) { - return -1; } - yyjson_val* root = JSON_get_root(self); - if (!yyjson_is_obj(root)) { - return 0; - } - - return (Py_ssize_t)yyjson_obj_size(root); + return dict; } -static int JSON_bool(JSONObject* self) { - if (!JSON_ensure_parsed(self)) { - return -1; - } - - yyjson_val* root = JSON_get_root(self); - if (!yyjson_is_obj(root)) { - return 0; // Non-objects are falsy - } - - // Return true if object has at least one key - return yyjson_obj_size(root) > 0 ? 
1 : 0; -} - -PyMethodDef JSON_methods[] = {{"__contains__", (PyCFunction)JSON_contains, - METH_O, "Check if key exists in JSON object"}, - {"keys", (PyCFunction)JSON_keys, METH_NOARGS, - "Get all keys from JSON object"}, - {"values", (PyCFunction)JSON_values, METH_NOARGS, - "Get all values from JSON object"}, - {"items", (PyCFunction)JSON_items, METH_NOARGS, - "Get all key-value pairs from JSON object"}, - {"get", (PyCFunction)JSON_get, METH_VARARGS, - "Get value by key with optional default"}, - {"unwrap", (PyCFunction)JSON_unwrap, METH_NOARGS, - "Unwrap lazy JSON to native Python dict/list"}, - {"copy", (PyCFunction)JSON_copy, METH_NOARGS, - "Return a shallow copy of the JSON object"}, - {NULL}}; - -// gcc11_bandaid: Use positional initializers instead of designated -PySequenceMethods JSON_as_sequence = { - NULL, /* sq_length */ - NULL, /* sq_concat */ - NULL, /* sq_repeat */ - NULL, /* sq_item */ - NULL, /* was_sq_slice */ - NULL, /* sq_ass_item */ - NULL, /* was_sq_ass_slice */ - JSON_contains_sq /* sq_contains */ +static PyMappingMethods JsonDictValue_as_mapping = { + (lenfunc)JsonDictValue_length, + (binaryfunc)JsonDictValue_subscript, + NULL, }; -PyMappingMethods JSON_as_mapping = { - (lenfunc)JSON_length, /* mp_length */ - (binaryfunc)JSON_getitem, /* mp_subscript */ - NULL /* mp_ass_subscript */ +static PySequenceMethods JsonDictValue_as_sequence = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, (objobjproc)JsonDictValue_contains, + NULL, NULL, }; -PyNumberMethods JSON_as_number = { - NULL, /* nb_add */ - NULL, /* nb_subtract */ - NULL, /* nb_multiply */ - NULL, /* nb_remainder */ - NULL, /* nb_divmod */ - NULL, /* nb_power */ - NULL, /* nb_negative */ - NULL, /* nb_positive */ - NULL, /* nb_absolute */ - (inquiry)JSON_bool, /* nb_bool */ +static PyMethodDef JsonDictValue_methods[] = { + {"keys", (PyCFunction)JsonDictValue_keys, METH_NOARGS, + "Return list of keys."}, + {"values", (PyCFunction)JsonDictValue_values, METH_NOARGS, + "Return list of values."}, 
+ {"items", (PyCFunction)JsonDictValue_items, METH_NOARGS, + "Return list of (key, value) pairs."}, + {"get", (PyCFunction)JsonDictValue_get, METH_VARARGS, + "Get value by key with optional default."}, + {"to_dict", (PyCFunction)JsonDictValue_to_dict, METH_NOARGS, + "Convert to a regular Python dict."}, + {NULL}}; + +PyTypeObject JsonDictValueType = { + PyVarObject_HEAD_INIT(NULL, 0) "dftracer_utils_ext.JsonDictValue", + sizeof(JsonDictValueObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)JsonDictValue_dealloc, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_as_async */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + &JsonDictValue_as_sequence, /* tp_as_sequence */ + &JsonDictValue_as_mapping, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + "Zero-copy wrapper over a parsed DFTracer JSON event.\n" + "Supports dict-like access: event['name'], event['args']['ret'].\n" + "Call .to_dict() to materialize a regular Python dict.", + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + JsonDictValue_methods, /* tp_methods */ }; -PyTypeObject JSONType = { - PyVarObject_HEAD_INIT(NULL, 0) "json.JSON", /* tp_name */ - sizeof(JSONObject), /* tp_basicsize */ - 0, /* tp_itemsize */ - (destructor)JSON_dealloc, /* tp_dealloc */ - 0, /* tp_vectorcall_offset */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_as_async */ - (reprfunc)JSON_repr, /* tp_repr */ - &JSON_as_number, /* tp_as_number */ - &JSON_as_sequence, /* tp_as_sequence */ - &JSON_as_mapping, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - (reprfunc)JSON_str, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ - "JSON(json_str: 
str)\n" - "--\n" - "\n" - "Lazy JSON object that parses on demand using yyjson.\n" - "\n" - "Args:\n" - " json_str (str): A JSON string to wrap. Parsing is deferred\n" - " until first attribute access.\n", /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - (getiterfunc)JSON_iter, /* tp_iter */ - 0, /* tp_iternext */ - JSON_methods, /* tp_methods */ - 0, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - (initproc)JSON_init, /* tp_init */ - 0, /* tp_alloc */ - JSON_new, /* tp_new */ -}; - -int init_json(PyObject* m) { - if (PyType_Ready(&JSONType) < 0) return -1; - - Py_INCREF(&JSONType); - if (PyModule_AddObject(m, "JSON", (PyObject*)&JSONType) < 0) { - Py_DECREF(&JSONType); - Py_DECREF(m); +int init_json_dict_value(PyObject *m) { + if (PyType_Ready(&JsonDictValueType) < 0) return -1; + Py_INCREF(&JsonDictValueType); + if (PyModule_AddObject(m, "JsonDictValue", (PyObject *)&JsonDictValueType) < + 0) { + Py_DECREF(&JsonDictValueType); return -1; } - return 0; } - -PyObject* JSON_from_data(const char* data, size_t length) { - JSONObject* self = - (JSONObject*)PyObject_MALLOC(sizeof(JSONObject) + length + 1); - if (!self) { - return PyErr_NoMemory(); - } - - PyObject_INIT(self, &JSONType); - - self->doc = nullptr; - self->root = nullptr; - self->parsed = false; - self->json_length = length; - self->owns_doc = true; - - std::memcpy(self->json_data, data, length); - self->json_data[length] = '\0'; - - return (PyObject*)self; -} - -// Create a JSON object wrapping a yyjson_val -// subtree (lazy wrapper for nested objects/arrays) -PyObject* JSON_from_yyjson_val(yyjson_doc* doc, yyjson_val* root) { - JSONObject* self = (JSONObject*)PyObject_MALLOC(sizeof(JSONObject)); - if (!self) { - return PyErr_NoMemory(); - } - - PyObject_INIT(self, &JSONType); - - self->doc = doc; // Share the document (don't copy) - 
self->root = root; // Point to the subtree - self->parsed = true; // Already parsed (just wrapping a subtree) - self->json_length = 0; // No raw JSON data - self->owns_doc = false; // Don't free the doc (it's owned by parent) - - return (PyObject*)self; -} diff --git a/src/dftracer/utils/python/json.h b/src/dftracer/utils/python/json.h index 0806e326..d87e50b1 100644 --- a/src/dftracer/utils/python/json.h +++ b/src/dftracer/utils/python/json.h @@ -2,33 +2,19 @@ #define DFTRACER_UTILS_PYTHON_JSON_H #include -#include +#include -#include -#include #include -#include typedef struct { - PyObject_HEAD mutable yyjson_doc* doc; - yyjson_val* root; - mutable bool parsed; - std::size_t json_length; - bool owns_doc; - char json_data[]; -} JSONObject; + PyObject_HEAD std::shared_ptr batch; + std::size_t event_index; + bool is_args; +} JsonDictValueObject; -extern PyTypeObject JSONType; +extern PyTypeObject JsonDictValueType; +int init_json_dict_value(PyObject *m); -extern PyMethodDef JSON_methods[]; -extern PySequenceMethods JSON_as_sequence; -extern PyMappingMethods JSON_as_mapping; - -int init_json(PyObject* m); - -PyObject* JSON_from_data(const char* data, size_t length); - -// Create a JSON object wrapping a yyjson_val subtree -PyObject* JSON_from_yyjson_val(yyjson_doc* doc, yyjson_val* root); +PyObject *args_value_to_pyobject(const ArgsValue &v); #endif // DFTRACER_UTILS_PYTHON_JSON_H diff --git a/src/dftracer/utils/python/memoryview_batch.cpp b/src/dftracer/utils/python/memoryview_batch.cpp new file mode 100644 index 00000000..460fa058 --- /dev/null +++ b/src/dftracer/utils/python/memoryview_batch.cpp @@ -0,0 +1,114 @@ +#define PY_SSIZE_T_CLEAN +#include + +#include + +namespace dftracer::utils::python { + +static void MemoryViewBatch_dealloc(MemoryViewBatchObject *self) { + delete self->data; + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static int MemoryViewBatch_getbuffer(MemoryViewBatchObject *self, + Py_buffer *view, int flags) { + if (!self->data || 
self->data->buffer.empty()) { + PyErr_SetString(PyExc_BufferError, "MemoryViewBatch has no data"); + return -1; + } + return PyBuffer_FillInfo(view, (PyObject *)self, self->data->buffer.data(), + static_cast(self->data->buffer.size()), + 1, flags); +} + +static Py_ssize_t MemoryViewBatch_length(MemoryViewBatchObject *self) { + if (!self->data) return 0; + return static_cast(self->data->num_entries()); +} + +PyObject *MemoryViewBatch_item(MemoryViewBatchObject *self, Py_ssize_t i) { + if (!self->data) { + PyErr_SetString(PyExc_IndexError, "MemoryViewBatch has no data"); + return NULL; + } + Py_ssize_t n = static_cast(self->data->num_entries()); + if (i < 0 || i >= n) { + PyErr_SetString(PyExc_IndexError, "MemoryViewBatch index out of range"); + return NULL; + } + + Py_buffer buf; + std::memset(&buf, 0, sizeof(buf)); + buf.buf = self->data->buffer.data() + self->data->offsets[i]; + buf.obj = (PyObject *)self; + Py_INCREF(self); + buf.len = self->data->lengths[i]; + buf.itemsize = 1; + buf.readonly = 1; + buf.ndim = 1; + buf.format = const_cast("B"); + buf.shape = &buf.len; + buf.strides = &buf.itemsize; + buf.suboffsets = NULL; + buf.internal = NULL; + return PyMemoryView_FromBuffer(&buf); +} + +static PyBufferProcs MemoryViewBatch_as_buffer = { + (getbufferproc)MemoryViewBatch_getbuffer, + NULL, +}; + +static PySequenceMethods MemoryViewBatch_as_sequence = { + (lenfunc)MemoryViewBatch_length, + NULL, + NULL, + (ssizeargfunc)MemoryViewBatch_item, +}; + +static PyObject *MemoryViewBatch_get_num_entries(MemoryViewBatchObject *self, + void *) { + if (!self->data) return PyLong_FromLong(0); + return PyLong_FromSsize_t( + static_cast(self->data->num_entries())); +} + +static PyObject *MemoryViewBatch_get_num_bytes(MemoryViewBatchObject *self, + void *) { + if (!self->data) return PyLong_FromLong(0); + return PyLong_FromSsize_t( + static_cast(self->data->buffer.size())); +} + +static PyGetSetDef MemoryViewBatch_getsetters[] = { + {"num_entries", 
(getter)MemoryViewBatch_get_num_entries, NULL, + "Number of entries", NULL}, + {"num_bytes", (getter)MemoryViewBatch_get_num_bytes, NULL, + "Total buffer size in bytes", NULL}, + {NULL}}; + +PyTypeObject MemoryViewBatchType = { + .ob_base = PyVarObject_HEAD_INIT(NULL, 0).tp_name = + "dftracer_utils_ext._MemoryViewBatch", + .tp_basicsize = sizeof(MemoryViewBatchObject), + .tp_itemsize = 0, + .tp_dealloc = (destructor)MemoryViewBatch_dealloc, + .tp_as_sequence = &MemoryViewBatch_as_sequence, + .tp_as_buffer = &MemoryViewBatch_as_buffer, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_doc = "Zero-copy batch of byte entries backed by a contiguous buffer", + .tp_getset = MemoryViewBatch_getsetters, +}; + +int init_memoryview_batch(PyObject *m) { + if (PyType_Ready(&MemoryViewBatchType) < 0) return -1; + Py_INCREF(&MemoryViewBatchType); + if (PyModule_AddObject(m, "_MemoryViewBatch", + (PyObject *)&MemoryViewBatchType) < 0) { + Py_DECREF(&MemoryViewBatchType); + return -1; + } + return 0; +} + +} // namespace dftracer::utils::python diff --git a/src/dftracer/utils/python/memoryview_batch.h b/src/dftracer/utils/python/memoryview_batch.h new file mode 100644 index 00000000..67fef45c --- /dev/null +++ b/src/dftracer/utils/python/memoryview_batch.h @@ -0,0 +1,54 @@ +#ifndef DFTRACER_UTILS_PYTHON_MEMORYVIEW_BATCH_H +#define DFTRACER_UTILS_PYTHON_MEMORYVIEW_BATCH_H + +#define PY_SSIZE_T_CLEAN +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::python { + +struct MemoryViewBatchData { + std::vector buffer; + std::vector offsets; + std::vector lengths; + + std::size_t num_entries() const { return offsets.size(); } +}; + +struct MemoryViewBatchObject { + PyObject_HEAD MemoryViewBatchData *data; +}; + +extern PyTypeObject MemoryViewBatchType; + +PyObject *MemoryViewBatch_item(MemoryViewBatchObject *self, Py_ssize_t i); + +struct MemoryViewBatchIteratorState { + std::shared_ptr> + channel; + std::mutex error_mtx; + 
std::exception_ptr error; + std::atomic cancelled{false}; + std::size_t memory_budget_bytes = 0; + std::atomic bytes_in_queue{0}; + std::shared_future task_future; + + void set_error(std::exception_ptr e) { + std::lock_guard lock(error_mtx); + if (!error) error = e; + } +}; + +int init_memoryview_batch(PyObject *m); + +} // namespace dftracer::utils::python + +#endif // DFTRACER_UTILS_PYTHON_MEMORYVIEW_BATCH_H diff --git a/src/dftracer/utils/python/runtime.cpp b/src/dftracer/utils/python/runtime.cpp index 6e6272f4..e5c63993 100644 --- a/src/dftracer/utils/python/runtime.cpp +++ b/src/dftracer/utils/python/runtime.cpp @@ -30,11 +30,12 @@ static PyObject *Runtime_new(PyTypeObject *type, PyObject *args, } static int Runtime_init(RuntimeObject *self, PyObject *args, PyObject *kwds) { - static const char *kwlist[] = {"threads", NULL}; + static const char *kwlist[] = {"threads", "io_threads", NULL}; Py_ssize_t threads = 0; + Py_ssize_t io_threads = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "|n", (char **)kwlist, - &threads)) { + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|nn", (char **)kwlist, + &threads, &io_threads)) { return -1; } @@ -42,10 +43,17 @@ static int Runtime_init(RuntimeObject *self, PyObject *args, PyObject *kwds) { PyErr_SetString(PyExc_ValueError, "threads must be >= 0"); return -1; } + if (io_threads < 0) { + PyErr_SetString(PyExc_ValueError, "io_threads must be >= 0"); + return -1; + } try { - self->runtime = std::make_shared( - static_cast(threads)); + dftracer::utils::ExecutorConfig config; + config.num_threads = static_cast(threads); + config.io_pool_size = static_cast(io_threads); + self->runtime = + std::make_shared(config, true); } catch (const std::exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); return -1; @@ -393,9 +401,19 @@ static PyMethodDef Runtime_methods[] = { "Exit context manager (calls shutdown)."}, {NULL}}; +static PyObject *Runtime_get_io_threads(RuntimeObject *self, void *closure) { + if (!self->runtime) { 
+ PyErr_SetString(PyExc_RuntimeError, "Runtime not initialized"); + return NULL; + } + return PyLong_FromSize_t(self->runtime->io_threads()); +} + static PyGetSetDef Runtime_getsetters[] = { {"threads", (getter)Runtime_get_threads, NULL, "Number of worker threads", NULL}, + {"io_threads", (getter)Runtime_get_io_threads, NULL, + "Number of I/O threads", NULL}, {NULL}}; PyTypeObject RuntimeType = { @@ -418,13 +436,15 @@ PyTypeObject RuntimeType = { 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ - "Runtime(threads: int = 0)\n" + "Runtime(threads: int = 0, io_threads: int = 0)\n" "--\n" "\n" "Coroutine runtime backed by a thread pool.\n" "\n" "Args:\n" " threads (int): Number of worker threads. 0 (default) uses\n" + " the hardware concurrency.\n" + " io_threads (int): Number of I/O threads. 0 (default) uses\n" " the hardware concurrency.\n", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ diff --git a/src/dftracer/utils/python/schema_reconcile.cpp b/src/dftracer/utils/python/schema_reconcile.cpp new file mode 100644 index 00000000..21b8e783 --- /dev/null +++ b/src/dftracer/utils/python/schema_reconcile.cpp @@ -0,0 +1,351 @@ +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW + +#include + +#include +#include +#include + +namespace dftracer::utils::python { + +namespace { + +bool cstr_eq(const char *a, const char *b) { + if (a == b) return true; + if (!a || !b) return false; + return std::strcmp(a, b) == 0; +} + +// Unknown formats fall back to NA so we can still emit a safe null column. 
+ArrowType type_from_format(const ArrowSchema *s) { + if (!s || !s->format) return NANOARROW_TYPE_NA; + const char *f = s->format; + if (cstr_eq(f, "n")) return NANOARROW_TYPE_NA; + if (cstr_eq(f, "b")) return NANOARROW_TYPE_BOOL; + if (cstr_eq(f, "c")) return NANOARROW_TYPE_INT8; + if (cstr_eq(f, "s")) return NANOARROW_TYPE_INT16; + if (cstr_eq(f, "i")) return NANOARROW_TYPE_INT32; + if (cstr_eq(f, "l")) return NANOARROW_TYPE_INT64; + if (cstr_eq(f, "C")) return NANOARROW_TYPE_UINT8; + if (cstr_eq(f, "S")) return NANOARROW_TYPE_UINT16; + if (cstr_eq(f, "I")) return NANOARROW_TYPE_UINT32; + if (cstr_eq(f, "L")) return NANOARROW_TYPE_UINT64; + if (cstr_eq(f, "f")) return NANOARROW_TYPE_FLOAT; + if (cstr_eq(f, "g")) return NANOARROW_TYPE_DOUBLE; + if (cstr_eq(f, "u")) return NANOARROW_TYPE_STRING; + if (cstr_eq(f, "z")) return NANOARROW_TYPE_BINARY; + if (cstr_eq(f, "U")) return NANOARROW_TYPE_LARGE_STRING; + if (cstr_eq(f, "Z")) return NANOARROW_TYPE_LARGE_BINARY; + return NANOARROW_TYPE_NA; +} + +int build_null_array(const ArrowSchema *child_schema, int64_t length, + ArrowArray *out) { + ArrowError err; + ArrowErrorInit(&err); + ArrowType t = type_from_format(child_schema); + if (ArrowArrayInitFromType(out, t) != NANOARROW_OK) return -1; + if (ArrowArrayStartAppending(out) != NANOARROW_OK) return -1; + if (ArrowArrayAppendNull(out, length) != NANOARROW_OK) return -1; + if (ArrowArrayFinishBuildingDefault(out, &err) != NANOARROW_OK) return -1; + return 0; +} + +void json_escape(std::string_view in, std::string &out) { + for (char c : in) { + switch (c) { + case '"': + out.append("\\\""); + break; + case '\\': + out.append("\\\\"); + break; + case '\n': + out.append("\\n"); + break; + case '\r': + out.append("\\r"); + break; + case '\t': + out.append("\\t"); + break; + default: + if (static_cast(c) < 0x20) { + char buf[8]; + std::snprintf( + buf, sizeof(buf), "\\u%04x", + static_cast(static_cast(c))); + out.append(buf); + } else { + out.push_back(c); + } + } + } +} + 
+void append_json_scalar(const ArrowSchema *child_schema, + const ArrowArray *child_array, int64_t row, + std::string &out) { + if (!child_schema || !child_array) { + out.append("null"); + return; + } + ArrowArrayView view; + ArrowArrayViewInitFromType(&view, type_from_format(child_schema)); + ArrowError err; + ArrowErrorInit(&err); + if (ArrowArrayViewSetArray(&view, child_array, &err) != NANOARROW_OK) { + out.append("null"); + ArrowArrayViewReset(&view); + return; + } + if (ArrowArrayViewIsNull(&view, row)) { + out.append("null"); + ArrowArrayViewReset(&view); + return; + } + ArrowType t = type_from_format(child_schema); + switch (t) { + case NANOARROW_TYPE_BOOL: + out.append(ArrowArrayViewGetIntUnsafe(&view, row) ? "true" + : "false"); + break; + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_INT64: { + char buf[32]; + std::snprintf( + buf, sizeof(buf), "%lld", + static_cast(ArrowArrayViewGetIntUnsafe(&view, row))); + out.append(buf); + break; + } + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_UINT64: { + char buf[32]; + std::snprintf(buf, sizeof(buf), "%llu", + static_cast( + ArrowArrayViewGetUIntUnsafe(&view, row))); + out.append(buf); + break; + } + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: { + char buf[32]; + std::snprintf(buf, sizeof(buf), "%g", + ArrowArrayViewGetDoubleUnsafe(&view, row)); + out.append(buf); + break; + } + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: { + auto sv = ArrowArrayViewGetStringUnsafe(&view, row); + out.push_back('"'); + json_escape(std::string_view(sv.data, sv.size_bytes), out); + out.push_back('"'); + break; + } + default: + out.append("null"); + } + ArrowArrayViewReset(&view); +} + +} // namespace + +SchemaReconciler::SchemaReconciler() = default; + +bool SchemaReconciler::merge(const ArrowSchema *incoming) { + if (finalized_ || !incoming) return false; + bool added = 
false; + for (int64_t i = 0; i < incoming->n_children; ++i) { + const ArrowSchema *child = incoming->children[i]; + if (!child || !child->name) continue; + std::string name(child->name); + if (name == EXTRA_COLUMN_NAME) continue; // reserved + if (name_to_idx_.count(name)) continue; + nanoarrow::UniqueSchema copy; + if (ArrowSchemaDeepCopy(child, copy.get()) != NANOARROW_OK) { + last_error_ = "schema deep-copy failed while merging"; + return added; + } + int64_t idx = static_cast(names_.size()); + names_.push_back(name); + child_schemas_.push_back(std::move(copy)); + name_to_idx_.emplace(std::move(name), idx); + added = true; + } + return added; +} + +int SchemaReconciler::finalize() { + if (finalized_) return 0; + int64_t n = static_cast(child_schemas_.size()) + 1; + ArrowSchemaInit(locked_schema_.get()); + if (ArrowSchemaSetTypeStruct(locked_schema_.get(), n) != NANOARROW_OK) { + last_error_ = "failed to initialize union struct schema"; + return -1; + } + for (size_t i = 0; i < child_schemas_.size(); ++i) { + nanoarrow::UniqueSchema tmp; + if (ArrowSchemaDeepCopy(child_schemas_[i].get(), tmp.get()) != + NANOARROW_OK) { + last_error_ = "failed to deep-copy union child"; + return -1; + } + ArrowSchemaMove(tmp.get(), locked_schema_->children[i]); + } + ArrowSchema *extra = locked_schema_->children[child_schemas_.size()]; + if (ArrowSchemaSetType(extra, NANOARROW_TYPE_STRING) != NANOARROW_OK) { + last_error_ = "failed to set _extra column type"; + return -1; + } + if (ArrowSchemaSetName(extra, EXTRA_COLUMN_NAME) != NANOARROW_OK) { + last_error_ = "failed to name _extra column"; + return -1; + } + finalized_ = true; + return 0; +} + +int SchemaReconciler::copy_schema(ArrowSchema *out) const { + if (!finalized_) { + last_error_ = "copy_schema called before finalize"; + return -1; + } + nanoarrow::UniqueSchema tmp; + if (ArrowSchemaDeepCopy(locked_schema_.get(), tmp.get()) != NANOARROW_OK) { + last_error_ = "failed to deep-copy locked schema"; + return -1; + } + 
ArrowSchemaMove(tmp.get(), out); + return 0; +} + +int SchemaReconciler::reconcile(const ArrowSchema *in_schema, + ArrowArray *in_array, ArrowArray *out) const { + if (!finalized_) { + last_error_ = "reconcile called before finalize"; + return -1; + } + if (!in_schema || !in_array || !out) return -1; + + int64_t num_rows = in_array->length; + + // Initialize out as a struct matching the locked schema. This allocates + // children of the right types; we'll populate them below. + ArrowError err; + ArrowErrorInit(&err); + if (ArrowArrayInitFromSchema(out, locked_schema_.get(), &err) != + NANOARROW_OK) { + last_error_ = "ArrowArrayInitFromSchema failed for reconciled array"; + return -1; + } + + // Build: input-name -> input-child-index + std::unordered_map in_idx; + in_idx.reserve(static_cast(in_schema->n_children)); + for (int64_t i = 0; i < in_schema->n_children; ++i) { + const ArrowSchema *c = in_schema->children[i]; + if (c && c->name) in_idx.emplace(c->name, i); + } + + // For each known union column (all except the final _extra), try to take + // it from the input batch. If missing, null-pad. + int64_t n_known = num_known_columns(); + for (int64_t i = 0; i < n_known; ++i) { + const std::string &name = names_[static_cast(i)]; + auto it = in_idx.find(name); + if (it != in_idx.end()) { + // Release the pre-initialized placeholder child and move the + // input child into its slot (zero copy; release of the input + // goes null after the move). + ArrowArray *slot = out->children[i]; + if (slot->release) slot->release(slot); + ArrowArrayMove(in_array->children[it->second], slot); + } else { + ArrowArray *slot = out->children[i]; + if (slot->release) slot->release(slot); + if (build_null_array(locked_schema_->children[i], num_rows, slot) != + 0) { + last_error_ = "failed to build null column for missing field"; + return -1; + } + } + } + + // Find input children whose names aren't in the union: these feed _extra. 
+ std::vector unknown_in; + for (int64_t i = 0; i < in_schema->n_children; ++i) { + const ArrowSchema *c = in_schema->children[i]; + if (!c || !c->name) continue; + if (!name_to_idx_.count(c->name)) unknown_in.push_back(i); + } + + // Build the _extra column. Fast path: no unknowns -> all nulls. + ArrowArray *extra_slot = out->children[n_known]; + if (extra_slot->release) extra_slot->release(extra_slot); + if (unknown_in.empty()) { + if (ArrowArrayInitFromType(extra_slot, NANOARROW_TYPE_STRING) != + NANOARROW_OK) { + last_error_ = "failed to init null _extra column"; + return -1; + } + if (ArrowArrayStartAppending(extra_slot) != NANOARROW_OK || + ArrowArrayAppendNull(extra_slot, num_rows) != NANOARROW_OK || + ArrowArrayFinishBuildingDefault(extra_slot, &err) != NANOARROW_OK) { + last_error_ = "failed to append nulls to _extra"; + return -1; + } + } else { + // Slow path: JSON-encode unknown fields per row. + if (ArrowArrayInitFromType(extra_slot, NANOARROW_TYPE_STRING) != + NANOARROW_OK) { + last_error_ = "failed to init string _extra column"; + return -1; + } + if (ArrowArrayStartAppending(extra_slot) != NANOARROW_OK) { + last_error_ = "failed to start appending to _extra"; + return -1; + } + std::string buf; + for (int64_t row = 0; row < num_rows; ++row) { + buf.clear(); + buf.push_back('{'); + bool first = true; + for (int64_t u : unknown_in) { + const ArrowSchema *cs = in_schema->children[u]; + const ArrowArray *ca = in_array->children[u]; + if (!cs || !ca || !cs->name) continue; + if (!first) buf.push_back(','); + first = false; + buf.push_back('"'); + json_escape(cs->name, buf); + buf.append("\":"); + append_json_scalar(cs, ca, row, buf); + } + buf.push_back('}'); + ArrowStringView sv{buf.data(), static_cast(buf.size())}; + if (ArrowArrayAppendString(extra_slot, sv) != NANOARROW_OK) { + last_error_ = "failed to append _extra row"; + return -1; + } + } + if (ArrowArrayFinishBuildingDefault(extra_slot, &err) != NANOARROW_OK) { + last_error_ = "failed to finish 
_extra column"; + return -1; + } + } + + out->length = num_rows; + out->null_count = 0; + return 0; +} + +} // namespace dftracer::utils::python + +#endif // DFTRACER_UTILS_ENABLE_ARROW diff --git a/src/dftracer/utils/python/schema_reconcile.h b/src/dftracer/utils/python/schema_reconcile.h new file mode 100644 index 00000000..452ac507 --- /dev/null +++ b/src/dftracer/utils/python/schema_reconcile.h @@ -0,0 +1,49 @@ +#ifndef DFTRACER_UTILS_PYTHON_SCHEMA_RECONCILE_H +#define DFTRACER_UTILS_PYTHON_SCHEMA_RECONCILE_H + +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW + +#include + +#include +#include +#include +#include + +namespace dftracer::utils::python { + +// Build a union schema over batches from producers that each emit a subset +// of columns; after finalize(), surprise columns are JSON-encoded into +// _extra so no data is lost and the stream schema stays stable. +class SchemaReconciler { + public: + static constexpr const char *EXTRA_COLUMN_NAME = "_extra"; + + SchemaReconciler(); + + bool merge(const ArrowSchema *incoming); + int finalize(); + int copy_schema(ArrowSchema *out) const; + int reconcile(const ArrowSchema *in_schema, ArrowArray *in_array, + ArrowArray *out) const; + + bool finalized() const { return finalized_; } + int64_t num_known_columns() const { + return static_cast(child_schemas_.size()); + } + const std::string &last_error() const { return last_error_; } + + private: + std::vector names_; + std::vector child_schemas_; + std::unordered_map name_to_idx_; + nanoarrow::UniqueSchema locked_schema_; + bool finalized_ = false; + mutable std::string last_error_; +}; + +} // namespace dftracer::utils::python + +#endif // DFTRACER_UTILS_ENABLE_ARROW +#endif // DFTRACER_UTILS_PYTHON_SCHEMA_RECONCILE_H diff --git a/src/dftracer/utils/python/sst_distribution.cpp b/src/dftracer/utils/python/sst_distribution.cpp new file mode 100644 index 00000000..dc80b43c --- /dev/null +++ b/src/dftracer/utils/python/sst_distribution.cpp @@ -0,0 +1,1182 @@ +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using dftracer::utils::Runtime; +using dftracer::utils::utilities::filesystem::FileEntry; +using dftracer::utils::utilities::filesystem::PatternDirectoryScannerUtility; +using dftracer::utils::utilities::filesystem:: + PatternDirectoryScannerUtilityInput; +using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility; +using dftracer::utils::utilities::indexer::IndexBatchSink; +using dftracer::utils::utilities::indexer::IndexBuildBatchConfig; +using dftracer::utils::utilities::indexer::IndexBuildBatchResult; +using dftracer::utils::utilities::indexer::IndexDatabaseSstWriterContext; +using dftracer::utils::utilities::indexer::plan_lpt_partition; +using dftracer::utils::utilities::indexer::SstArtifactRegistry; +using dftracer::utils::utilities::indexer::internal:: + enumerate_gzip_member_candidates; +using dftracer::utils::utilities::indexer::internal::GzipMember; + +// --------------------------------------------------------------------------- +// SstArtifactRegistry type +// --------------------------------------------------------------------------- + +typedef struct { + PyObject_HEAD std::shared_ptr registry; +} SstArtifactRegistryObject; + +static void SstArtifactRegistry_dealloc(SstArtifactRegistryObject *self) { + self->registry.~shared_ptr(); + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static PyObject *SstArtifactRegistry_new(PyTypeObject *type, + PyObject * /*args*/, + PyObject * /*kwds*/) { + auto *self = (SstArtifactRegistryObject *)type->tp_alloc(type, 0); + if (!self) return NULL; + new (&self->registry) std::shared_ptr( + std::make_shared()); + return (PyObject *)self; +} + +namespace { + +// Field names in the Artifacts dict returned by build_sst_batch and +// consumed by 
SstArtifactRegistry.append. Must match the field names on +// IndexDatabaseSstWriterContext::Artifacts. +constexpr const char *ARTIFACT_FIELDS[] = { + "metadata_sst", "checkpoints_sst", "manifest_sst", + "chunk_bloom_sst", "file_bloom_sst", "chunk_stats_sst", + "chunk_dim_stats_sst", "dimensions_sst", "file_scalar_stats_sst", + "file_cat_counts_sst", "file_pid_tid_counts_sst", "file_name_counts_sst", + "name_dictionary_sst", "name_file_postings_sst", "name_chunk_postings_sst", + "hash_tables_sst", "aggregation_sst", "system_metrics_sst", +}; + +/// Map a slot name to the matching Artifacts member. Kept in one place so +/// that adding a new CF requires updating only `ARTIFACT_FIELDS` plus +/// `dispatch_*` below. +std::optional *artifacts_slot( + IndexDatabaseSstWriterContext::Artifacts &a, std::string_view name) { + if (name == "metadata_sst") return &a.metadata_sst; + if (name == "checkpoints_sst") return &a.checkpoints_sst; + if (name == "manifest_sst") return &a.manifest_sst; + if (name == "chunk_bloom_sst") return &a.chunk_bloom_sst; + if (name == "file_bloom_sst") return &a.file_bloom_sst; + if (name == "chunk_stats_sst") return &a.chunk_stats_sst; + if (name == "chunk_dim_stats_sst") return &a.chunk_dim_stats_sst; + if (name == "dimensions_sst") return &a.dimensions_sst; + if (name == "file_scalar_stats_sst") return &a.file_scalar_stats_sst; + if (name == "file_cat_counts_sst") return &a.file_cat_counts_sst; + if (name == "file_pid_tid_counts_sst") return &a.file_pid_tid_counts_sst; + if (name == "file_name_counts_sst") return &a.file_name_counts_sst; + if (name == "name_dictionary_sst") return &a.name_dictionary_sst; + if (name == "name_file_postings_sst") return &a.name_file_postings_sst; + if (name == "name_chunk_postings_sst") return &a.name_chunk_postings_sst; + if (name == "hash_tables_sst") return &a.hash_tables_sst; + if (name == "aggregation_sst") return &a.aggregation_sst; + if (name == "system_metrics_sst") return &a.system_metrics_sst; + return 
nullptr; +} + +/// Convert a Python artifacts dict to the C++ Artifacts struct. Missing, +/// None, or empty-string entries become nullopt. Returns false on type +/// errors (exception set). +bool artifacts_from_dict(PyObject *dict, + IndexDatabaseSstWriterContext::Artifacts *out) { + if (!PyDict_Check(dict)) { + PyErr_SetString(PyExc_TypeError, "artifacts must be a dict"); + return false; + } + for (const char *field : ARTIFACT_FIELDS) { + PyObject *val = PyDict_GetItemString(dict, field); // borrowed + if (!val || val == Py_None) continue; + if (!PyUnicode_Check(val)) { + PyErr_Format(PyExc_TypeError, "artifacts['%s'] must be str or None", + field); + return false; + } + const char *s = PyUnicode_AsUTF8(val); + if (!s) return false; + if (s[0] == '\0') continue; + auto *slot = artifacts_slot(*out, field); + if (slot) *slot = std::string(s); + } + return true; +} + +PyObject *artifacts_to_dict(const IndexDatabaseSstWriterContext::Artifacts &a) { + PyObject *dict = PyDict_New(); + if (!dict) return NULL; + auto set_field = [&](const char *name, + const std::optional &slot) -> bool { + PyObject *v = slot.has_value() ? 
PyUnicode_FromString(slot->c_str()) + : (Py_INCREF(Py_None), Py_None); + if (!v) return false; + int rc = PyDict_SetItemString(dict, name, v); + Py_DECREF(v); + return rc == 0; + }; + if (!set_field("metadata_sst", a.metadata_sst) || + !set_field("checkpoints_sst", a.checkpoints_sst) || + !set_field("manifest_sst", a.manifest_sst) || + !set_field("chunk_bloom_sst", a.chunk_bloom_sst) || + !set_field("file_bloom_sst", a.file_bloom_sst) || + !set_field("chunk_stats_sst", a.chunk_stats_sst) || + !set_field("chunk_dim_stats_sst", a.chunk_dim_stats_sst) || + !set_field("dimensions_sst", a.dimensions_sst) || + !set_field("file_scalar_stats_sst", a.file_scalar_stats_sst) || + !set_field("file_cat_counts_sst", a.file_cat_counts_sst) || + !set_field("file_pid_tid_counts_sst", a.file_pid_tid_counts_sst) || + !set_field("file_name_counts_sst", a.file_name_counts_sst) || + !set_field("name_dictionary_sst", a.name_dictionary_sst) || + !set_field("name_file_postings_sst", a.name_file_postings_sst) || + !set_field("name_chunk_postings_sst", a.name_chunk_postings_sst) || + !set_field("hash_tables_sst", a.hash_tables_sst) || + !set_field("aggregation_sst", a.aggregation_sst) || + !set_field("system_metrics_sst", a.system_metrics_sst)) { + Py_DECREF(dict); + return NULL; + } + return dict; +} + +} // namespace + +static PyObject *SstArtifactRegistry_append(SstArtifactRegistryObject *self, + PyObject *args) { + PyObject *dict; + if (!PyArg_ParseTuple(args, "O", &dict)) return NULL; + IndexDatabaseSstWriterContext::Artifacts a; + if (!artifacts_from_dict(dict, &a)) return NULL; + self->registry->append(std::move(a)); + Py_RETURN_NONE; +} + +static PyMethodDef SstArtifactRegistry_methods[] = { + {"append", (PyCFunction)SstArtifactRegistry_append, METH_VARARGS, + "append(artifacts_dict) -> None\n" + "Add a per-batch Artifacts dict (as returned by build_sst_batch or " + "IndexDatabaseSstWriterContext.commit) to the registry."}, + {NULL}}; + +static PyTypeObject SstArtifactRegistryType = 
{ + PyVarObject_HEAD_INIT(NULL, 0) "dftracer_utils_ext.SstArtifactRegistry", + sizeof(SstArtifactRegistryObject), + 0, + (destructor)SstArtifactRegistry_dealloc, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + Py_TPFLAGS_DEFAULT, + "Thread-safe collector for SST artifact paths.", + 0, + 0, + 0, + 0, + 0, + 0, + SstArtifactRegistry_methods, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + SstArtifactRegistry_new, +}; + +SstArtifactRegistry *sst_artifact_registry_get(PyObject *obj) { + if (!PyObject_TypeCheck(obj, &SstArtifactRegistryType)) return nullptr; + return ((SstArtifactRegistryObject *)obj)->registry.get(); +} + +// --------------------------------------------------------------------------- +// scan_files: parallel directory scan with size info +// --------------------------------------------------------------------------- + +static PyObject *scan_files_fn(PyObject * /*self*/, PyObject *args, + PyObject *kwds) { + static const char *kwlist[] = {"directory", "patterns", "recursive", + "runtime", NULL}; + const char *directory; + PyObject *patterns_obj = NULL; + int recursive = 0; + PyObject *runtime_arg = NULL; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|OpO", (char **)kwlist, + &directory, &patterns_obj, &recursive, + &runtime_arg)) { + return NULL; + } + + std::vector patterns; + if (patterns_obj && patterns_obj != Py_None) { + PyObject *seq = + PySequence_Fast(patterns_obj, "patterns must be a sequence"); + if (!seq) return NULL; + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); + patterns.reserve(n); + for (Py_ssize_t i = 0; i < n; ++i) { + const char *s = PyUnicode_AsUTF8(PySequence_Fast_GET_ITEM(seq, i)); + if (!s) { + Py_DECREF(seq); + return NULL; + } + patterns.emplace_back(s); + } + Py_DECREF(seq); + } + + Runtime *rt = nullptr; + if (runtime_arg && runtime_arg != Py_None) { + if (!PyObject_TypeCheck(runtime_arg, &RuntimeType)) { + PyObject *native = PyObject_GetAttrString(runtime_arg, "_native"); + if (!native || 
!PyObject_TypeCheck(native, &RuntimeType)) { + Py_XDECREF(native); + PyErr_SetString(PyExc_TypeError, + "runtime must be a Runtime instance or None"); + return NULL; + } + rt = ((RuntimeObject *)native)->runtime.get(); + Py_DECREF(native); + } else { + rt = ((RuntimeObject *)runtime_arg)->runtime.get(); + } + } else { + rt = get_default_runtime(); + } + + PatternDirectoryScannerUtilityInput input(directory, patterns, + recursive != 0, true); + std::vector entries; + try { + Py_BEGIN_ALLOW_THREADS rt + ->submit(dftracer::utils::run_coro_scope( + rt->executor(), + [](dftracer::utils::CoroScope &scope, + PatternDirectoryScannerUtilityInput in, + std::vector *out) + -> dftracer::utils::coro::CoroTask { + PatternDirectoryScannerUtility scanner; + *out = co_await scope.spawn(scanner, in); + }, + std::move(input), &entries), + "scan-files") + .get(); + Py_END_ALLOW_THREADS + } catch (const std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return NULL; + } + + PyObject *out = PyList_New(static_cast(entries.size())); + if (!out) return NULL; + for (std::size_t i = 0; i < entries.size(); ++i) { + PyObject *t = Py_BuildValue("(sn)", entries[i].path.c_str(), + (Py_ssize_t)entries[i].size); + if (!t) { + Py_DECREF(out); + return NULL; + } + PyList_SET_ITEM(out, i, t); + } + return out; +} + +// --------------------------------------------------------------------------- +// plan_lpt_partition: LPT bin-packing of (path, size) pairs +// --------------------------------------------------------------------------- + +static PyObject *plan_lpt_partition_fn(PyObject * /*self*/, PyObject *args) { + PyObject *entries_obj; + Py_ssize_t num_workers; + if (!PyArg_ParseTuple(args, "On", &entries_obj, &num_workers)) return NULL; + if (num_workers <= 0) num_workers = 1; + + std::vector entries; + PyObject *seq = PySequence_Fast(entries_obj, + "entries must be a sequence of " + "(path, size) tuples"); + if (!seq) return NULL; + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); 
+ entries.reserve(n); + for (Py_ssize_t i = 0; i < n; ++i) { + PyObject *item = PySequence_Fast_GET_ITEM(seq, i); + const char *path = nullptr; + Py_ssize_t size = 0; + if (!PyArg_ParseTuple(item, "sn", &path, &size)) { + Py_DECREF(seq); + return NULL; + } + FileEntry fe; + fe.path = path; + fe.size = static_cast(size); + fe.is_regular_file = true; + entries.push_back(std::move(fe)); + } + Py_DECREF(seq); + + auto buckets = plan_lpt_partition(std::move(entries), + static_cast(num_workers)); + + PyObject *out = PyList_New(static_cast(buckets.size())); + if (!out) return NULL; + for (std::size_t i = 0; i < buckets.size(); ++i) { + PyObject *lst = PyList_New(static_cast(buckets[i].size())); + if (!lst) { + Py_DECREF(out); + return NULL; + } + for (std::size_t j = 0; j < buckets[i].size(); ++j) { + PyObject *t = Py_BuildValue("(sn)", buckets[i][j].path.c_str(), + (Py_ssize_t)buckets[i][j].size); + if (!t) { + Py_DECREF(lst); + Py_DECREF(out); + return NULL; + } + PyList_SET_ITEM(lst, j, t); + } + PyList_SET_ITEM(out, i, lst); + } + return out; +} + +// --------------------------------------------------------------------------- +// build_sst_batch: run the indexer pipeline with an SST sink and return +// the merged Artifacts dict. 
+// --------------------------------------------------------------------------- + +static PyObject *build_sst_batch_fn(PyObject * /*self*/, PyObject *args, + PyObject *kwds) { + static const char *kwlist[] = {"files", + "file_ids", + "staging_dir", + "batch_id", + "index_dir", + "checkpoint_size", + "build_manifest", + "force_rebuild", + "bloom_dimensions", + "parallelism", + "flush_every_files", + "runtime", + "aggregation_config", + "file_slices", + NULL}; + PyObject *files_obj; + PyObject *file_ids_obj; + const char *staging_dir; + const char *batch_id; + const char *index_dir = ""; + Py_ssize_t checkpoint_size = 32 * 1024 * 1024; + int build_manifest = 0; + int force_rebuild = 0; + PyObject *bloom_dims_obj = NULL; + Py_ssize_t parallelism = 0; + Py_ssize_t flush_every_files = 0; + PyObject *runtime_arg = NULL; + PyObject *aggregation_config_obj = NULL; + PyObject *file_slices_obj = NULL; + + if (!PyArg_ParseTupleAndKeywords( + args, kwds, "OOss|snppOnnOOO", (char **)kwlist, &files_obj, + &file_ids_obj, &staging_dir, &batch_id, &index_dir, + &checkpoint_size, &build_manifest, &force_rebuild, &bloom_dims_obj, + ¶llelism, &flush_every_files, &runtime_arg, + &aggregation_config_obj, &file_slices_obj)) { + return NULL; + } + + // Unpack files. + std::vector files; + { + PyObject *seq = PySequence_Fast(files_obj, "files must be a sequence"); + if (!seq) return NULL; + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); + files.reserve(n); + for (Py_ssize_t i = 0; i < n; ++i) { + const char *s = PyUnicode_AsUTF8(PySequence_Fast_GET_ITEM(seq, i)); + if (!s) { + Py_DECREF(seq); + return NULL; + } + files.emplace_back(s); + } + Py_DECREF(seq); + } + if (files.empty()) { + return PyDict_New(); + } + + // Unpack file_ids, parallel to files. 
+ std::vector file_ids; + { + PyObject *seq = + PySequence_Fast(file_ids_obj, "file_ids must be a sequence"); + if (!seq) return NULL; + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); + if (static_cast(n) != files.size()) { + Py_DECREF(seq); + PyErr_SetString(PyExc_ValueError, + "file_ids must have the same length as files"); + return NULL; + } + file_ids.reserve(n); + for (Py_ssize_t i = 0; i < n; ++i) { + long v = PyLong_AsLong(PySequence_Fast_GET_ITEM(seq, i)); + if (v == -1 && PyErr_Occurred()) { + Py_DECREF(seq); + return NULL; + } + file_ids.push_back(static_cast(v)); + } + Py_DECREF(seq); + } + + // Optional bloom dimensions override. + std::vector bloom_dims; + if (bloom_dims_obj && bloom_dims_obj != Py_None) { + PyObject *seq = PySequence_Fast(bloom_dims_obj, + "bloom_dimensions must be a sequence"); + if (!seq) return NULL; + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); + bloom_dims.reserve(n); + for (Py_ssize_t i = 0; i < n; ++i) { + const char *s = PyUnicode_AsUTF8(PySequence_Fast_GET_ITEM(seq, i)); + if (!s) { + Py_DECREF(seq); + return NULL; + } + bloom_dims.emplace_back(s); + } + Py_DECREF(seq); + } + + // Resolve Runtime (matching CheckpointIndexer pattern). + Runtime *rt = nullptr; + if (runtime_arg && runtime_arg != Py_None) { + if (PyObject_TypeCheck(runtime_arg, &RuntimeType)) { + rt = ((RuntimeObject *)runtime_arg)->runtime.get(); + } else { + PyObject *native = PyObject_GetAttrString(runtime_arg, "_native"); + if (!native || !PyObject_TypeCheck(native, &RuntimeType)) { + Py_XDECREF(native); + PyErr_SetString(PyExc_TypeError, + "runtime must be a Runtime instance or None"); + return NULL; + } + rt = ((RuntimeObject *)native)->runtime.get(); + Py_DECREF(native); + } + } else { + rt = get_default_runtime(); + } + + // Build config + sink factory shared state. 
+ struct SharedArtifacts { + std::mutex mu; + std::vector list; + }; + auto artifacts = std::make_shared(); + auto staging = std::string(staging_dir); + auto batch = std::string(batch_id); + + // Optional aggregation config, extracted from the Python dataclass. + std::shared_ptr + agg_config_ptr; + if (aggregation_config_obj && aggregation_config_obj != Py_None) { + using dftracer::utils::utilities::composites::dft::aggregators:: + AggregationConfig; + auto cfg = std::make_shared(); + auto pull_double = [&](const char *name, double fallback) -> double { + PyObject *v = PyObject_GetAttrString(aggregation_config_obj, name); + if (!v || v == Py_None) { + Py_XDECREF(v); + PyErr_Clear(); + return fallback; + } + double out = PyFloat_AsDouble(v); + Py_DECREF(v); + if (out == -1.0 && PyErr_Occurred()) return fallback; + return out; + }; + auto pull_bool = [&](const char *name, bool fallback) -> bool { + PyObject *v = PyObject_GetAttrString(aggregation_config_obj, name); + if (!v || v == Py_None) { + Py_XDECREF(v); + PyErr_Clear(); + return fallback; + } + int out = PyObject_IsTrue(v); + Py_DECREF(v); + return out > 0 ? 
true : fallback; + }; + auto pull_string_list = + [&](const char *name) -> std::vector { + std::vector out; + PyObject *v = PyObject_GetAttrString(aggregation_config_obj, name); + if (!v || v == Py_None) { + Py_XDECREF(v); + PyErr_Clear(); + return out; + } + PyObject *seq = PySequence_Fast(v, "expected list of str"); + Py_DECREF(v); + if (!seq) { + PyErr_Clear(); + return out; + } + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); + out.reserve(n); + for (Py_ssize_t i = 0; i < n; ++i) { + const char *s = + PyUnicode_AsUTF8(PySequence_Fast_GET_ITEM(seq, i)); + if (s) out.emplace_back(s); + } + Py_DECREF(seq); + return out; + }; + double time_interval_ms = pull_double("time_interval_ms", 5000.0); + cfg->time_interval_us = + static_cast(time_interval_ms * 1000.0); + cfg->compute_percentiles = pull_bool("compute_percentiles", false); + cfg->extra_group_keys = pull_string_list("group_keys"); + cfg->custom_metric_fields = pull_string_list("custom_metric_fields"); + agg_config_ptr = std::move(cfg); + } + + // owned_member_maps must outlive rt->submit: FileSlice::members is raw. 
+ std::vector> owned_member_maps; + std::vector parsed_slices; + if (file_slices_obj && file_slices_obj != Py_None) { + PyObject *seq = + PySequence_Fast(file_slices_obj, "file_slices must be a sequence"); + if (!seq) return NULL; + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); + if (static_cast(n) != files.size()) { + Py_DECREF(seq); + PyErr_SetString(PyExc_ValueError, + "file_slices must match files length"); + return NULL; + } + owned_member_maps.resize(n); + parsed_slices.resize(n); + for (Py_ssize_t i = 0; i < n; ++i) { + PyObject *entry = PySequence_Fast_GET_ITEM(seq, i); + if (entry == Py_None) { + continue; // leave slice default-constructed (members=null) + } + Py_ssize_t mb = 0, me = 0, ckpt_base = 0; + int skip_scoped = 0; + PyObject *members_obj = nullptr; + if (!PyArg_ParseTuple(entry, "nnnpO", &mb, &me, &ckpt_base, + &skip_scoped, &members_obj)) { + Py_DECREF(seq); + return NULL; + } + PyObject *mseq = PySequence_Fast( + members_obj, "file_slices[i].members must be a sequence"); + if (!mseq) { + Py_DECREF(seq); + return NULL; + } + Py_ssize_t mn = PySequence_Fast_GET_SIZE(mseq); + auto &mv = owned_member_maps[i]; + mv.resize(mn); + for (Py_ssize_t j = 0; j < mn; ++j) { + PyObject *m = PySequence_Fast_GET_ITEM(mseq, j); + unsigned long long c_offset = 0, c_size = 0; + if (!PyArg_ParseTuple(m, "KK", &c_offset, &c_size)) { + Py_DECREF(mseq); + Py_DECREF(seq); + return NULL; + } + mv[j].c_offset = static_cast(c_offset); + mv[j].c_size = static_cast(c_size); + } + Py_DECREF(mseq); + parsed_slices[i].members = &mv; + parsed_slices[i].member_begin = static_cast(mb); + parsed_slices[i].member_end = static_cast(me); + parsed_slices[i].checkpoint_idx_base = + static_cast(ckpt_base); + parsed_slices[i].skip_file_scoped_writes = skip_scoped != 0; + } + Py_DECREF(seq); + } + + auto batch_config = std::make_shared(); + batch_config->file_paths = std::move(files); + batch_config->preassigned_file_ids = std::move(file_ids); + if (!parsed_slices.empty()) { + 
batch_config->file_slices = parsed_slices; + } + batch_config->index_dir = index_dir; + batch_config->checkpoint_size = static_cast(checkpoint_size); + batch_config->build_manifest = build_manifest != 0; + batch_config->force_rebuild = force_rebuild != 0; + batch_config->bloom_dimensions = std::move(bloom_dims); + batch_config->parallelism = + parallelism > 0 ? static_cast(parallelism) + : (rt ? std::max(rt->threads(), 1) : 1); + batch_config->flush_every_files = + static_cast(flush_every_files); + batch_config->rebuild_root_summaries = false; + + if (agg_config_ptr) { + auto agg_staging = staging; + auto agg_prefix = batch + "_agg"; + // Counter keeps per-file SST dirs unique across duplicate file_paths + // when one worker owns multiple slices of the same file. + auto visitor_counter = std::make_shared>(0); + batch_config->dft_visitor_factory = + [agg_staging, agg_prefix, agg_config_ptr, + visitor_counter](const std::string &file_path) + -> std::vector> { + using dftracer::utils::utilities::composites::dft::DftEventVisitor; + using dftracer::utils::utilities::composites::dft::aggregators:: + AggregationVisitor; + const std::size_t idx = + visitor_counter->fetch_add(1, std::memory_order_relaxed); + std::string prefix = agg_prefix + "_" + std::to_string(idx); + std::vector> visitors; + visitors.push_back(std::make_unique( + agg_staging, prefix, /*config_hash=*/0, *agg_config_ptr, + file_path)); + return visitors; + }; + } + + // Atomic: write phase calls sink_factory from N coroutines concurrently. 
+ auto batch_counter = std::make_shared>(0); + batch_config->sink_factory = + [staging, batch, batch_counter]() -> std::unique_ptr { + const std::size_t idx = + batch_counter->fetch_add(1, std::memory_order_relaxed); + std::string sub_batch = batch + "_" + std::to_string(idx); + return std::make_unique(staging, + sub_batch); + }; + batch_config->sink_commit = [artifacts](IndexBatchSink &sink) { + auto &sst = static_cast(sink); + auto batch_artifacts = sst.commit(); + std::lock_guard lock(artifacts->mu); + if (!batch_artifacts.empty()) { + artifacts->list.push_back(std::move(batch_artifacts)); + } + }; + + IndexBuildBatchResult result; + std::string submit_error; + Py_BEGIN_ALLOW_THREADS try { + rt->submit(dftracer::utils::run_coro_scope( + rt->executor(), + [](dftracer::utils::CoroScope &scope, + std::shared_ptr cfg, + IndexBuildBatchResult *out) + -> dftracer::utils::coro::CoroTask { + *out = co_await IndexBatchBuilderUtility::process( + &scope, std::move(cfg)); + }, + batch_config, &result), + "build-sst-batch") + .get(); + } catch (const std::exception &e) { + submit_error = e.what(); + } + Py_END_ALLOW_THREADS if (!submit_error.empty()) { + PyErr_SetString(PyExc_RuntimeError, submit_error.c_str()); + return NULL; + } + + // If any file failed, surface the first error. + if (result.failed > 0) { + for (const auto &r : result.results) { + if (!r.success) { + PyErr_SetString(PyExc_RuntimeError, r.error_message.c_str()); + return NULL; + } + } + } + + // One dict per committed sink + per-file aggregation below. + PyObject *out_list = PyList_New(0); + if (!out_list) return NULL; + { + std::lock_guard lock(artifacts->mu); + for (const auto &a : artifacts->list) { + PyObject *main_dict = artifacts_to_dict(a); + if (!main_dict || PyList_Append(out_list, main_dict) < 0) { + Py_XDECREF(main_dict); + Py_DECREF(out_list); + return NULL; + } + Py_DECREF(main_dict); + } + } + // Harvest per-file aggregation SSTs from extra visitors. 
Each visitor + // holds a vector of Artifacts (one per FLUSH_THRESHOLD flush + the + // file-complete flush) because SstFileWriter requires strictly + // ascending keys per SST and cross-flush merge operands would collide. + // + // `extra_visitors` is indexed per input file, but a single + // AggregationVisitor instance is typically shared across every file in + // the batch (one flush at end-of-batch). Without dedup we would emit + // that visitor's artifact dict N_files times, producing a manifest with + // N copies of the same SST path. Dedup by visitor pointer so each + // unique flush-sequence is emitted exactly once. + using dftracer::utils::utilities::composites::dft::aggregators:: + AggregationVisitor; + using dftracer::utils::utilities::composites::dft::aggregators:: + AssociationTracker; + std::unordered_set seen_visitors; + for (auto &file_visitors : result.extra_visitors) { + for (auto &visitor : file_visitors) { + auto *agg = dynamic_cast(visitor.get()); + if (!agg) continue; + if (!seen_visitors.insert(agg).second) continue; + for (auto &a : agg->aggregation_artifacts()) { + if (a.empty()) continue; + PyObject *agg_dict = artifacts_to_dict(a); + if (!agg_dict || PyList_Append(out_list, agg_dict) < 0) { + Py_XDECREF(agg_dict); + Py_DECREF(out_list); + return NULL; + } + Py_DECREF(agg_dict); + } + } + } + + AssociationTracker combined; + bool any_tracker = false; + for (auto *agg : seen_visitors) { + auto out = agg->take_output(); + if (out.local_tracker) { + combined.merge(*out.local_tracker); + any_tracker = true; + } + } + PyObject *tracker_bytes = nullptr; + if (any_tracker) { + combined.finalize(); + std::string blob = combined.serialize(); + tracker_bytes = PyBytes_FromStringAndSize( + blob.data(), static_cast(blob.size())); + } else { + tracker_bytes = PyBytes_FromStringAndSize(nullptr, 0); + } + if (!tracker_bytes) { + Py_DECREF(out_list); + return NULL; + } + PyObject *ret = PyTuple_Pack(2, out_list, tracker_bytes); + Py_DECREF(out_list); + 
Py_DECREF(tracker_bytes); + return ret; +} + +static PyObject *enable_aggregation_deterministic_ids_fn(PyObject * /*self*/, + PyObject * /*args*/) { + dftracer::utils::utilities::composites::dft::aggregators:: + aggregation_intern() + .enable_deterministic_ids(); + Py_RETURN_NONE; +} + +static PyObject *move_artifacts_fn(PyObject * /*self*/, PyObject *args, + PyObject *kwds) { + static const char *kwlist[] = {"artifacts", "dest_dir", NULL}; + PyObject *dict = NULL; + const char *dest_dir = NULL; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "Os", (char **)kwlist, &dict, + &dest_dir)) { + return NULL; + } + IndexDatabaseSstWriterContext::Artifacts a; + if (!artifacts_from_dict(dict, &a)) return NULL; + IndexDatabaseSstWriterContext::Artifacts moved; + try { + Py_BEGIN_ALLOW_THREADS moved = std::move(a).move_to(dest_dir); + Py_END_ALLOW_THREADS + } catch (const std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return NULL; + } + return artifacts_to_dict(moved); +} + +namespace { + +dftracer::utils::coro::CoroTask scan_one_gzip_file( + std::string path, std::vector *out) { + out->clear(); + int fd = ::open(path.c_str(), O_RDONLY); + if (fd < 0) co_return; + struct stat st; + if (::fstat(fd, &st) == 0 && st.st_size >= 18) { + co_await enumerate_gzip_member_candidates( + fd, static_cast(st.st_size), *out); + } + ::close(fd); +} + +} // namespace + +static PyObject *enumerate_gzip_members_fn(PyObject * /*self*/, PyObject *args, + PyObject *kwds) { + static const char *kwlist[] = {"files", "runtime", NULL}; + PyObject *files_obj = NULL; + PyObject *runtime_arg = NULL; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O", (char **)kwlist, + &files_obj, &runtime_arg)) { + return NULL; + } + + std::vector files; + { + PyObject *seq = PySequence_Fast(files_obj, "files must be a sequence"); + if (!seq) return NULL; + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); + files.reserve(n); + for (Py_ssize_t i = 0; i < n; ++i) { + const char *s = 
PyUnicode_AsUTF8(PySequence_Fast_GET_ITEM(seq, i)); + if (!s) { + Py_DECREF(seq); + return NULL; + } + files.emplace_back(s); + } + Py_DECREF(seq); + } + + Runtime *rt = nullptr; + if (runtime_arg && runtime_arg != Py_None) { + if (PyObject_TypeCheck(runtime_arg, &RuntimeType)) { + rt = ((RuntimeObject *)runtime_arg)->runtime.get(); + } else { + PyObject *native = PyObject_GetAttrString(runtime_arg, "_native"); + if (!native || !PyObject_TypeCheck(native, &RuntimeType)) { + Py_XDECREF(native); + PyErr_SetString(PyExc_TypeError, + "runtime must be a Runtime instance or None"); + return NULL; + } + rt = ((RuntimeObject *)native)->runtime.get(); + Py_DECREF(native); + } + } else { + rt = get_default_runtime(); + } + + std::vector> results(files.size()); + std::string submit_error; + Py_BEGIN_ALLOW_THREADS try { + rt->submit( + dftracer::utils::run_coro_scope( + rt->executor(), + [](dftracer::utils::CoroScope &scope, + const std::vector *paths, + std::vector> *out) + -> dftracer::utils::coro::CoroTask { + co_await scope.scope( + [paths, out](dftracer::utils::CoroScope &child) + -> dftracer::utils::coro::CoroTask { + for (std::size_t i = 0; i < paths->size(); ++i) { + const std::string &path = (*paths)[i]; + auto *slot = &(*out)[i]; + child.spawn( + [path, slot](dftracer::utils::CoroScope &) + -> dftracer::utils::coro::CoroTask< + void> { + co_await scan_one_gzip_file(path, + slot); + }); + } + co_return; + }); + co_return; + }, + &files, &results), + "enumerate-gzip-members") + .get(); + } catch (const std::exception &e) { + submit_error = e.what(); + } + Py_END_ALLOW_THREADS if (!submit_error.empty()) { + PyErr_SetString(PyExc_RuntimeError, submit_error.c_str()); + return NULL; + } + + PyObject *out_list = PyList_New(static_cast(results.size())); + if (!out_list) return NULL; + for (std::size_t i = 0; i < results.size(); ++i) { + const auto &mv = results[i]; + PyObject *inner = PyList_New(static_cast(mv.size())); + if (!inner) { + Py_DECREF(out_list); + return NULL; + 
} + for (std::size_t j = 0; j < mv.size(); ++j) { + PyObject *t = + Py_BuildValue("(KK)", (unsigned long long)mv[j].c_offset, + (unsigned long long)mv[j].c_size); + if (!t) { + Py_DECREF(inner); + Py_DECREF(out_list); + return NULL; + } + PyList_SET_ITEM(inner, j, t); + } + PyList_SET_ITEM(out_list, i, inner); + } + return out_list; +} + +// Mirrors build_work_units + lpt_assign_units in dftracer_aggregator_mpi.cpp +// so the Dask backend produces identical work distribution to MPI. +static PyObject *plan_work_units_fn(PyObject * /*self*/, PyObject *args, + PyObject *kwds) { + static const char *kwlist[] = {"member_map", "num_workers", "target_c_size", + NULL}; + PyObject *map_obj = NULL; + Py_ssize_t num_workers = 0; + unsigned long long target_c_size = 0; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "On|K", (char **)kwlist, + &map_obj, &num_workers, &target_c_size)) { + return NULL; + } + if (num_workers <= 0) num_workers = 1; + + std::vector> member_map; + { + PyObject *seq = + PySequence_Fast(map_obj, "member_map must be a sequence"); + if (!seq) return NULL; + Py_ssize_t n = PySequence_Fast_GET_SIZE(seq); + member_map.resize(n); + for (Py_ssize_t i = 0; i < n; ++i) { + PyObject *inner = PySequence_Fast_GET_ITEM(seq, i); + PyObject *iseq = + PySequence_Fast(inner, "member_map[i] must be a sequence"); + if (!iseq) { + Py_DECREF(seq); + return NULL; + } + Py_ssize_t ni = PySequence_Fast_GET_SIZE(iseq); + member_map[i].resize(ni); + for (Py_ssize_t j = 0; j < ni; ++j) { + PyObject *t = PySequence_Fast_GET_ITEM(iseq, j); + unsigned long long c_offset = 0, c_size = 0; + if (!PyArg_ParseTuple(t, "KK", &c_offset, &c_size)) { + Py_DECREF(iseq); + Py_DECREF(seq); + return NULL; + } + member_map[i][j].c_offset = + static_cast(c_offset); + member_map[i][j].c_size = static_cast(c_size); + } + Py_DECREF(iseq); + } + Py_DECREF(seq); + } + + // Fallback: treat empty/non-gzip files as a single whole-file member. 
+ std::uint64_t total_c = 0; + for (auto &mv : member_map) { + if (mv.empty()) mv.push_back({0, 0}); + for (const auto &m : mv) total_c += m.c_size; + } + + if (target_c_size == 0) { + target_c_size = + (total_c + static_cast(num_workers) - 1) / + std::max(static_cast(num_workers), 1); + } + + struct Unit { + std::size_t file_idx; + std::size_t member_begin; + std::size_t member_end; + std::uint64_t c_size; + }; + std::vector units; + for (std::size_t fi = 0; fi < member_map.size(); ++fi) { + const auto &members = member_map[fi]; + if (members.empty()) continue; + std::size_t begin = 0; + std::uint64_t accum = 0; + for (std::size_t i = 0; i < members.size(); ++i) { + accum += members[i].c_size; + const bool is_last = (i + 1 == members.size()); + if ((target_c_size > 0 && accum >= target_c_size) || is_last) { + units.push_back({fi, begin, i + 1, accum}); + begin = i + 1; + accum = 0; + } + } + } + + std::vector order(units.size()); + for (std::size_t i = 0; i < order.size(); ++i) order[i] = i; + std::sort(order.begin(), order.end(), [&](std::size_t a, std::size_t b) { + if (units[a].c_size != units[b].c_size) + return units[a].c_size > units[b].c_size; + if (units[a].file_idx != units[b].file_idx) + return units[a].file_idx < units[b].file_idx; + return units[a].member_begin < units[b].member_begin; + }); + const std::size_t nw = static_cast(num_workers); + std::vector loads(nw, 0); + std::vector> per_worker(nw); + for (std::size_t ord : order) { + std::size_t best = 0; + for (std::size_t r = 1; r < nw; ++r) + if (loads[r] < loads[best]) best = r; + per_worker[best].push_back(ord); + loads[best] += std::max(units[ord].c_size, 1); + } + + PyObject *out = PyList_New(static_cast(nw)); + if (!out) return NULL; + for (std::size_t w = 0; w < nw; ++w) { + // Keep per-worker slices sorted by (file_idx, member_begin) for + // deterministic, file-group-friendly iteration downstream. 
+ auto &lst = per_worker[w]; + std::sort(lst.begin(), lst.end(), [&](std::size_t a, std::size_t b) { + if (units[a].file_idx != units[b].file_idx) + return units[a].file_idx < units[b].file_idx; + return units[a].member_begin < units[b].member_begin; + }); + PyObject *inner = PyList_New(static_cast(lst.size())); + if (!inner) { + Py_DECREF(out); + return NULL; + } + for (std::size_t k = 0; k < lst.size(); ++k) { + const auto &u = units[lst[k]]; + PyObject *t = Py_BuildValue( + "(nnnK)", (Py_ssize_t)u.file_idx, (Py_ssize_t)u.member_begin, + (Py_ssize_t)u.member_end, (unsigned long long)u.c_size); + if (!t) { + Py_DECREF(inner); + Py_DECREF(out); + return NULL; + } + PyList_SET_ITEM(inner, k, t); + } + PyList_SET_ITEM(out, w, inner); + } + return out; +} + +// --------------------------------------------------------------------------- +// Module registration +// --------------------------------------------------------------------------- + +static PyMethodDef SstDistributionMethods[] = { + {"build_sst_batch", (PyCFunction)build_sst_batch_fn, + METH_VARARGS | METH_KEYWORDS, + "build_sst_batch(files, file_ids, staging_dir, batch_id, ...) " + "-> (list[dict], bytes)\n" + "Run the indexer pipeline with an SST sink and return " + "(artifact_dicts, tracker_blob). The tracker blob is the serialized " + "merged AssociationTracker from this batch's aggregation visitors " + "(empty bytes when no aggregation_config was passed)."}, + {"plan_lpt_partition", (PyCFunction)plan_lpt_partition_fn, METH_VARARGS, + "plan_lpt_partition(entries, num_workers) -> list[list[(path, size)]]\n" + "Greedy Longest-Processing-Time-first bin-packing of (path, size) " + "tuples across num_workers buckets. 
Minimises the maximum per-worker " + "total size."}, + {"scan_files", (PyCFunction)scan_files_fn, METH_VARARGS | METH_KEYWORDS, + "scan_files(directory, patterns=None, recursive=False, runtime=None) " + "-> list[(path, size)]\n" + "Parallel directory scan returning (path, size) tuples for regular " + "files matching the patterns."}, + {"enable_aggregation_deterministic_ids", + (PyCFunction)enable_aggregation_deterministic_ids_fn, METH_NOARGS, + "enable_aggregation_deterministic_ids() -> None\n" + "Flip the global aggregation StringIntern into deterministic-id mode " + "so the same string maps to the same 32-bit id in every worker " + "process. Call once at worker startup BEFORE any aggregation work."}, + {"move_artifacts", (PyCFunction)move_artifacts_fn, + METH_VARARGS | METH_KEYWORDS, + "move_artifacts(artifacts, dest_dir) -> dict\n" + "Move every populated SST in `artifacts` (as returned by " + "`build_sst_batch`) into `dest_dir` via the C++ rename/copy helper, " + "returning a fresh dict with the new paths. Single GIL release, no " + "per-file Python shutil.move overhead."}, + {"enumerate_gzip_members", (PyCFunction)enumerate_gzip_members_fn, + METH_VARARGS | METH_KEYWORDS, + "enumerate_gzip_members(files, runtime=None) -> list[list[(c_offset, " + "c_size)]]\n" + "Cooperative async scan of gzip member offsets across `files`. " + "Returns lists of (c_offset, c_size) parallel to `files`; empty for " + "non-gzip / unreadable files."}, + {"plan_work_units", (PyCFunction)plan_work_units_fn, + METH_VARARGS | METH_KEYWORDS, + "plan_work_units(member_map, num_workers, target_c_size=0) " + "-> list[list[(file_idx, member_begin, member_end, c_size)]]\n" + "Deterministic LPT assignment of intra-file gzip-member slices " + "across workers. 
Each worker's list contains (file_idx, " + "member_begin, member_end, c_size) tuples; a file sliced across " + "multiple workers appears in each owner's list with disjoint " + "[member_begin, member_end) ranges."}, + {NULL, NULL, 0, NULL}}; + +int init_sst_distribution(PyObject *m) { + if (PyType_Ready(&SstArtifactRegistryType) < 0) return -1; + Py_INCREF(&SstArtifactRegistryType); + if (PyModule_AddObject(m, "SstArtifactRegistry", + (PyObject *)&SstArtifactRegistryType) < 0) { + Py_DECREF(&SstArtifactRegistryType); + return -1; + } + if (PyModule_AddFunctions(m, SstDistributionMethods) < 0) return -1; + return 0; +} diff --git a/src/dftracer/utils/python/sst_distribution.h b/src/dftracer/utils/python/sst_distribution.h new file mode 100644 index 00000000..05e9e099 --- /dev/null +++ b/src/dftracer/utils/python/sst_distribution.h @@ -0,0 +1,18 @@ +#ifndef DFTRACER_UTILS_PYTHON_SST_DISTRIBUTION_H +#define DFTRACER_UTILS_PYTHON_SST_DISTRIBUTION_H + +#include + +namespace dftracer::utils::utilities::indexer { +class SstArtifactRegistry; +} + +/// Extract the owned C++ SstArtifactRegistry from a Python +/// SstArtifactRegistry instance. Returns NULL (without setting an error) +/// if `obj` is not an SstArtifactRegistry. 
+dftracer::utils::utilities::indexer::SstArtifactRegistry *
+sst_artifact_registry_get(PyObject *obj);
+
+int init_sst_distribution(PyObject *m);
+
+#endif  // DFTRACER_UTILS_PYTHON_SST_DISTRIBUTION_H
diff --git a/src/dftracer/utils/python/streaming_iterator.cpp b/src/dftracer/utils/python/streaming_iterator.cpp
new file mode 100644
index 00000000..be0d6571
--- /dev/null
+++ b/src/dftracer/utils/python/streaming_iterator.cpp
@@ -0,0 +1,211 @@
+#include
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+
+#define PY_SSIZE_T_CLEAN
+#include
+#include
+#include
+
+namespace dftracer::utils::python {
+
+/// tp_new: allocate the Python object together with its heap-side C++ state.
+///
+/// The state starts with empty callbacks, so iterating before pull_next is
+/// set raises RuntimeError. Returns NULL with an exception set on failure.
+static PyObject* ArrowStreamingIterator_new(PyTypeObject* type,
+                                            PyObject* /*args*/,
+                                            PyObject* /*kwds*/) {
+    ArrowStreamingIteratorObject* self =
+        (ArrowStreamingIteratorObject*)type->tp_alloc(type, 0);
+    if (!self) return NULL;
+
+    // Allocate C++ state separately to avoid layout issues. A C++ exception
+    // must not unwind into the interpreter, so report it as MemoryError.
+    try {
+        self->cpp_state = new ArrowStreamingIteratorState();
+    } catch (...) {
+        // tp_alloc zero-fills the object, so dealloc sees a null cpp_state.
+        Py_DECREF(self);
+        return PyErr_NoMemory();
+    }
+    return (PyObject*)self;
+}
+
+/// tp_dealloc: cancel the stream if a cancel callback is installed, then free
+/// the C++ state and the Python object.
+static void ArrowStreamingIterator_dealloc(ArrowStreamingIteratorObject* self) {
+    if (self->cpp_state) {
+        // Cancel the stream if still running
+        if (self->cpp_state->cancel) {
+            self->cpp_state->cancel();
+        }
+        delete self->cpp_state;
+        self->cpp_state = nullptr;
+    }
+    Py_TYPE(self)->tp_free((PyObject*)self);
+}
+
+/// tp_iter: the object is its own iterator.
+static PyObject* ArrowStreamingIterator_iter(PyObject* self) {
+    Py_INCREF(self);
+    return self;
+}
+
+/// tp_iternext: wait (GIL released) for the next batch and wrap it.
+///
+/// Returns a new ArrowBatchCapsule, or NULL. NULL without an exception set
+/// signals normal completion (StopIteration); NULL with RuntimeError set
+/// means the iterator was not initialized or the producer reported an error.
+static PyObject* ArrowStreamingIterator_next(
+    ArrowStreamingIteratorObject* self) {
+    if (!self->cpp_state || !self->cpp_state->pull_next) {
+        PyErr_SetString(PyExc_RuntimeError, "Iterator not initialized");
+        return NULL;
+    }
+
+    std::optional<ArrowExportResult> result;
+    bool had_error = false;
+    std::string error_msg;
+
+    // No Python API may be called while the GIL is released, so a failure is
+    // only recorded here and turned into an exception afterwards.
+    Py_BEGIN_ALLOW_THREADS try {
+        result = self->cpp_state->pull_next();
+    } catch (const std::exception& e) {
+        had_error = true;
+        error_msg = e.what();
+    } catch (...) {
+        had_error = true;
+        error_msg = "Unknown error in streaming iterator";
+    }
+    Py_END_ALLOW_THREADS
+
+    if (had_error) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return NULL;
+    }
+
+    if (!result.has_value()) {
+        // Check for error
+        if (self->cpp_state->get_error) {
+            auto ex = self->cpp_state->get_error();
+            if (ex) {
+                try {
+                    std::rethrow_exception(ex);
+                } catch (const std::exception& e) {
+                    PyErr_SetString(PyExc_RuntimeError, e.what());
+                    return NULL;
+                } catch (...) {
+                    PyErr_SetString(PyExc_RuntimeError,
+                                    "Unknown error in streaming iterator");
+                    return NULL;
+                }
+            }
+        }
+        // Normal completion
+        return NULL;  // StopIteration
+    }
+
+    // Move the batch to the heap before creating the capsule, so that a C++
+    // failure neither unwinds into the interpreter nor leaves a capsule
+    // without a result.
+    ArrowExportResult* exported = nullptr;
+    try {
+        exported = new ArrowExportResult(std::move(*result));
+    } catch (const std::exception& e) {
+        PyErr_SetString(PyExc_RuntimeError, e.what());
+        return NULL;
+    } catch (...) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "Unknown error in streaming iterator");
+        return NULL;
+    }
+
+    // Wrap the ArrowExportResult in an ArrowBatchCapsule
+    ArrowBatchCapsuleObject* obj =
+        (ArrowBatchCapsuleObject*)ArrowBatchCapsuleType.tp_alloc(
+            &ArrowBatchCapsuleType, 0);
+    if (!obj) {
+        delete exported;
+        return NULL;
+    }
+    obj->result = exported;
+    return (PyObject*)obj;
+}
+
+/// cancel(): invoke the installed cancel callback, if any; a no-op
+/// otherwise.
+static PyObject* ArrowStreamingIterator_cancel(
+    ArrowStreamingIteratorObject* self, PyObject* Py_UNUSED(args)) {
+    if (self->cpp_state && self->cpp_state->cancel) {
+        self->cpp_state->cancel();
+    }
+    Py_RETURN_NONE;
+}
+
+static PyMethodDef ArrowStreamingIterator_methods[] = {
+    {"cancel", (PyCFunction)ArrowStreamingIterator_cancel, METH_NOARGS,
+     "Cancel the streaming iterator."},
+    {NULL, NULL, 0, NULL}};
+
+PyTypeObject ArrowStreamingIteratorType = {
+    PyVarObject_HEAD_INIT(NULL, 0) "dftracer_utils_ext._ArrowStreamingIterator",
+    sizeof(ArrowStreamingIteratorObject),       /* tp_basicsize */
+    0,                                          /* tp_itemsize */
+    (destructor)ArrowStreamingIterator_dealloc, /* tp_dealloc */
+    0,                                          /* tp_vectorcall_offset */
+    0,                                          /* tp_getattr */
+    0,                                          /* tp_setattr */
+    0,                                          /* tp_as_async */
+    0,                                          /* tp_repr */
+    0,                                          /* tp_as_number */
+    0,                                          /* tp_as_sequence */
+    0,                                          /* tp_as_mapping */
+    0,                                          /* tp_hash */
+    0,                                          /* tp_call */
+    0,                                          /* tp_str */
+    0,                                          /* tp_getattro */
+    0,                                          /* tp_setattro */
+    0,                                          /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT,                         /* tp_flags */
+    "Streaming Arrow batch iterator.\n\n"
+    "Yields ArrowBatch objects as they become available from the C++ "
+    "pipeline.\n"
+    "Call cancel() to stop the stream early.",  /* tp_doc */
+    0,                                          /* tp_traverse */
+    0,                                          /* tp_clear */
+    0,                                          /* tp_richcompare */
+    0,                                          /* tp_weaklistoffset */
+    ArrowStreamingIterator_iter,                /* tp_iter */
+    (iternextfunc)ArrowStreamingIterator_next,  /* tp_iternext */
+    ArrowStreamingIterator_methods,             /* tp_methods */
+    0,                                          /* tp_members */
+    0,                                          /* tp_getset */
+    0,                                          /* tp_base */
+    0,                                          /* tp_dict */
+    0,                                          /* tp_descr_get */
+    0,                                          /* tp_descr_set */
+    0,                                          /* tp_dictoffset */
+    0,                                          /* tp_init */
+    0,                                          /* tp_alloc */
+    ArrowStreamingIterator_new,                 /* tp_new */
+};
+
+/// Ready ArrowStreamingIteratorType and expose it on module `m` as
+/// "_ArrowStreamingIterator". Returns 0 on success, -1 on failure.
+int init_arrow_streaming_iterator(PyObject* m) {
+    if (PyType_Ready(&ArrowStreamingIteratorType) < 0) return -1;
+
+    Py_INCREF(&ArrowStreamingIteratorType);
+    if (PyModule_AddObject(m, "_ArrowStreamingIterator",
+                           (PyObject*)&ArrowStreamingIteratorType) < 0) {
+        Py_DECREF(&ArrowStreamingIteratorType);
+        return -1;
+    }
+
+    return 0;
+}
+
+}  // namespace dftracer::utils::python
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW
diff --git a/src/dftracer/utils/python/streaming_iterator.h b/src/dftracer/utils/python/streaming_iterator.h
new file mode 100644
index 00000000..cca32b13
--- /dev/null
+++ b/src/dftracer/utils/python/streaming_iterator.h
@@ -0,0 +1,166 @@
+#ifndef DFTRACER_UTILS_PYTHON_STREAMING_ITERATOR_H
+#define DFTRACER_UTILS_PYTHON_STREAMING_ITERATOR_H
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+#include
+#endif
+
+namespace dftracer::utils::python {
+
+/// Generic streaming state for bridging C++ async producers to Python sync
+/// consumers.
+///
+/// Producer (C++ coroutine on Runtime executor):
+///   - Calls push() to enqueue items
+///   - Calls complete() when done
+///   - Calls fail() on error
+///
+/// Consumer (Python tp_iternext):
+///   - Calls pull() which blocks (with GIL released) until item available
+///   - Returns std::nullopt on completion or error
+///
+/// Either side may call cancel(): it wakes every waiter, makes push() return
+/// false, and makes pull() return std::nullopt once the queue is drained.
+template <typename ItemT>
+class StreamingState {
+   public:
+    /// @param memory_budget_bytes Soft cap on the bytes held in the queue;
+    ///        push() blocks while the queued total is at or above it.
+    explicit StreamingState(std::size_t memory_budget_bytes)
+        : memory_budget_bytes_(memory_budget_bytes) {}
+
+    /// Enqueue `item`, blocking while the queue is over budget.
+    ///
+    /// @param item       Value handed to the consumer.
+    /// @param item_bytes Size charged against the budget for this item.
+    /// @return true once the item is queued; false if the stream was
+    ///         cancelled, in which case the item is dropped.
+    bool push(ItemT item, std::size_t item_bytes) {
+        std::unique_lock<std::mutex> lock(mtx_);
+        cv_producer_.wait(lock, [this] {
+            // An empty queue always admits one item; without this a budget
+            // of 0 would block the producer forever.
+            return queue_.empty() ||
+                   bytes_in_queue_.load(std::memory_order_acquire) <
+                       memory_budget_bytes_ ||
+                   cancelled_.load(std::memory_order_acquire);
+        });
+        if (cancelled_.load(std::memory_order_acquire)) {
+            return false;
+        }
+        bytes_in_queue_.fetch_add(item_bytes, std::memory_order_acq_rel);
+        queue_.push({std::move(item), item_bytes});
+        lock.unlock();
+        cv_consumer_.notify_one();
+        return true;
+    }
+
+    /// Mark the stream as finished and wake waiting consumers.
+    void complete() {
+        {
+            std::lock_guard<std::mutex> lock(mtx_);
+            done_.store(true, std::memory_order_release);
+        }
+        cv_consumer_.notify_all();
+    }
+
+    /// Record `ex` as the stream's error, mark the stream as finished, and
+    /// wake waiting consumers.
+    void fail(std::exception_ptr ex) {
+        {
+            std::lock_guard<std::mutex> lock(mtx_);
+            error_ = std::move(ex);
+            done_.store(true, std::memory_order_release);
+        }
+        cv_consumer_.notify_all();
+    }
+
+    /// Request cancellation and wake every blocked producer and consumer.
+    void cancel() {
+        {
+            // The flag is part of both wait predicates, so it has to change
+            // under the mutex even though it is atomic: otherwise a waiter
+            // that has just evaluated its predicate but not yet blocked
+            // misses the notification below and sleeps forever.
+            std::lock_guard<std::mutex> lock(mtx_);
+            cancelled_.store(true, std::memory_order_release);
+        }
+        cv_producer_.notify_all();
+        cv_consumer_.notify_all();
+    }
+
+    /// Dequeue the next item, blocking until one is available or the stream
+    /// has ended.
+    ///
+    /// @return the item, or std::nullopt when nothing is queued and the
+    ///         stream was completed, failed, or cancelled. Call error() to
+    ///         tell a failure apart from normal completion.
+    std::optional<ItemT> pull() {
+        std::unique_lock<std::mutex> lock(mtx_);
+        cv_consumer_.wait(lock, [this] {
+            return !queue_.empty() ||
+                   cancelled_.load(std::memory_order_acquire) ||
+                   done_.load(std::memory_order_acquire);
+        });
+
+        // Woken with nothing queued: the stream is finished or cancelled.
+        if (queue_.empty()) {
+            return std::nullopt;
+        }
+
+        auto [item, size] = std::move(queue_.front());
+        queue_.pop();
+        bytes_in_queue_.fetch_sub(size, std::memory_order_acq_rel);
+        lock.unlock();
+        cv_producer_.notify_one();
+        return std::move(item);
+    }
+
+    /// Exception stored by fail(), or a null exception_ptr if there is none.
+    std::exception_ptr error() const {
+        // fail() can still run on the producer side after the consumer has
+        // returned from pull() because of a cancel, so this read is locked.
+        std::lock_guard<std::mutex> lock(mtx_);
+        return error_;
+    }
+
+    /// True once cancel() has been called.
+    bool cancelled() const {
+        return cancelled_.load(std::memory_order_acquire);
+    }
+
+    /// True once complete() or fail() has been called.
+    bool done() const { return done_.load(std::memory_order_acquire); }
+
+    /// Store the future of the producer task.
+    /// NOTE(review): the future is not read anywhere in this class;
+    /// presumably it is kept so the task outlives the iterator - confirm.
+    void set_task_future(std::shared_future<void> future) {
+        task_future_ = std::move(future);
+    }
+
+   private:
+    /// A queued item together with the byte count charged for it.
+    struct QueueEntry {
+        ItemT item;
+        std::size_t size;
+    };
+    std::queue<QueueEntry> queue_;
+    mutable std::mutex mtx_;  // mutable so the const error() can lock it
+    std::condition_variable cv_producer_;  // signalled when space frees up
+    std::condition_variable cv_consumer_;  // signalled on item / end / cancel
+    std::exception_ptr error_;
+    std::atomic<bool> cancelled_{false};
+    std::atomic<bool> done_{false};
+    std::size_t memory_budget_bytes_;
+    std::atomic<std::size_t> bytes_in_queue_{0};
+    // NOTE(review): result type assumed to be void - confirm.
+    std::shared_future<void> task_future_;
+};
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+
+using utilities::common::arrow::ArrowExportResult;
+
+/// Internal C++ state for ArrowStreamingIterator.
+/// Stored as a pointer to avoid C++ object layout issues with Python.
+struct ArrowStreamingIteratorState {
+    /// Keeps the producer-side state alive for as long as the iterator.
+    /// NOTE(review): element type assumed to be void (type-erased owner) -
+    /// confirm.
+    std::shared_ptr<void> state;
+    /// Blocks for the next batch; std::nullopt ends the iteration.
+    std::function<std::optional<ArrowExportResult>()> pull_next;
+    /// Returns the producer's stored exception, or null if there is none.
+    std::function<std::exception_ptr()> get_error;
+    /// Invoked by cancel() and on deallocation.
+    std::function<void()> cancel;
+};
+
+/// Type-erased Arrow streaming iterator for Python.
+///
+/// This allows different producer types (AggregationBatch, ArrowExportResult,
+/// etc.) to share the same Python iterator mechanics.
+struct ArrowStreamingIteratorObject {
+    PyObject_HEAD
+
+    /// Pointer to C++ state (owned, allocated with new).
+    ArrowStreamingIteratorState* cpp_state;
+};
+
+extern PyTypeObject ArrowStreamingIteratorType;
+
+/// Initialize the ArrowStreamingIteratorType.
+int init_arrow_streaming_iterator(PyObject* m); + +#endif // DFTRACER_UTILS_ENABLE_ARROW + +} // namespace dftracer::utils::python + +#endif // DFTRACER_UTILS_PYTHON_STREAMING_ITERATOR_H diff --git a/src/dftracer/utils/python/trace_reader.cpp b/src/dftracer/utils/python/trace_reader.cpp index f50c0e33..dce2288f 100644 --- a/src/dftracer/utils/python/trace_reader.cpp +++ b/src/dftracer/utils/python/trace_reader.cpp @@ -1,116 +1,566 @@ #define PY_SSIZE_T_CLEAN #include +#include +#include +#include +#include #include +#include +#include #include #include +#include +#include #include #include #include +#include +#include +#include +#include +#include +#include #include #include #include -#include #include #include #include #include #include #include +#include #include - #ifdef DFTRACER_UTILS_ENABLE_ARROW +#include #include -#include +#include +#endif +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC +#include +#include +#include +#include +#include +#include +#include #endif namespace { +using dftracer::utils::CoroScope; using dftracer::utils::Runtime; using dftracer::utils::coro::CoroTask; +using dftracer::utils::coro::when_all; +using dftracer::utils::utilities::filesystem::PatternDirectoryScannerUtility; +using dftracer::utils::utilities::filesystem:: + PatternDirectoryScannerUtilityInput; using dftracer::utils::utilities::reader::ReadConfig; using dftracer::utils::utilities::reader::TraceReader; using dftracer::utils::utilities::reader::TraceReaderConfig; +#ifdef DFTRACER_UTILS_ENABLE_ARROW +using dftracer::utils::utilities::common::arrow::ColumnType; +using dftracer::utils::utilities::common::arrow::RecordBatchBuilder; +using dftracer::utils::utilities::common::json::JsonParser; +using dftracer::utils::utilities::common::json::JsonValueHelper; +#endif +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC +using dftracer::utils::utilities::common::arrow::IpcCompression; +using dftracer::utils::utilities::common::arrow::PartitionWriter; +using 
dftracer::utils::utilities::common::arrow::PartitionWriteStats; +using dftracer::utils::utilities::composites::dft::MetadataCollectorUtility; +using dftracer::utils::utilities::composites::dft:: + MetadataCollectorUtilityInput; +using dftracer::utils::utilities::composites::dft::views::ViewBuilderInput; +using dftracer::utils::utilities::composites::dft::views::ViewBuilderUtility; +using dftracer::utils::utilities::composites::dft::views::ViewDefinition; +using dftracer::utils::utilities::composites::dft::views::ViewReaderInput; +using dftracer::utils::utilities::composites::dft::views::ViewReaderUtility; +#endif -int64_t json_to_int64(yyjson_val *value) { - if (yyjson_is_int(value)) return yyjson_get_sint(value); - return static_cast(yyjson_get_uint(value)); -} +using dftracer::utils::python::MemoryViewBatchData; +using dftracer::utils::python::MemoryViewBatchIteratorState; -CoroTask produce_lines(std::shared_ptr state, - TraceReaderConfig cfg, ReadConfig rc) { - auto *sp = state.get(); +CoroTask produce_lines_batched( + std::shared_ptr state, + dftracer::utils::coro::ChannelProducer producer, + TraceReaderConfig cfg, ReadConfig rc, std::size_t batch_size) { + auto guard = producer.guard(); try { TraceReader reader(std::move(cfg)); auto gen = reader.read_lines(rc); + MemoryViewBatchData batch; + std::size_t count = 0; + while (auto opt = co_await gen.next()) { - if (sp->cancelled.load(std::memory_order_acquire)) break; - std::string item(opt->content); - { - std::unique_lock lock(sp->mtx); - sp->cv_producer.wait(lock, [sp] { - return sp->queue.size() < sp->max_queue_size || - sp->cancelled.load(std::memory_order_acquire); - }); - if (sp->cancelled.load(std::memory_order_acquire)) break; - sp->queue.push(std::move(item)); + if (state->cancelled.load(std::memory_order_acquire)) break; + auto sv = opt->content; + Py_ssize_t offset = static_cast(batch.buffer.size()); + batch.buffer.insert(batch.buffer.end(), sv.begin(), sv.end()); + batch.offsets.push_back(offset); + 
batch.lengths.push_back(static_cast(sv.size())); + ++count; + + if (count >= batch_size) { + auto batch_bytes = dftracer::utils::python::byte_size(batch); + state->bytes_in_queue.fetch_add(batch_bytes, + std::memory_order_acq_rel); + if (!co_await producer.send(std::move(batch))) break; + batch = MemoryViewBatchData{}; + count = 0; } - sp->cv_consumer.notify_one(); + } + if (count > 0 && !state->cancelled.load(std::memory_order_acquire)) { + auto batch_bytes = dftracer::utils::python::byte_size(batch); + state->bytes_in_queue.fetch_add(batch_bytes, + std::memory_order_acq_rel); + co_await producer.send(std::move(batch)); } } catch (...) { - std::lock_guard lock(sp->mtx); - sp->error = std::current_exception(); - sp->queue.push(std::nullopt); - sp->done.store(true, std::memory_order_release); - sp->cv_consumer.notify_one(); - co_return; - } - { - std::lock_guard lock(sp->mtx); - sp->queue.push(std::nullopt); - sp->done.store(true, std::memory_order_release); + state->set_error(std::current_exception()); } - sp->cv_consumer.notify_one(); } -CoroTask produce_raw(std::shared_ptr state, - TraceReaderConfig cfg, ReadConfig rc) { - auto *sp = state.get(); +CoroTask produce_raw_batched( + std::shared_ptr state, + dftracer::utils::coro::ChannelProducer producer, + TraceReaderConfig cfg, ReadConfig rc) { + auto guard = producer.guard(); try { TraceReader reader(std::move(cfg)); auto gen = reader.read_raw(rc); while (auto opt = co_await gen.next()) { - if (sp->cancelled.load(std::memory_order_acquire)) break; - std::string item(opt->data(), opt->size()); - { - std::unique_lock lock(sp->mtx); - sp->cv_producer.wait(lock, [sp] { - return sp->queue.size() < sp->max_queue_size || - sp->cancelled.load(std::memory_order_acquire); - }); - if (sp->cancelled.load(std::memory_order_acquire)) break; - sp->queue.push(std::move(item)); + if (state->cancelled.load(std::memory_order_acquire)) break; + MemoryViewBatchData batch; + batch.buffer.assign(opt->data(), opt->data() + opt->size()); 
+ batch.offsets.push_back(0); + batch.lengths.push_back(static_cast(opt->size())); + auto batch_bytes = dftracer::utils::python::byte_size(batch); + state->bytes_in_queue.fetch_add(batch_bytes, + std::memory_order_acq_rel); + if (!co_await producer.send(std::move(batch))) break; + } + } catch (...) { + state->set_error(std::current_exception()); + } +} + +using dftracer::utils::utilities::common::json::JsonParser; +using dftracer::utils::utilities::common::json::JsonValueHelper; + +static constexpr std::size_t ESTIMATED_BYTES_PER_LINE = 256; +static constexpr std::size_t ESTIMATED_BYTES_PER_RAW_CHUNK = 4 * 1024 * 1024; +static constexpr std::size_t ESTIMATED_BYTES_PER_JSON_EVENT = 512; +static constexpr std::size_t ESTIMATED_BYTES_PER_ARROW_ROW = 1024; + +static void insert_simdjson_value(ArgsMap &map, std::string_view key, + simdjson::ondemand::value val) { + auto type = val.type(); + if (type.error()) return; + switch (type.value_unsafe()) { + case simdjson::ondemand::json_type::string: { + auto r = val.get_string(); + if (!r.error()) map.insert(key, std::string(r.value_unsafe())); + break; + } + case simdjson::ondemand::json_type::number: { + auto ri = val.get_int64(); + if (!ri.error()) { + auto v = ri.value_unsafe(); + if (v >= 0) + map.insert(key, static_cast(v)); + else + map.insert(key, v); + } else { + auto rd = val.get_double(); + if (!rd.error()) map.insert(key, rd.value_unsafe()); + } + break; + } + case simdjson::ondemand::json_type::boolean: { + auto r = val.get_bool(); + if (!r.error()) map.insert(key, r.value_unsafe()); + break; + } + default: + break; + } +} + +static void parse_json_to_event(JsonParser &parser, JsonDictEvent &ev) { + ev.top.set_valid(true); + parser.for_each_field( + [&](std::string_view key, simdjson::ondemand::value val) { + if (key == "args") { + auto obj = val.get_object(); + if (!obj.error()) { + ev.args.set_valid(true); + for (auto field : obj.value_unsafe()) { + if (field.error()) continue; + auto fkey = 
field.unescaped_key(); + if (fkey.error()) continue; + auto fval = field.value(); + if (fval.error()) continue; + insert_simdjson_value(ev.args, fkey.value_unsafe(), + fval.value_unsafe()); + } + } + } else { + insert_simdjson_value(ev.top, key, val); } - sp->cv_consumer.notify_one(); + }); +} + +CoroTask produce_json_dicts( + std::shared_ptr state, + dftracer::utils::coro::ChannelProducer producer, + TraceReaderConfig cfg, ReadConfig rc, std::size_t batch_size) { + auto guard = producer.guard(); + try { + TraceReader reader(std::move(cfg)); + auto gen = reader.read_json(rc); + JsonDictBatch batch; + batch.events.reserve(batch_size); + + while (auto opt = co_await gen.next()) { + if (state->cancelled.load(std::memory_order_acquire)) break; + + JsonDictEvent ev; + parse_json_to_event(*opt->parser, ev); + batch.events.push_back(std::move(ev)); + + if (batch.events.size() >= batch_size) { + auto batch_bytes = dftracer::utils::python::byte_size(batch); + state->bytes_in_queue.fetch_add(batch_bytes, + std::memory_order_acq_rel); + if (!co_await producer.send(std::move(batch))) break; + batch = JsonDictBatch{}; + batch.events.reserve(batch_size); + } + } + if (!batch.events.empty() && + !state->cancelled.load(std::memory_order_acquire)) { + auto batch_bytes = dftracer::utils::python::byte_size(batch); + state->bytes_in_queue.fetch_add(batch_bytes, + std::memory_order_acq_rel); + co_await producer.send(std::move(batch)); } } catch (...) 
{ - std::lock_guard lock(sp->mtx); - sp->error = std::current_exception(); - sp->queue.push(std::nullopt); - sp->done.store(true, std::memory_order_release); - sp->cv_consumer.notify_one(); - co_return; + state->set_error(std::current_exception()); + } +} + +static CoroTask send_files_to_channel( + std::shared_ptr> file_chan, + const std::vector *files, std::atomic *cancelled) { + for (const auto &fp : *files) { + if (cancelled->load(std::memory_order_acquire)) break; + if (!co_await file_chan->send(fp)) break; + } + file_chan->close(); + co_return; +} + +static CoroTask json_dict_file_worker( + std::shared_ptr> file_chan, + dftracer::utils::coro::Channel *out_chan, + std::string index_dir, std::size_t checkpoint_size, bool auto_build_index, + ReadConfig rc, std::size_t batch_size, std::atomic *cancelled) { + dftracer::utils::coro::ChannelProducer producer(out_chan); + auto guard = producer.guard(); + + while (auto file_path = co_await file_chan->receive()) { + if (cancelled->load(std::memory_order_acquire)) co_return; + TraceReaderConfig cfg; + cfg.file_path = std::move(*file_path); + cfg.index_dir = index_dir; + cfg.checkpoint_size = checkpoint_size; + cfg.auto_build_index = auto_build_index; + + TraceReader reader(std::move(cfg)); + auto gen = reader.read_json(rc); + JsonDictBatch batch; + batch.events.reserve(batch_size); + + while (auto opt = co_await gen.next()) { + if (cancelled->load(std::memory_order_acquire)) co_return; + JsonDictEvent ev; + parse_json_to_event(*opt->parser, ev); + batch.events.push_back(std::move(ev)); + if (batch.events.size() >= batch_size) { + if (!co_await producer.send(std::move(batch))) co_return; + batch = JsonDictBatch{}; + batch.events.reserve(batch_size); + } + } + if (!batch.events.empty()) { + if (!co_await producer.send(std::move(batch))) co_return; + } + } + co_return; +} + +static CoroTask spawn_json_dict_producers( + CoroScope &child, dftracer::utils::coro::Channel *out_chan, + const std::vector *files, const std::string 
*index_dir, + std::size_t checkpoint_size, bool auto_build_index, const ReadConfig *rc, + std::size_t batch_size, std::atomic *cancelled_ptr, + std::size_t max_workers) { + std::size_t num_workers = std::min(files->size(), max_workers); + auto file_chan = + dftracer::utils::coro::make_channel(num_workers); + + for (std::size_t i = 0; i < num_workers; ++i) { + child.spawn([out_chan, fc = file_chan, idx = *index_dir, + checkpoint_size, auto_build_index, r = *rc, batch_size, + cancelled_ptr](CoroScope &) { + return json_dict_file_worker(fc, out_chan, idx, checkpoint_size, + auto_build_index, r, batch_size, + cancelled_ptr); + }); + } + + child.spawn([fc = file_chan, files, cancelled_ptr](CoroScope &) { + return send_files_to_channel(fc, files, cancelled_ptr); + }); + co_return; +} + +static CoroTask produce_json_dicts_parallel( + CoroScope &scope, JsonDictIteratorState *sp, std::string dir_path, + std::string index_dir, std::size_t checkpoint_size, bool auto_build_index, + ReadConfig rc, std::size_t batch_size, std::size_t max_workers) { + try { + PatternDirectoryScannerUtility scanner; + auto scan_input = PatternDirectoryScannerUtilityInput( + dir_path, {".pfw", ".pfw.gz"}, true, false); + auto entries = co_await scope.spawn(scanner, scan_input); + + std::vector files; + files.reserve(entries.size()); + for (auto &e : entries) files.push_back(e.path.string()); + std::sort(files.begin(), files.end()); + + if (files.empty()) { + sp->channel->close(); + co_return; + } + + auto *chan_ptr = sp->channel.get(); + auto *cancelled_ptr = &sp->cancelled; + + co_await scope.scope([chan_ptr, &files, &index_dir, checkpoint_size, + auto_build_index, &rc, batch_size, cancelled_ptr, + max_workers](CoroScope &child) -> CoroTask { + co_await spawn_json_dict_producers( + child, chan_ptr, &files, &index_dir, checkpoint_size, + auto_build_index, &rc, batch_size, cancelled_ptr, max_workers); + }); + } catch (...) 
{ + sp->set_error(std::current_exception()); + } +} + +static CoroTask lines_file_worker( + std::shared_ptr> file_chan, + dftracer::utils::coro::Channel *out_chan, + std::string index_dir, std::size_t checkpoint_size, bool auto_build_index, + ReadConfig rc, std::size_t batch_size, std::atomic *cancelled) { + dftracer::utils::coro::ChannelProducer producer( + out_chan); + auto guard = producer.guard(); + + while (auto file_path = co_await file_chan->receive()) { + if (cancelled->load(std::memory_order_acquire)) co_return; + TraceReaderConfig cfg; + cfg.file_path = std::move(*file_path); + cfg.index_dir = index_dir; + cfg.checkpoint_size = checkpoint_size; + cfg.auto_build_index = auto_build_index; + + TraceReader reader(std::move(cfg)); + auto gen = reader.read_lines(rc); + MemoryViewBatchData batch; + std::size_t count = 0; + + while (auto opt = co_await gen.next()) { + if (cancelled->load(std::memory_order_acquire)) co_return; + auto sv = opt->content; + Py_ssize_t offset = static_cast(batch.buffer.size()); + batch.buffer.insert(batch.buffer.end(), sv.begin(), sv.end()); + batch.offsets.push_back(offset); + batch.lengths.push_back(static_cast(sv.size())); + ++count; + if (count >= batch_size) { + if (!co_await producer.send(std::move(batch))) co_return; + batch = MemoryViewBatchData{}; + count = 0; + } + } + if (count > 0) { + if (!co_await producer.send(std::move(batch))) co_return; + } + } + co_return; +} + +static CoroTask spawn_lines_producers( + CoroScope &child, + dftracer::utils::coro::Channel *out_chan, + const std::vector *files, const std::string *index_dir, + std::size_t checkpoint_size, bool auto_build_index, const ReadConfig *rc, + std::size_t batch_size, std::atomic *cancelled_ptr, + std::size_t max_workers) { + std::size_t num_workers = std::min(files->size(), max_workers); + auto file_chan = + dftracer::utils::coro::make_channel(num_workers); + + for (std::size_t i = 0; i < num_workers; ++i) { + child.spawn([out_chan, fc = file_chan, idx = 
*index_dir, + checkpoint_size, auto_build_index, r = *rc, batch_size, + cancelled_ptr](CoroScope &) { + return lines_file_worker(fc, out_chan, idx, checkpoint_size, + auto_build_index, r, batch_size, + cancelled_ptr); + }); + } + + child.spawn([fc = file_chan, files, cancelled_ptr](CoroScope &) { + return send_files_to_channel(fc, files, cancelled_ptr); + }); + co_return; +} + +static CoroTask produce_lines_parallel( + CoroScope &scope, MemoryViewBatchIteratorState *sp, std::string dir_path, + std::string index_dir, std::size_t checkpoint_size, bool auto_build_index, + ReadConfig rc, std::size_t batch_size, std::size_t max_workers) { + try { + PatternDirectoryScannerUtility scanner; + auto scan_input = PatternDirectoryScannerUtilityInput( + dir_path, {".pfw", ".pfw.gz"}, true, false); + auto entries = co_await scope.spawn(scanner, scan_input); + + std::vector files; + files.reserve(entries.size()); + for (auto &e : entries) files.push_back(e.path.string()); + std::sort(files.begin(), files.end()); + + if (files.empty()) { + sp->channel->close(); + co_return; + } + + auto *chan_ptr = sp->channel.get(); + auto *cancelled_ptr = &sp->cancelled; + + co_await scope.scope([chan_ptr, &files, &index_dir, checkpoint_size, + auto_build_index, &rc, batch_size, cancelled_ptr, + max_workers](CoroScope &child) -> CoroTask { + co_await spawn_lines_producers( + child, chan_ptr, &files, &index_dir, checkpoint_size, + auto_build_index, &rc, batch_size, cancelled_ptr, max_workers); + }); + } catch (...) 
{ + sp->set_error(std::current_exception()); } - { - std::lock_guard lock(sp->mtx); - sp->queue.push(std::nullopt); - sp->done.store(true, std::memory_order_release); +} + +static CoroTask raw_file_worker( + std::shared_ptr> file_chan, + dftracer::utils::coro::Channel *out_chan, + std::string index_dir, std::size_t checkpoint_size, bool auto_build_index, + ReadConfig rc, std::atomic *cancelled) { + dftracer::utils::coro::ChannelProducer producer( + out_chan); + auto guard = producer.guard(); + + while (auto file_path = co_await file_chan->receive()) { + if (cancelled->load(std::memory_order_acquire)) co_return; + TraceReaderConfig cfg; + cfg.file_path = std::move(*file_path); + cfg.index_dir = index_dir; + cfg.checkpoint_size = checkpoint_size; + cfg.auto_build_index = auto_build_index; + + TraceReader reader(std::move(cfg)); + auto gen = reader.read_raw(rc); + while (auto opt = co_await gen.next()) { + if (cancelled->load(std::memory_order_acquire)) co_return; + MemoryViewBatchData batch; + batch.buffer.assign(opt->data(), opt->data() + opt->size()); + batch.offsets.push_back(0); + batch.lengths.push_back(static_cast(opt->size())); + if (!co_await producer.send(std::move(batch))) co_return; + } + } + co_return; +} + +static CoroTask spawn_raw_producers( + CoroScope &child, + dftracer::utils::coro::Channel *out_chan, + const std::vector *files, const std::string *index_dir, + std::size_t checkpoint_size, bool auto_build_index, const ReadConfig *rc, + std::atomic *cancelled_ptr, std::size_t max_workers) { + std::size_t num_workers = std::min(files->size(), max_workers); + auto file_chan = + dftracer::utils::coro::make_channel(num_workers); + + for (std::size_t i = 0; i < num_workers; ++i) { + child.spawn([out_chan, fc = file_chan, idx = *index_dir, + checkpoint_size, auto_build_index, r = *rc, + cancelled_ptr](CoroScope &) { + return raw_file_worker(fc, out_chan, idx, checkpoint_size, + auto_build_index, r, cancelled_ptr); + }); + } + + child.spawn([fc = file_chan, 
files, cancelled_ptr](CoroScope &) { + return send_files_to_channel(fc, files, cancelled_ptr); + }); + co_return; +} + +static CoroTask produce_raw_parallel( + CoroScope &scope, MemoryViewBatchIteratorState *sp, std::string dir_path, + std::string index_dir, std::size_t checkpoint_size, bool auto_build_index, + ReadConfig rc, std::size_t max_workers) { + try { + PatternDirectoryScannerUtility scanner; + auto scan_input = PatternDirectoryScannerUtilityInput( + dir_path, {".pfw", ".pfw.gz"}, true, false); + auto entries = co_await scope.spawn(scanner, scan_input); + + std::vector files; + files.reserve(entries.size()); + for (auto &e : entries) files.push_back(e.path.string()); + std::sort(files.begin(), files.end()); + + if (files.empty()) { + sp->channel->close(); + co_return; + } + + auto *chan_ptr = sp->channel.get(); + auto *cancelled_ptr = &sp->cancelled; + + co_await scope.scope([chan_ptr, &files, &index_dir, checkpoint_size, + auto_build_index, &rc, cancelled_ptr, + max_workers](CoroScope &child) -> CoroTask { + co_await spawn_raw_producers(child, chan_ptr, &files, &index_dir, + checkpoint_size, auto_build_index, &rc, + cancelled_ptr, max_workers); + }); + } catch (...) { + sp->set_error(std::current_exception()); } - sp->cv_consumer.notify_one(); } #ifdef DFTRACER_UTILS_ENABLE_ARROW +using dftracer::utils::utilities::common::arrow::ArrowExportResult; using dftracer::utils::utilities::common::arrow::ColumnType; using dftracer::utils::utilities::common::arrow::RecordBatchBuilder; @@ -229,64 +679,65 @@ static bool str_contains_lower(std::string_view s, const char *needle) { return false; } -// Normalize a raw JSON row (already parsed into yyjson) into the semantic +// Normalize a raw JSON row (parsed with simdjson) into the semantic // output schema. Appends one row to `builder` with the full set of output // columns. Returns false if the row should be skipped (no valid name). 
static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena, - yyjson_val *root) { + JsonParser &parser) { + using SVH = JsonValueHelper; + // --- Extract top-level fields --- - yyjson_val *v_ph = yyjson_obj_get(root, "ph"); - yyjson_val *v_name = yyjson_obj_get(root, "name"); - yyjson_val *v_cat = yyjson_obj_get(root, "cat"); - yyjson_val *v_pid = yyjson_obj_get(root, "pid"); - yyjson_val *v_tid = yyjson_obj_get(root, "tid"); - yyjson_val *v_ts = yyjson_obj_get(root, "ts"); - yyjson_val *v_dur = yyjson_obj_get(root, "dur"); - yyjson_val *v_args = yyjson_obj_get(root, "args"); - - std::string_view ph = - v_ph && yyjson_is_str(v_ph) - ? std::string_view(yyjson_get_str(v_ph), yyjson_get_len(v_ph)) - : std::string_view(); - std::string_view name_sv = - v_name && yyjson_is_str(v_name) - ? std::string_view(yyjson_get_str(v_name), yyjson_get_len(v_name)) - : std::string_view(); - std::string_view cat_sv = - v_cat && yyjson_is_str(v_cat) - ? std::string_view(yyjson_get_str(v_cat), yyjson_get_len(v_cat)) - : std::string_view(); - - // Helper to get args fields - auto args_str = [&](const char *key) -> std::string_view { - if (!v_args) return {}; - yyjson_val *v = yyjson_obj_get(v_args, key); - if (!v) return {}; - if (yyjson_is_str(v)) return {yyjson_get_str(v), yyjson_get_len(v)}; - return {}; - }; - auto args_int = [&](const char *key) -> std::pair { - if (!v_args) return {false, 0}; - yyjson_val *v = yyjson_obj_get(v_args, key); - if (!v) return {false, 0}; - if (yyjson_is_int(v)) return {true, yyjson_get_sint(v)}; - if (yyjson_is_uint(v)) - return {true, static_cast(yyjson_get_uint(v))}; - if (yyjson_is_real(v)) - return {true, static_cast(yyjson_get_real(v))}; - return {false, 0}; - }; - auto args_float = [&](const char *key) -> std::pair { - if (!v_args) return {false, 0.0}; - yyjson_val *v = yyjson_obj_get(v_args, key); - if (!v) return {false, 0.0}; - if (yyjson_is_real(v)) return {true, yyjson_get_real(v)}; - if (yyjson_is_int(v)) - return {true, 
static_cast(yyjson_get_sint(v))}; - if (yyjson_is_uint(v)) - return {true, static_cast(yyjson_get_uint(v))}; - return {false, 0.0}; - }; + auto ph = parser.get_string("ph").value_or(std::string_view{}); + auto name_sv = parser.get_string("name").value_or(std::string_view{}); + auto cat_sv = parser.get_string("cat").value_or(std::string_view{}); + auto pid_opt = parser.get_int64("pid"); + auto tid_opt = parser.get_int64("tid"); + auto ts_opt = parser.get_int64("ts"); + auto dur_opt = parser.get_int64("dur"); + + // Helper lambdas to access args fields (need to rewind after each access) + // We'll do a single pass over args instead + std::optional args_name, args_value, args_hhash, + args_fhash; + std::optional args_epoch, args_step, args_size_sum, args_ret; + std::optional args_offset, args_image_idx, args_image_size; + std::unordered_map args_int_map; + std::unordered_map args_float_map; + + parser.rewind(); + parser.for_each_field( + "args", [&](std::string_view key, simdjson::ondemand::value val) { + if (key == "name") { + if (auto s = SVH::get_string(val)) args_name = s; + } else if (key == "value") { + if (auto s = SVH::get_string(val)) args_value = s; + } else if (key == "hhash") { + if (auto s = SVH::get_string(val)) args_hhash = s; + } else if (key == "fhash") { + if (auto s = SVH::get_string(val)) args_fhash = s; + } else if (key == "epoch") { + if (auto i = SVH::get_int64(val)) args_epoch = i; + } else if (key == "step") { + if (auto i = SVH::get_int64(val)) args_step = i; + } else if (key == "size_sum") { + if (auto i = SVH::get_int64(val)) args_size_sum = i; + } else if (key == "ret") { + if (auto i = SVH::get_int64(val)) args_ret = i; + } else if (key == "offset") { + if (auto i = SVH::get_int64(val)) args_offset = i; + } else if (key == "image_idx") { + if (auto i = SVH::get_int64(val)) args_image_idx = i; + } else if (key == "image_size") { + if (auto i = SVH::get_int64(val)) args_image_size = i; + } else { + // Store other int/float args for 
profile/sys columns + if (auto i = SVH::get_int64(val)) { + args_int_map[std::string(key)] = *i; + } else if (auto d = SVH::get_double(val)) { + args_float_map[std::string(key)] = *d; + } + } + }); // --- Type classification --- bool is_M = (ph == "M"); @@ -315,17 +766,12 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena, // Name: metadata rows use args.name if available std::string_view out_name = name_sv; - if (is_M) { - auto an = args_str("name"); - if (!an.empty()) out_name = an; + if (is_M && args_name && !args_name->empty()) { + out_name = *args_name; } if (out_name.empty()) return false; // skip rows without name - // --- Declare all output columns (lazy — add_or_get_column handles - // first-time creation) --- We use a fixed schema so column indices are - // stable across rows. The builder backfills nulls for columns not touched - // via end_row(). - + // --- Declare all output columns --- auto ci_type = builder.add_or_get_column("type", ColumnType::INT64); auto ci_cat = builder.add_or_get_column("cat", ColumnType::STRING); auto ci_name = builder.add_or_get_column("name", ColumnType::STRING); @@ -342,7 +788,8 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena, auto ci_ts = builder.add_or_get_column("ts", ColumnType::INT64); auto ci_dur = builder.add_or_get_column("dur", ColumnType::INT64); auto ci_te = builder.add_or_get_column("te", ColumnType::INT64); - auto ci_trange = builder.add_or_get_column("trange", ColumnType::INT64); + [[maybe_unused]] auto ci_trange = + builder.add_or_get_column("trange", ColumnType::INT64); auto ci_io_cat = builder.add_or_get_column("io_cat", ColumnType::INT64); auto ci_size = builder.add_or_get_column("size", ColumnType::INT64); auto ci_offset = builder.add_or_get_column("offset", ColumnType::INT64); @@ -351,7 +798,7 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena, // --- Populate core columns --- builder.append_int64(ci_type, row_type); - // cat 
(lowercased) — write into arena + // cat (lowercased) - write into arena if (!cat_sv.empty()) { char lbuf[256]; std::size_t clen = std::min(cat_sv.size(), sizeof(lbuf)); @@ -365,42 +812,36 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena, builder.append_string(ci_name, out_name); - if (v_pid && (yyjson_is_int(v_pid) || yyjson_is_uint(v_pid))) - builder.append_int64(ci_pid, json_to_int64(v_pid)); - // else: null via end_row backfill - - if (v_tid && (yyjson_is_int(v_tid) || yyjson_is_uint(v_tid))) - builder.append_int64(ci_tid, json_to_int64(v_tid)); + if (pid_opt) builder.append_int64(ci_pid, *pid_opt); + if (tid_opt) builder.append_int64(ci_tid, *tid_opt); // hash / value - auto a_value = args_str("value"); - if (is_hash && !a_value.empty()) builder.append_string(ci_hash, a_value); - if (row_type == ROW_METADATA && !a_value.empty()) - builder.append_string(ci_value, a_value); + if (is_hash && args_value && !args_value->empty()) + builder.append_string(ci_hash, *args_value); + if (row_type == ROW_METADATA && args_value && !args_value->empty()) + builder.append_string(ci_value, *args_value); // host_hash / file_hash - auto a_hhash = args_str("hhash"); - if (!a_hhash.empty()) builder.append_string(ci_host_hash, a_hhash); - auto a_fhash = args_str("fhash"); - if (!a_fhash.empty()) builder.append_string(ci_file_hash, a_fhash); + if (args_hhash && !args_hhash->empty()) + builder.append_string(ci_host_hash, *args_hhash); + if (args_fhash && !args_fhash->empty()) + builder.append_string(ci_file_hash, *args_fhash); // epoch / step - auto [has_epoch, epoch_v] = args_int("epoch"); - if (has_epoch && epoch_v >= 0) builder.append_int64(ci_epoch, epoch_v); - auto [has_step, step_v] = args_int("step"); - if (has_step && step_v >= 0) builder.append_int64(ci_step, step_v); + if (args_epoch && *args_epoch >= 0) + builder.append_int64(ci_epoch, *args_epoch); + if (args_step && *args_step >= 0) builder.append_int64(ci_step, *args_step); // --- Temporal --- 
- bool has_ts = (is_event || is_C) && v_ts && - (yyjson_is_int(v_ts) || yyjson_is_uint(v_ts)); - bool has_dur = v_dur && (yyjson_is_int(v_dur) || yyjson_is_uint(v_dur)); + bool has_ts = (is_event || is_C) && ts_opt.has_value(); + bool has_dur = dur_opt.has_value(); int64_t ts_val = 0, dur_val = 0; if (has_ts) { - ts_val = json_to_int64(v_ts); + ts_val = *ts_opt; builder.append_int64(ci_ts, ts_val); } if (is_event && has_ts && has_dur) { - dur_val = json_to_int64(v_dur); + dur_val = *dur_opt; builder.append_int64(ci_dur, dur_val); builder.append_int64(ci_te, ts_val + dur_val); } @@ -412,26 +853,22 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena, int8_t io_cat = IO_OTHER; // size priority: size_sum > POSIX ret > image_size - auto [has_ss, ss_val] = args_int("size_sum"); - if (has_ss) { - builder.append_int64(ci_size, ss_val); - if (is_posix_stdio) io_cat = get_io_cat(name_sv); + if (args_size_sum) { + builder.append_int64(ci_size, *args_size_sum); + if (is_posix_stdio) io_cat = get_io_cat(out_name); } else if (is_posix_stdio) { - io_cat = get_io_cat(name_sv); - auto [has_ret, ret_val] = args_int("ret"); - if (has_ret && ret_val > 0 && + io_cat = get_io_cat(out_name); + if (args_ret && *args_ret > 0 && (io_cat == IO_READ || io_cat == IO_WRITE)) - builder.append_int64(ci_size, ret_val); - auto [has_ofs, ofs_val] = args_int("offset"); - if (has_ofs && ofs_val >= 0) - builder.append_int64(ci_offset, ofs_val); + builder.append_int64(ci_size, *args_ret); + if (args_offset && *args_offset >= 0) + builder.append_int64(ci_offset, *args_offset); } else { - auto [has_img, img_val] = args_int("image_idx"); - if (has_img && img_val > 0) - builder.append_int64(ci_image_id, img_val); - auto [has_ims, ims_val] = args_int("image_size"); - if (has_ims && ims_val > 0 && !str_contains_lower(name_sv, "open")) - builder.append_int64(ci_size, ims_val); + if (args_image_idx && *args_image_idx > 0) + builder.append_int64(ci_image_id, *args_image_idx); + if 
(args_image_size && *args_image_size > 0 && + !str_contains_lower(out_name, "open")) + builder.append_int64(ci_size, *args_image_size); } builder.append_int64(ci_io_cat, io_cat); } @@ -440,7 +877,7 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena, if (is_profile) { bool is_posix_stdio = str_iequal(cat_sv, "posix") || str_iequal(cat_sv, "stdio"); - int8_t io_cat = is_posix_stdio ? get_io_cat(name_sv) : IO_OTHER; + int8_t io_cat = is_posix_stdio ? get_io_cat(out_name) : IO_OTHER; builder.append_int64(ci_io_cat, io_cat); static const char *profile_keys[] = { @@ -451,10 +888,10 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena, "ret_max", "ret_min", "ret_sum", "whence", "whence_max", "whence_min", "whence_sum", nullptr}; for (const char **pk = profile_keys; *pk; ++pk) { - auto [has_v, val] = args_int(*pk); - if (has_v) { + auto it = args_int_map.find(*pk); + if (it != args_int_map.end()) { auto idx = builder.add_or_get_column(*pk, ColumnType::INT64); - builder.append_int64(idx, val); + builder.append_int64(idx, it->second); } } } @@ -466,10 +903,10 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena, "irq_pct", "softirq_pct", "MemAvailable", "MemFree", "Cached", "Dirty", "Active", nullptr}; for (const char **sk = sys_keys; *sk; ++sk) { - auto [has_v, val] = args_float(*sk); - if (has_v) { + auto it = args_float_map.find(*sk); + if (it != args_float_map.end()) { auto idx = builder.add_or_get_column(*sk, ColumnType::DOUBLE); - builder.append_double(idx, val); + builder.append_double(idx, it->second); } } } @@ -478,276 +915,1321 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena, return true; } -// Flatten a yyjson object into "prefix.key" columns using native types. +// Flatten a simdjson object into "prefix.key" columns using native types. // On type mismatch (same key, different type across rows), appends null. 
static void flatten_object_into(RecordBatchBuilder &builder, StringArena &arena, - std::string_view prefix, yyjson_val *obj) { + std::string_view prefix, + simdjson::ondemand::object obj) { + using SVH = JsonValueHelper; char key_buf[512]; - yyjson_obj_iter sub_iter; - yyjson_obj_iter_init(obj, &sub_iter); - yyjson_val *sub_key; - while ((sub_key = yyjson_obj_iter_next(&sub_iter))) { - yyjson_val *sub_val = yyjson_obj_iter_get_val(sub_key); - const char *sk_str = yyjson_get_str(sub_key); - std::size_t sk_len = yyjson_get_len(sub_key); + for (auto field : obj) { + if (field.error()) continue; - std::size_t needed = prefix.size() + 1 + sk_len; + auto key_result = field.unescaped_key(); + if (key_result.error()) continue; + std::string_view sk = key_result.value_unsafe(); + + auto val_result = field.value(); + if (val_result.error()) continue; + auto sub_val = val_result.value_unsafe(); + + std::size_t needed = prefix.size() + 1 + sk.size(); if (needed >= sizeof(key_buf)) continue; std::memcpy(key_buf, prefix.data(), prefix.size()); key_buf[prefix.size()] = '.'; - std::memcpy(key_buf + prefix.size() + 1, sk_str, sk_len); + std::memcpy(key_buf + prefix.size() + 1, sk.data(), sk.size()); std::string_view full_key(key_buf, needed); - if (yyjson_is_int(sub_val)) { - auto idx = builder.add_or_get_column(full_key, ColumnType::INT64); - if (builder.column_type(idx) == ColumnType::INT64) - builder.append_int64(idx, yyjson_get_sint(sub_val)); - else - builder.append_null(idx); - } else if (yyjson_is_uint(sub_val)) { - auto idx = builder.add_or_get_column(full_key, ColumnType::UINT64); - if (builder.column_type(idx) == ColumnType::UINT64) - builder.append_uint64(idx, yyjson_get_uint(sub_val)); - else - builder.append_null(idx); - } else if (yyjson_is_real(sub_val)) { - auto idx = builder.add_or_get_column(full_key, ColumnType::DOUBLE); - if (builder.column_type(idx) == ColumnType::DOUBLE) - builder.append_double(idx, yyjson_get_real(sub_val)); - else - builder.append_null(idx); 
- } else if (yyjson_is_bool(sub_val)) { - auto idx = builder.add_or_get_column(full_key, ColumnType::BOOL); - if (builder.column_type(idx) == ColumnType::BOOL) - builder.append_bool(idx, yyjson_get_bool(sub_val)); - else - builder.append_null(idx); - } else if (yyjson_is_str(sub_val)) { - auto idx = builder.add_or_get_column(full_key, ColumnType::STRING); - if (builder.column_type(idx) == ColumnType::STRING) - builder.append_string( - idx, std::string_view(yyjson_get_str(sub_val), - yyjson_get_len(sub_val))); - else - builder.append_null(idx); - } else if (yyjson_is_null(sub_val)) { - auto existing = builder.find_column(full_key); - if (existing) builder.append_null(*existing); - } else { - // nested object/array: serialize - std::size_t json_len; - char *json_str = yyjson_val_write(sub_val, 0, &json_len); - auto idx = builder.add_or_get_column(full_key, ColumnType::STRING); - if (json_str) { - builder.append_string(idx, arena.push(json_str, json_len)); - free(json_str); - } else { - builder.append_null(idx); + auto type_result = sub_val.type(); + if (type_result.error()) continue; + auto json_type = type_result.value_unsafe(); + + switch (json_type) { + case simdjson::ondemand::json_type::number: { + auto num_result = sub_val.get_number(); + if (num_result.error()) break; + auto num = num_result.value_unsafe(); + if (num.is_int64()) { + auto idx = + builder.add_or_get_column(full_key, ColumnType::INT64); + if (builder.column_type(idx) == ColumnType::INT64) + builder.append_int64(idx, num.get_int64()); + else + builder.append_null(idx); + } else if (num.is_uint64()) { + auto idx = + builder.add_or_get_column(full_key, ColumnType::UINT64); + if (builder.column_type(idx) == ColumnType::UINT64) + builder.append_uint64(idx, num.get_uint64()); + else + builder.append_null(idx); + } else { + auto idx = + builder.add_or_get_column(full_key, ColumnType::DOUBLE); + if (builder.column_type(idx) == ColumnType::DOUBLE) + builder.append_double(idx, num.get_double()); + else + 
builder.append_null(idx); + } + break; + } + case simdjson::ondemand::json_type::string: { + auto str_result = sub_val.get_string(); + if (str_result.error()) break; + auto str = str_result.value_unsafe(); + auto idx = + builder.add_or_get_column(full_key, ColumnType::STRING); + if (builder.column_type(idx) == ColumnType::STRING) + builder.append_string(idx, str); + else + builder.append_null(idx); + break; } + case simdjson::ondemand::json_type::boolean: { + auto bool_result = sub_val.get_bool(); + if (bool_result.error()) break; + auto b = bool_result.value_unsafe(); + auto idx = + builder.add_or_get_column(full_key, ColumnType::BOOL); + if (builder.column_type(idx) == ColumnType::BOOL) + builder.append_bool(idx, b); + else + builder.append_null(idx); + break; + } + case simdjson::ondemand::json_type::null: { + auto existing = builder.find_column(full_key); + if (existing) builder.append_null(*existing); + break; + } + case simdjson::ondemand::json_type::object: + case simdjson::ondemand::json_type::array: { + // Serialize nested object/array to JSON string + auto json_str = SVH::to_json_string(sub_val); + auto idx = + builder.add_or_get_column(full_key, ColumnType::STRING); + if (json_str) { + builder.append_string( + idx, arena.push(json_str->data(), json_str->size())); + } else { + builder.append_null(idx); + } + break; + } + default: + break; } } } -CoroTask produce_arrow_batches(std::shared_ptr state, - TraceReaderConfig cfg, ReadConfig rc, - std::size_t batch_size, - bool flatten_objects = false, - bool normalize = false) { - auto *sp = state.get(); - try { - TraceReader reader(std::move(cfg)); - auto gen = reader.read_lines(rc); - RecordBatchBuilder builder; - builder.reserve(batch_size); +static bool build_arrow_row(RecordBatchBuilder &builder, JsonParser &parser, + StringArena &arena, bool normalize) { + if (normalize) return normalize_row(builder, arena, parser); + + using SVH = JsonValueHelper; + parser.for_each_field([&](std::string_view key_sv, + 
simdjson::ondemand::value val) { + auto type_result = val.type(); + if (type_result.error()) return; + auto json_type = type_result.value_unsafe(); + switch (json_type) { + case simdjson::ondemand::json_type::number: { + auto num_result = val.get_number(); + if (num_result.error()) break; + auto num = num_result.value_unsafe(); + if (num.is_int64()) { + std::size_t idx = + builder.add_or_get_column(key_sv, ColumnType::INT64); + builder.append_int64(idx, num.get_int64()); + } else if (num.is_uint64()) { + std::size_t idx = + builder.add_or_get_column(key_sv, ColumnType::UINT64); + builder.append_uint64(idx, num.get_uint64()); + } else { + std::size_t idx = + builder.add_or_get_column(key_sv, ColumnType::DOUBLE); + builder.append_double(idx, num.get_double()); + } + break; + } + case simdjson::ondemand::json_type::string: { + auto str_result = val.get_string(); + if (str_result.error()) break; + auto str = str_result.value_unsafe(); + std::size_t idx = + builder.add_or_get_column(key_sv, ColumnType::STRING); + builder.append_string(idx, str); + break; + } + case simdjson::ondemand::json_type::boolean: { + auto bool_result = val.get_bool(); + if (bool_result.error()) break; + auto b = bool_result.value_unsafe(); + std::size_t idx = + builder.add_or_get_column(key_sv, ColumnType::BOOL); + builder.append_bool(idx, b); + break; + } + case simdjson::ondemand::json_type::null: { + auto existing = builder.find_column(key_sv); + if (existing) builder.append_null(*existing); + break; + } + case simdjson::ondemand::json_type::object: + case simdjson::ondemand::json_type::array: { + auto json_str = SVH::to_json_string(val); + std::size_t idx = + builder.add_or_get_column(key_sv, ColumnType::STRING); + if (json_str) { + builder.append_string( + idx, arena.push(json_str->data(), json_str->size())); + } else { + builder.append_null(idx); + } + break; + } + default: + break; + } + }); + builder.end_row(); + return true; +} - std::vector held_docs; - StringArena arena; - 
held_docs.reserve(batch_size); +static bool process_json_line(RecordBatchBuilder &builder, JsonParser &parser, + StringArena &arena, std::string_view content, + bool normalize) { + const char *trimmed; + std::size_t trimmed_length; + if (!dftracer::utils::json_trim_and_validate_with_comma( + content.data(), content.size(), trimmed, trimmed_length)) + return false; + if (!parser.parse(std::string_view(trimmed, trimmed_length))) return false; + return build_arrow_row(builder, parser, arena, normalize); +} - while (auto opt = co_await gen.next()) { - if (sp->cancelled.load(std::memory_order_acquire)) break; +static CoroTask produce_arrow_for_file( + dftracer::utils::coro::Channel *chan, + std::string file_path, std::string index_dir, std::size_t checkpoint_size, + bool auto_build_index, ReadConfig rc, std::size_t batch_size, + bool normalize, std::atomic *cancelled) { + dftracer::utils::coro::ChannelProducer producer(chan); + auto guard = producer.guard(); - const char *trimmed; - std::size_t trimmed_length; - if (!dftracer::utils::json_trim_and_validate( - opt->content.data(), opt->content.size(), trimmed, - trimmed_length)) { - continue; - } + TraceReaderConfig cfg; + cfg.file_path = std::move(file_path); + cfg.index_dir = std::move(index_dir); + cfg.checkpoint_size = checkpoint_size; + cfg.auto_build_index = auto_build_index; + + TraceReader reader(std::move(cfg)); + + // Fast path: non-normalized Arrow build happens inside TraceReader. + // Normalize still goes through read_json + build_arrow_row for the + // richer schema derivation. 
+ if (!normalize) { + auto batch_gen = reader.read_arrow(rc, batch_size); + while (auto batch_opt = co_await batch_gen.next()) { + if (cancelled->load(std::memory_order_acquire)) co_return; + if (!co_await producer.send(std::move(*batch_opt))) co_return; + } + co_return; + } - yyjson_doc *doc = yyjson_read(trimmed, trimmed_length, 0); - if (!doc) continue; + auto gen = reader.read_json(rc); + RecordBatchBuilder builder; + builder.reserve(batch_size); + StringArena arena; - yyjson_val *root = yyjson_doc_get_root(doc); - if (!root || !yyjson_is_obj(root)) { - yyjson_doc_free(doc); - continue; + while (auto opt = co_await gen.next()) { + if (cancelled->load(std::memory_order_acquire)) co_return; + if (!build_arrow_row(builder, *opt->parser, arena, normalize)) continue; + if (builder.num_rows() >= batch_size) { + auto result = builder.finish(); + arena.clear(); + if (!co_await producer.send(std::move(result))) co_return; + if (!builder.is_schema_locked()) builder.lock_schema(); + builder.reset(true); + builder.reserve(batch_size); + } + } + if (builder.num_rows() > 0) { + co_await producer.send(builder.finish()); + } + co_return; +} + +static CoroTask file_worker( + std::shared_ptr> file_chan, + dftracer::utils::coro::Channel *out_chan, + std::string index_dir, std::size_t checkpoint_size, bool auto_build_index, + ReadConfig rc, std::size_t batch_size, bool normalize, + std::atomic *cancelled) { + dftracer::utils::coro::ChannelProducer producer( + out_chan); + auto guard = producer.guard(); + + while (auto file_path = co_await file_chan->receive()) { + if (cancelled->load(std::memory_order_acquire)) co_return; + TraceReaderConfig cfg; + cfg.file_path = std::move(*file_path); + cfg.index_dir = index_dir; + cfg.checkpoint_size = checkpoint_size; + cfg.auto_build_index = auto_build_index; + + TraceReader reader(std::move(cfg)); + + if (!normalize) { + auto batch_gen = reader.read_arrow(rc, batch_size); + while (auto batch_opt = co_await batch_gen.next()) { + if 
(cancelled->load(std::memory_order_acquire)) co_return; + if (!co_await producer.send(std::move(*batch_opt))) co_return; } + continue; + } - if (normalize) { - // Produce the semantic output schema directly. - // normalize_row calls end_row() internally. - if (!normalize_row(builder, arena, root)) { - yyjson_doc_free(doc); - continue; - } - held_docs.push_back(doc); - } else { - yyjson_obj_iter iter; - yyjson_obj_iter_init(root, &iter); - yyjson_val *key; - while ((key = yyjson_obj_iter_next(&iter))) { - yyjson_val *val = yyjson_obj_iter_get_val(key); - const char *key_str = yyjson_get_str(key); - std::size_t key_len = yyjson_get_len(key); - std::string_view key_sv(key_str, key_len); - - if (yyjson_is_int(val)) { - std::size_t idx = builder.add_or_get_column( - key_sv, ColumnType::INT64); - builder.append_int64(idx, yyjson_get_sint(val)); - } else if (yyjson_is_uint(val)) { - std::size_t idx = builder.add_or_get_column( - key_sv, ColumnType::UINT64); - builder.append_uint64(idx, yyjson_get_uint(val)); - } else if (yyjson_is_real(val)) { - std::size_t idx = builder.add_or_get_column( - key_sv, ColumnType::DOUBLE); - builder.append_double(idx, yyjson_get_real(val)); - } else if (yyjson_is_bool(val)) { - std::size_t idx = - builder.add_or_get_column(key_sv, ColumnType::BOOL); - builder.append_bool(idx, yyjson_get_bool(val)); - } else if (yyjson_is_str(val)) { - std::size_t idx = builder.add_or_get_column( - key_sv, ColumnType::STRING); - builder.append_string( - idx, std::string_view(yyjson_get_str(val), - yyjson_get_len(val))); - } else if (yyjson_is_null(val)) { - auto existing = builder.find_column(key_sv); - if (existing) builder.append_null(*existing); - } else { - std::size_t json_len; - char *json_str = yyjson_val_write(val, 0, &json_len); - std::size_t idx = builder.add_or_get_column( - key_sv, ColumnType::STRING); - if (json_str) { - builder.append_string( - idx, arena.push(json_str, json_len)); - free(json_str); - } else { - builder.append_null(idx); - } - } 
- } - builder.end_row(); - held_docs.push_back(doc); - } // end else (raw path) + auto gen = reader.read_json(rc); + RecordBatchBuilder builder; + builder.reserve(batch_size); + StringArena arena; + while (auto opt = co_await gen.next()) { + if (cancelled->load(std::memory_order_acquire)) co_return; + if (!build_arrow_row(builder, *opt->parser, arena, normalize)) + continue; if (builder.num_rows() >= batch_size) { auto result = builder.finish(); - for (auto *d : held_docs) yyjson_doc_free(d); - held_docs.clear(); arena.clear(); - - { - std::unique_lock lock(sp->mtx); - sp->cv_producer.wait(lock, [sp] { - return sp->queue.size() < sp->max_queue_size || - sp->cancelled.load(std::memory_order_acquire); - }); - if (sp->cancelled.load(std::memory_order_acquire)) break; - sp->queue.push(std::move(result)); - } - sp->cv_consumer.notify_one(); - builder.reset(false); + if (!co_await producer.send(std::move(result))) co_return; + if (!builder.is_schema_locked()) builder.lock_schema(); + builder.reset(true); builder.reserve(batch_size); } } - if (builder.num_rows() > 0) { - auto result = builder.finish(); - for (auto *d : held_docs) yyjson_doc_free(d); - held_docs.clear(); - arena.clear(); - { - std::lock_guard lock(sp->mtx); - sp->queue.push(std::move(result)); - } - sp->cv_consumer.notify_one(); - } else { - for (auto *d : held_docs) yyjson_doc_free(d); + if (!co_await producer.send(builder.finish())) co_return; } - } catch (...) { - std::lock_guard lock(sp->mtx); - sp->error = std::current_exception(); - sp->queue.push(std::nullopt); - sp->done.store(true, std::memory_order_release); - sp->cv_consumer.notify_one(); - co_return; } - { - std::lock_guard lock(sp->mtx); - sp->queue.push(std::nullopt); - sp->done.store(true, std::memory_order_release); - } - sp->cv_consumer.notify_one(); + co_return; } -#endif // DFTRACER_UTILS_ENABLE_ARROW +// Extract AND-of-EQ leaves from a Query AST. 
Returns nullopt if the predicate +// shape is anything else (NE, range ops, IN, NOT, OR), in which case the +// uniform-match shortcut does not apply. +static std::optional>> +extract_eq_leaves( + const dftracer::utils::utilities::common::query::QueryNode &node) { + namespace q_ns = dftracer::utils::utilities::common::query; + using LeafVec = std::vector>; + + auto literal_to_string = [](const q_ns::LiteralNode &lit) -> std::string { + return std::visit( + [](auto &&v) -> std::string { + using T = std::decay_t; + if constexpr (std::is_same_v) + return v; + else if constexpr (std::is_same_v) + return v ? "true" : "false"; + else if constexpr (std::is_same_v) + return std::to_string(v); + else if constexpr (std::is_same_v) + return std::to_string(v); + else if constexpr (std::is_same_v) + return std::to_string(v); + else + return {}; + }, + lit.value); + }; -TraceReaderConfig build_config(TraceReaderObject *self) { - TraceReaderConfig cfg; - cfg.file_path = PyUnicode_AsUTF8(self->file_path); - const char *idx = PyUnicode_AsUTF8(self->index_dir); - if (idx) cfg.index_dir = idx; - cfg.checkpoint_size = self->checkpoint_size; - cfg.auto_build_index = self->auto_build_index != 0; - cfg.index_threshold = self->index_threshold; - return cfg; + return std::visit( + [&](const auto &n) -> std::optional { + using T = std::decay_t; + if constexpr (std::is_same_v) { + if (n.op != q_ns::CompareOp::EQ) return std::nullopt; + return LeafVec{{n.field.path, literal_to_string(n.value)}}; + } else if constexpr (std::is_same_v) { + auto l = extract_eq_leaves(*n.left); + if (!l) return std::nullopt; + auto r = extract_eq_leaves(*n.right); + if (!r) return std::nullopt; + l->insert(l->end(), r->begin(), r->end()); + return l; + } else { + return std::nullopt; + } + }, + node.data); } -static Runtime *get_runtime(TraceReaderObject *self) { - if (self->runtime_obj) { - return ((RuntimeObject *)self->runtime_obj)->runtime.get(); +// True iff every checkpoint in `chunk_idxs` has dim_stats min 
== max == literal +// for every leaf. Empty leaves -> false (no shortcut). Missing dim_stats for +// any (chunk, leaf) -> false (we don't know, play safe). +static bool all_chunks_uniform_match( + const dftracer::utils::utilities::indexer::IndexDatabase &db, int fid, + const std::vector> &leaves, + const std::vector &chunk_idxs) { + if (leaves.empty() || chunk_idxs.empty()) return false; + namespace indexing = dftracer::utils::utilities::composites::dft::indexing; + + for (const auto &[dim, val] : leaves) { + auto rows = db.query_chunk_dimension_stats_for_dimension(fid, dim); + if (rows.empty()) return false; + std::unordered_map + by_ckpt; + by_ckpt.reserve(rows.size()); + for (const auto &r : rows) by_ckpt.emplace(r.checkpoint_idx, &r); + for (auto cidx : chunk_idxs) { + auto it = by_ckpt.find(cidx); + if (it == by_ckpt.end()) return false; + const auto &ds = *it->second; + if (ds.min_value != val || ds.max_value != val) return false; + } } - return get_default_runtime(); + return true; } -static TraceReaderIteratorObject *make_iterator( - std::shared_ptr state, IteratorMode mode) { - TraceReaderIteratorObject *it = - (TraceReaderIteratorObject *)TraceReaderIteratorType.tp_alloc( - &TraceReaderIteratorType, 0); - if (!it) return NULL; - new (&it->state) std::shared_ptr(std::move(state)); -#ifdef DFTRACER_UTILS_ENABLE_ARROW - new (&it->arrow_state) std::shared_ptr(); -#endif - it->mode = mode; - return it; -} +// Byte-range work unit for checkpoint-level parallelism. Each unit covers +// one or more consecutive checkpoints from a single file. Decompression of +// a single gz file is sequential per gzip stream, so splitting at +// checkpoint-aligned byte offsets is what lets multiple workers share the +// decode work for one file. 
+struct ArrowWorkItem { + std::string file_path; + std::size_t start_byte = 0; + std::size_t end_byte = 0; + bool start_at_checkpoint = false; + bool end_at_checkpoint = false; + // When true, every kept chunk for this byte range is uniform-matching + // (dim_stats min == max == predicate literal for every AND-of-EQ leaf), + // so per-event predicate eval is skippable. + bool chunk_prune_only = false; + // Line-range work items override byte ranges: the worker passes these + // down as LINE_RANGE on the read, and the gzip stream resolves them to + // byte offsets via the checkpoint index. 0 = no line constraint. + std::size_t start_line = 0; + std::size_t end_line = 0; +}; -#ifdef DFTRACER_UTILS_ENABLE_ARROW -static TraceReaderIteratorObject *make_arrow_iterator( - std::shared_ptr state) { - TraceReaderIteratorObject *it = - (TraceReaderIteratorObject *)TraceReaderIteratorType.tp_alloc( - &TraceReaderIteratorType, 0); - if (!it) return NULL; - new (&it->state) std::shared_ptr(); - new (&it->arrow_state) - std::shared_ptr(std::move(state)); - it->mode = IteratorMode::ARROW; - return it; -} -#endif +static std::vector enumerate_work_items( + const std::vector &files, const std::string &index_dir, + const std::string &query_str, std::size_t max_workers, + std::size_t clip_start_byte = 0, std::size_t clip_end_byte = 0, + std::size_t clip_start_line = 0, std::size_t clip_end_line = 0) { + namespace dft_internal = + dftracer::utils::utilities::composites::dft::internal; + namespace indexer_ns = dftracer::utils::utilities::indexer; + namespace indexing = dftracer::utils::utilities::composites::dft::indexing; + + std::vector items; + items.reserve(files.size() * 4); + + const bool has_line_clip = (clip_start_line > 0 || clip_end_line > 0); + auto push_unsplit = [&](const std::string &fp) { + ArrowWorkItem item; + item.file_path = fp; + item.start_line = clip_start_line; + item.end_line = clip_end_line; + items.push_back(std::move(item)); + }; -} // namespace + // Parse the 
query once. Pruner input copies a Query, so we keep the + // parsed form around to feed each ChunkPrunerInput without re-parsing. + std::optional parsed; + if (!query_str.empty()) { + auto r = dftracer::utils::utilities::common::query::Query::from_string( + query_str); + if (r) parsed = std::move(*r); + } -using dftracer::utils::python::wrap_arrow_table; + // All files in a directory-mode scan share the same `.dftindex` root. + // Group files by their resolved index path so we can open the RocksDB + // once per index and reuse it to prune every file against that handle. + std::unordered_map> by_index; + for (std::size_t i = 0; i < files.size(); ++i) { + std::string index_path = + dft_internal::determine_index_path(files[i], index_dir); + by_index[index_path].push_back(i); + } + + for (auto &entry : by_index) { + const auto &index_path = entry.first; + const auto &file_idxs = entry.second; + if (!fs::exists(index_path)) { + for (auto i : file_idxs) push_unsplit(files[i]); + continue; + } + std::unique_ptr idx_db; + try { + idx_db = std::make_unique( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + } catch (...) { + for (auto i : file_idxs) push_unsplit(files[i]); + continue; + } + + // Resolve fid + checkpoints per file (cheap queries). 
+ struct FileCtx { + std::size_t file_idx; + int fid; + std::vector ckpts; + }; + std::vector file_ctxs; + file_ctxs.reserve(file_idxs.size()); + for (auto i : file_idxs) { + FileCtx fc; + fc.file_idx = i; + fc.fid = idx_db->get_file_info_id( + indexer_ns::internal::get_logical_path(files[i])); + if (fc.fid < 0) { + push_unsplit(files[i]); + continue; + } + fc.ckpts = idx_db->query_checkpoints(fc.fid); + if (fc.ckpts.empty()) { + push_unsplit(files[i]); + continue; + } + std::sort(fc.ckpts.begin(), fc.ckpts.end(), + [](const auto &a, const auto &b) { + return a.first_line_num < b.first_line_num; + }); + file_ctxs.push_back(std::move(fc)); + } + + // Batch-prune all files against the shared index: dim_stats and + // chunk_statistics are loaded in one RocksDB scan each instead of + // one scan per file. + std::vector pruner_outs(file_ctxs.size()); + if (parsed && !file_ctxs.empty()) { + indexing::ChunkPrunerBatchInput batch_in; + batch_in.index_path = index_path; + batch_in.external_db = idx_db.get(); + batch_in.items.reserve(file_ctxs.size()); + for (auto &fc : file_ctxs) { + batch_in.items.push_back({files[fc.file_idx], *parsed}); + } + indexing::ChunkPrunerUtility pruner; + auto batch_out = pruner.process_batch(batch_in); + if (batch_out.success) { + pruner_outs = std::move(batch_out.outputs); + } + } + + // For AND-of-EQ predicates, precompute uniform-match leaves once. + // Per-file pure_match is checked inline below and lets workers skip + // per-event predicate eval on chunks where dim_stats min == max == + // literal for every leaf. + std::optional>> + eq_leaves; + if (parsed) eq_leaves = extract_eq_leaves(parsed->root()); + + for (std::size_t fc_idx = 0; fc_idx < file_ctxs.size(); ++fc_idx) { + auto &fc = file_ctxs[fc_idx]; + const auto &fp = files[fc.file_idx]; + + // Pruner chunk_idx semantics: 0-indexed over uncompressed + // slices. fc.ckpts holds gzip recovery points; recovery point + // fc.ckpts[k] sits at the START of pruner chunk (k+1). 
Pruner + // chunk 0 has no recovery point at its start (decoded from + // gzip stream start). Total pruner chunks = fc.ckpts.size()+1. + const std::size_t total_chunks = fc.ckpts.size() + 1; + auto chunk_start_byte = [&](std::uint64_t cidx) -> std::size_t { + if (cidx == 0) return 0; + return fc.ckpts[cidx - 1].uc_offset; + }; + auto chunk_end_byte = [&](std::uint64_t cidx) -> std::size_t { + if (cidx == 0) + return fc.ckpts.empty() ? 0 : fc.ckpts[0].uc_offset; + std::size_t k = cidx - 1; + return fc.ckpts[k].uc_offset + fc.ckpts[k].uc_size; + }; + // Line ranges for a chunk. Chunk 0 covers everything before the + // first recovery point; chunk k>=1 spans recovery point (k-1). + auto chunk_first_line = [&](std::uint64_t cidx) -> std::size_t { + if (cidx == 0) return 1; + return fc.ckpts[cidx - 1].first_line_num; + }; + auto chunk_last_line = [&](std::uint64_t cidx) -> std::size_t { + if (cidx == 0) { + if (fc.ckpts.empty()) return SIZE_MAX; + return fc.ckpts[0].first_line_num > 0 + ? fc.ckpts[0].first_line_num - 1 + : 0; + } + return fc.ckpts[cidx - 1].last_line_num; + }; + + std::vector keep_chunks; + keep_chunks.reserve(total_chunks); + if (parsed) { + const auto &pr = pruner_outs[fc_idx]; + if (pr.success && !pr.file_may_match) { + continue; // whole file pruned + } + if (pr.success && !pr.candidate_checkpoints.empty() && + pr.candidate_checkpoints.size() < pr.total_checkpoints) { + for (auto cidx : pr.candidate_checkpoints) { + if (cidx < total_chunks) keep_chunks.push_back(cidx); + } + std::sort(keep_chunks.begin(), keep_chunks.end()); + keep_chunks.erase( + std::unique(keep_chunks.begin(), keep_chunks.end()), + keep_chunks.end()); + } else { + for (std::uint64_t c = 0; c < total_chunks; ++c) + keep_chunks.push_back(c); + } + } else { + for (std::uint64_t c = 0; c < total_chunks; ++c) + keep_chunks.push_back(c); + } + + // Intersect with the user's line range so workers only touch + // chunks that actually overlap it. 
Each work item carries the + // sub-line-range; LINE_RANGE on the read maps it back to bytes + // via the same checkpoint table the gzip stream uses. + if (has_line_clip) { + std::size_t lo = clip_start_line > 0 ? clip_start_line : 1; + std::size_t hi = clip_end_line > 0 ? clip_end_line : SIZE_MAX; + std::vector filtered; + filtered.reserve(keep_chunks.size()); + for (auto c : keep_chunks) { + std::size_t cf = chunk_first_line(c); + std::size_t cl = chunk_last_line(c); + if (cl < lo || cf > hi) continue; + filtered.push_back(c); + } + keep_chunks = std::move(filtered); + } + + if (keep_chunks.empty()) continue; + + // All-or-nothing per file: if every kept chunk is uniform-matching + // for every leaf, every work item from this file gets the + // chunk_prune_only fast path. Mixed files fall back to per-event + // eval to stay safe. + bool file_pure_match = false; + if (eq_leaves && !eq_leaves->empty() && idx_db) { + file_pure_match = all_chunks_uniform_match( + *idx_db, fc.fid, *eq_leaves, keep_chunks); + } + + std::size_t target_ranges = std::max(1, max_workers); + std::size_t per_range = std::max( + 1, (keep_chunks.size() + target_ranges - 1) / target_ranges); + + std::size_t group_start = 0; + while (group_start < keep_chunks.size()) { + std::size_t group_end = group_start; + std::size_t emitted = 0; + while (group_end < keep_chunks.size() && emitted < per_range) { + if (group_end > group_start && + keep_chunks[group_end] != + keep_chunks[group_end - 1] + 1) { + break; + } + ++group_end; + ++emitted; + } + std::uint64_t scidx = keep_chunks[group_start]; + std::uint64_t ecidx = keep_chunks[group_end - 1]; + std::size_t start_byte = chunk_start_byte(scidx); + std::size_t end_byte = chunk_end_byte(ecidx); + // start_at_checkpoint: a gzip recovery point sits at + // start_byte (true for any cidx>=1; false for the implicit + // chunk 0 which decodes from stream start). 
+ bool start_at_checkpoint = (scidx >= 1); + bool end_at_checkpoint = (group_end < keep_chunks.size()); + if (has_line_clip) { + std::size_t lo = clip_start_line > 0 ? clip_start_line : 1; + std::size_t hi = + clip_end_line > 0 ? clip_end_line : SIZE_MAX; + std::size_t cluster_first = chunk_first_line(scidx); + std::size_t cluster_last = chunk_last_line(ecidx); + std::size_t item_start = + std::max(lo, cluster_first); + std::size_t item_end = + std::min(hi, cluster_last); + if (item_start > item_end) { + group_start = group_end; + continue; + } + ArrowWorkItem item; + item.file_path = fp; + item.chunk_prune_only = file_pure_match; + item.start_line = item_start; + item.end_line = item_end; + items.push_back(std::move(item)); + group_start = group_end; + continue; + } + if (clip_end_byte > clip_start_byte) { + if (start_byte < clip_start_byte) { + start_byte = clip_start_byte; + start_at_checkpoint = false; + } + if (end_byte > clip_end_byte) { + end_byte = clip_end_byte; + end_at_checkpoint = false; + } + if (start_byte >= end_byte) { + group_start = group_end; + continue; + } + } + items.push_back({fp, start_byte, end_byte, start_at_checkpoint, + end_at_checkpoint, file_pure_match}); + group_start = group_end; + } + } + } + return items; +} + +static CoroTask send_work_items_to_channel( + std::shared_ptr> chan, + const std::vector *items, std::atomic *cancelled) { + for (const auto &it : *items) { + if (cancelled->load(std::memory_order_acquire)) break; + if (!co_await chan->send(it)) break; + } + chan->close(); + co_return; +} + +static CoroTask checkpoint_worker( + std::shared_ptr> work_chan, + dftracer::utils::coro::Channel *out_chan, + std::string index_dir, std::size_t checkpoint_size, bool auto_build_index, + ReadConfig rc, std::size_t batch_size, bool normalize, + std::atomic *cancelled) { + dftracer::utils::coro::ChannelProducer producer( + out_chan); + auto guard = producer.guard(); + + // Cache readers keyed by file path so we don't re-probe the same 
file + // when successive work items land on it. + std::unordered_map> readers; + + while (auto item = co_await work_chan->receive()) { + if (cancelled->load(std::memory_order_acquire)) co_return; + + auto &reader_ptr = readers[item->file_path]; + if (!reader_ptr) { + TraceReaderConfig cfg; + cfg.file_path = item->file_path; + cfg.index_dir = index_dir; + cfg.checkpoint_size = checkpoint_size; + cfg.auto_build_index = auto_build_index; + reader_ptr = std::make_shared(std::move(cfg)); + } + + ReadConfig local_rc = rc; + if (item->start_line > 0 || item->end_line > 0) { + // Line-range work items: the read drives off LINE_RANGE; the + // gzip stream resolves it back to byte offsets via checkpoints. + local_rc.start_line = item->start_line; + local_rc.end_line = item->end_line; + local_rc.start_byte = 0; + local_rc.end_byte = 0; + local_rc.start_at_checkpoint = false; + local_rc.end_at_checkpoint = false; + } else { + local_rc.start_byte = item->start_byte; + local_rc.end_byte = item->end_byte; + local_rc.start_at_checkpoint = item->start_at_checkpoint; + local_rc.end_at_checkpoint = item->end_at_checkpoint; + } + // Pruning already happened at enumeration time; avoid the per- + // work-item RocksDB opens that would otherwise dwarf the actual + // read cost at directory scale (256 files * N ranges). + local_rc.skip_pruning = true; + // chunks pre-classified as uniform-matching skip per-event eval. 
+ if (item->chunk_prune_only) local_rc.chunk_prune_only = true; + + if (!normalize) { + auto batch_gen = reader_ptr->read_arrow(local_rc, batch_size); + while (auto batch_opt = co_await batch_gen.next()) { + if (cancelled->load(std::memory_order_acquire)) co_return; + if (!co_await producer.send(std::move(*batch_opt))) co_return; + } + continue; + } + + auto gen = reader_ptr->read_json(local_rc); + RecordBatchBuilder builder; + builder.reserve(batch_size); + StringArena arena; + + while (auto opt = co_await gen.next()) { + if (cancelled->load(std::memory_order_acquire)) co_return; + if (!build_arrow_row(builder, *opt->parser, arena, normalize)) + continue; + if (builder.num_rows() >= batch_size) { + auto result = builder.finish(); + arena.clear(); + if (!co_await producer.send(std::move(result))) co_return; + if (!builder.is_schema_locked()) builder.lock_schema(); + builder.reset(true); + builder.reserve(batch_size); + } + } + if (builder.num_rows() > 0) { + if (!co_await producer.send(builder.finish())) co_return; + } + } + co_return; +} + +static CoroTask spawn_arrow_producers( + CoroScope &child, + dftracer::utils::coro::Channel *out_chan, + const std::vector *work_items, const std::string *index_dir, + std::size_t checkpoint_size, bool auto_build_index, const ReadConfig *rc, + std::size_t batch_size, bool normalize, std::atomic *cancelled_ptr, + std::size_t max_workers) { + std::size_t num_workers = std::min(work_items->size(), max_workers); + if (num_workers == 0) num_workers = 1; + auto work_chan = + dftracer::utils::coro::make_channel(num_workers); + + for (std::size_t i = 0; i < num_workers; ++i) { + child.spawn([out_chan, wc = work_chan, idx = *index_dir, + checkpoint_size, auto_build_index, r = *rc, batch_size, + normalize, cancelled_ptr](CoroScope &) { + return checkpoint_worker(wc, out_chan, idx, checkpoint_size, + auto_build_index, r, batch_size, normalize, + cancelled_ptr); + }); + } + + child.spawn([wc = work_chan, work_items, 
cancelled_ptr](CoroScope &) { + return send_work_items_to_channel(wc, work_items, cancelled_ptr); + }); + co_return; +} + +static CoroTask produce_arrow_batches_for_files( + CoroScope &scope, ArrowIteratorState *sp, std::vector files, + std::string index_dir, std::size_t checkpoint_size, bool auto_build_index, + ReadConfig rc, std::size_t batch_size, bool normalize, + std::size_t max_workers) { + try { + if (files.empty()) { + sp->channel->close(); + co_return; + } + + auto work_items = enumerate_work_items( + files, index_dir, rc.query, max_workers, rc.start_byte, rc.end_byte, + rc.start_line, rc.end_line); + if (work_items.empty()) { + sp->channel->close(); + co_return; + } + + auto *chan_ptr = sp->channel.get(); + auto *cancelled_ptr = &sp->cancelled; + + co_await scope.scope([chan_ptr, &work_items, &index_dir, + checkpoint_size, auto_build_index, &rc, + batch_size, normalize, cancelled_ptr, + max_workers](CoroScope &child) -> CoroTask { + co_await spawn_arrow_producers( + child, chan_ptr, &work_items, &index_dir, checkpoint_size, + auto_build_index, &rc, batch_size, normalize, cancelled_ptr, + max_workers); + }); + } catch (...) 
{ + sp->set_error(std::current_exception()); + } +} + +static CoroTask produce_arrow_batches_parallel( + CoroScope &scope, ArrowIteratorState *sp, std::string dir_path, + std::string index_dir, std::size_t checkpoint_size, bool auto_build_index, + ReadConfig rc, std::size_t batch_size, bool normalize, + std::size_t max_workers) { + try { + PatternDirectoryScannerUtility scanner; + auto scan_input = PatternDirectoryScannerUtilityInput( + dir_path, {".pfw", ".pfw.gz"}, true, false); + auto entries = co_await scope.spawn(scanner, scan_input); + + std::vector files; + files.reserve(entries.size()); + for (auto &e : entries) files.push_back(e.path.string()); + std::sort(files.begin(), files.end()); + + co_await produce_arrow_batches_for_files( + scope, sp, std::move(files), std::move(index_dir), checkpoint_size, + auto_build_index, std::move(rc), batch_size, normalize, + max_workers); + } catch (...) { + sp->set_error(std::current_exception()); + } +} + +CoroTask produce_arrow_batches( + std::shared_ptr state, + dftracer::utils::coro::ChannelProducer producer, + TraceReaderConfig cfg, ReadConfig rc, std::size_t batch_size, + bool flatten_objects = false, bool normalize = false) { + (void)flatten_objects; + + auto guard = producer.guard(); + try { + TraceReader reader(std::move(cfg)); + + if (!normalize) { + auto batch_gen = reader.read_arrow(rc, batch_size); + while (auto batch_opt = co_await batch_gen.next()) { + if (state->cancelled.load(std::memory_order_acquire)) break; + auto result_bytes = + dftracer::utils::python::byte_size(*batch_opt); + state->bytes_in_queue.fetch_add(result_bytes, + std::memory_order_acq_rel); + if (!co_await producer.send(std::move(*batch_opt))) break; + } + co_return; + } + + auto gen = reader.read_json(rc); + RecordBatchBuilder builder; + builder.reserve(batch_size); + + StringArena arena; + + while (auto opt = co_await gen.next()) { + if (state->cancelled.load(std::memory_order_acquire)) break; + if (!build_arrow_row(builder, 
*opt->parser, arena, normalize)) + continue; + + if (builder.num_rows() >= batch_size) { + auto result = builder.finish(); + arena.clear(); + auto result_bytes = dftracer::utils::python::byte_size(result); + state->bytes_in_queue.fetch_add(result_bytes, + std::memory_order_acq_rel); + if (!co_await producer.send(std::move(result))) break; + if (!builder.is_schema_locked()) { + builder.lock_schema(); + } + builder.reset(true); + builder.reserve(batch_size); + } + } + + if (builder.num_rows() > 0 && + !state->cancelled.load(std::memory_order_acquire)) { + auto result = builder.finish(); + auto result_bytes = dftracer::utils::python::byte_size(result); + state->bytes_in_queue.fetch_add(result_bytes, + std::memory_order_acq_rel); + co_await producer.send(std::move(result)); + } + } catch (...) { + state->set_error(std::current_exception()); + } +} + +#endif // DFTRACER_UTILS_ENABLE_ARROW + +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + +struct WriteArrowStats { + std::unordered_map partitions; + int64_t total_rows = 0; + int64_t total_uncompressed_bytes = 0; +}; + +struct WriteArrowResult { + WriteArrowStats stats; + std::string error; + std::uint64_t chunks_scanned = 0; + std::uint64_t chunks_skipped = 0; +}; + +CoroTask write_arrow_pipeline( + std::string file_path, std::string index_path, std::size_t checkpoint_size, + std::vector views, std::string output_path, + int64_t chunk_size_bytes, IpcCompression compression, + std::size_t event_batch_size) { + namespace dft_internal = + dftracer::utils::utilities::composites::dft::internal; + WriteArrowResult result; + + try { + if (views.empty()) { + views.push_back(ViewDefinition().with_name("all")); + } + + std::string resolved_index = + index_path.empty() + ? 
dft_internal::determine_index_path(file_path, "") + : index_path; + + auto meta_input = MetadataCollectorUtilityInput::from_file(file_path) + .with_checkpoint_size(checkpoint_size) + .with_index(resolved_index); + auto metadata = co_await MetadataCollectorUtility{}.process(meta_input); + if (!metadata.success) { + result.error = + "Failed to collect metadata: " + metadata.error_message; + co_return result; + } + + for (const auto &view : views) { + std::string view_output = output_path; + if (views.size() > 1 || view.name != "all") { + view_output = output_path + "/" + view.name; + } + + PartitionWriter writer; + int rc_open = co_await writer.open(view_output, chunk_size_bytes, + compression); + if (rc_open != 0) { + result.error = + "Failed to open partition writer for view: " + view.name; + co_return result; + } + + ViewBuilderInput builder_input; + builder_input.with_view(view) + .with_file_path(file_path) + .with_index_path(resolved_index) + .with_uncompressed_size(metadata.uncompressed_size) + .with_num_checkpoints(metadata.num_checkpoints); + + auto build_output = + co_await ViewBuilderUtility{}.process(builder_input); + if (!build_output.success) { + result.error = "ViewBuilder failed for view: " + view.name; + co_return result; + } + + result.chunks_skipped += build_output.skipped_checkpoints; + + if (!build_output.file_may_match) { + auto stats = co_await writer.close(); + result.stats.partitions[view.name] = std::move(stats); + continue; + } + + RecordBatchBuilder builder; + bool schema_locked = false; + + for (const auto &candidate : build_output.candidates) { + ViewReaderInput reader_input; + reader_input.with_file_path(file_path) + .with_index_path(resolved_index) + .with_checkpoint_size(checkpoint_size) + .with_byte_range(candidate.start_byte, candidate.end_byte) + .with_checkpoint_idx(candidate.checkpoint_idx) + .with_event_batch_size(event_batch_size) + .with_view(view); + reader_input.query = view.query; + + ViewReaderUtility reader; + auto gen = 
reader.process(reader_input); + while (auto opt = co_await gen.next()) { + auto arrow_batch = opt->to_arrow(builder); + int rc_write = co_await writer.write_batch(arrow_batch); + if (rc_write != 0) { + result.error = + "Failed to write batch for view: " + view.name; + co_return result; + } + if (!schema_locked) { + builder.lock_schema(); + schema_locked = true; + } + builder.reset(true); + } + result.chunks_scanned++; + } + + auto stats = co_await writer.close(); + result.stats.partitions[view.name] = std::move(stats); + result.stats.total_rows += + result.stats.partitions[view.name].total_rows; + result.stats.total_uncompressed_bytes += + result.stats.partitions[view.name].total_uncompressed_bytes; + } + } catch (const std::exception &e) { + result.error = e.what(); + } + co_return result; +} + +struct ViewChunkInfo { + std::uint64_t checkpoint_idx; + std::size_t start_byte; + std::size_t end_byte; +}; + +struct GetViewChunksResult { + std::vector chunks; + std::uint64_t total_checkpoints = 0; + std::uint64_t skipped_checkpoints = 0; + bool file_may_match = false; + std::string error; +}; + +CoroTask get_view_chunks_pipeline( + std::string file_path, std::string index_path, std::size_t checkpoint_size, + ViewDefinition view) { + namespace dft_internal = + dftracer::utils::utilities::composites::dft::internal; + GetViewChunksResult result; + + try { + std::string resolved_index = + index_path.empty() + ? 
dft_internal::determine_index_path(file_path, "") + : index_path; + + auto meta_input = MetadataCollectorUtilityInput::from_file(file_path) + .with_checkpoint_size(checkpoint_size) + .with_index(resolved_index); + auto metadata = co_await MetadataCollectorUtility{}.process(meta_input); + if (!metadata.success) { + result.error = + "Failed to collect metadata: " + metadata.error_message; + co_return result; + } + + ViewBuilderInput builder_input; + builder_input.with_view(view) + .with_file_path(file_path) + .with_index_path(resolved_index) + .with_uncompressed_size(metadata.uncompressed_size) + .with_num_checkpoints(metadata.num_checkpoints); + + auto build_output = + co_await ViewBuilderUtility{}.process(builder_input); + if (!build_output.success) { + result.error = "ViewBuilder failed"; + co_return result; + } + + result.file_may_match = build_output.file_may_match; + result.total_checkpoints = build_output.total_checkpoints; + result.skipped_checkpoints = build_output.skipped_checkpoints; + + for (const auto &candidate : build_output.candidates) { + result.chunks.push_back({candidate.checkpoint_idx, + candidate.start_byte, candidate.end_byte}); + } + } catch (const std::exception &e) { + result.error = e.what(); + } + co_return result; +} + +struct WriteViewChunkResult { + std::string output_file; + std::uint64_t events_matched = 0; + std::uint64_t events_scanned = 0; + int64_t rows_written = 0; + int64_t bytes_written = 0; + std::string error; +}; + +CoroTask write_view_chunk_pipeline( + std::string file_path, std::string index_path, std::size_t checkpoint_size, + ViewDefinition view, std::uint64_t checkpoint_idx, std::size_t start_byte, + std::size_t end_byte, std::string output_file, IpcCompression compression, + std::size_t event_batch_size) { + namespace dft_internal = + dftracer::utils::utilities::composites::dft::internal; + WriteViewChunkResult result; + result.output_file = output_file; + + try { + std::string resolved_index = + index_path.empty() + ? 
dft_internal::determine_index_path(file_path, "") + : index_path; + + dftracer::utils::utilities::common::arrow::IpcWriter writer; + int rc_open = co_await writer.open(output_file, compression); + if (rc_open != 0) { + result.error = "Failed to open output file"; + co_return result; + } + + ViewReaderInput reader_input; + reader_input.with_file_path(file_path) + .with_index_path(resolved_index) + .with_checkpoint_size(checkpoint_size) + .with_byte_range(start_byte, end_byte) + .with_checkpoint_idx(checkpoint_idx) + .with_event_batch_size(event_batch_size) + .with_view(view); + reader_input.query = view.query; + + RecordBatchBuilder builder; + bool schema_locked = false; + + ViewReaderUtility reader; + auto gen = reader.process(reader_input); + while (auto opt = co_await gen.next()) { + result.events_matched += opt->events_matched; + result.events_scanned += opt->events_scanned; + auto batch = opt->to_arrow(builder); + if (batch.valid()) { + result.rows_written += batch.num_rows(); + int rc = co_await writer.write_batch(batch); + if (rc != 0) { + result.error = "Failed to write batch"; + co_return result; + } + if (!schema_locked) { + builder.lock_schema(); + schema_locked = true; + } + builder.reset(true); + } + } + + int rc = co_await writer.close(); + if (rc != 0) { + result.error = "Failed to close output file"; + } + } catch (const std::exception &e) { + result.error = e.what(); + } + co_return result; +} + +struct ChunkDescriptor { + std::uint64_t checkpoint_idx; + std::size_t start_byte; + std::size_t end_byte; + std::string output_file; +}; + +struct WriteViewChunksResult { + std::vector results; + int64_t total_rows = 0; + int64_t total_events_matched = 0; +}; + +CoroTask write_view_chunks_pipeline( + std::string file_path, std::string index_path, std::size_t checkpoint_size, + ViewDefinition view, std::vector chunks, + IpcCompression compression, std::size_t event_batch_size) { + WriteViewChunksResult result; + + if (chunks.empty()) { + co_return result; + 
} + + std::vector> tasks; + tasks.reserve(chunks.size()); + + for (const auto &chunk : chunks) { + tasks.push_back(write_view_chunk_pipeline( + file_path, index_path, checkpoint_size, view, chunk.checkpoint_idx, + chunk.start_byte, chunk.end_byte, chunk.output_file, compression, + event_batch_size)); + } + + result.results = co_await when_all(std::move(tasks)); + + for (const auto &r : result.results) { + result.total_rows += r.rows_written; + result.total_events_matched += r.events_matched; + } + + co_return result; +} + +#endif // DFTRACER_UTILS_ENABLE_ARROW_IPC + +TraceReaderConfig build_config(TraceReaderObject *self) { + TraceReaderConfig cfg; + cfg.file_path = PyUnicode_AsUTF8(self->file_path); + const char *idx = PyUnicode_AsUTF8(self->index_dir); + if (idx) cfg.index_dir = idx; + cfg.checkpoint_size = self->checkpoint_size; + cfg.auto_build_index = self->auto_build_index != 0; + return cfg; +} + +static Runtime *get_runtime(TraceReaderObject *self) { + if (self->runtime_obj) { + return ((RuntimeObject *)self->runtime_obj)->runtime.get(); + } + return get_default_runtime(); +} + +static TraceReaderIteratorObject *make_memoryview_iterator( + std::shared_ptr state) { + TraceReaderIteratorObject *it = + (TraceReaderIteratorObject *)TraceReaderIteratorType.tp_alloc( + &TraceReaderIteratorType, 0); + if (!it) return NULL; + new (&it->batch_state) + std::shared_ptr(std::move(state)); + it->current_batch = NULL; + it->batch_index = 0; + new (&it->json_dict_state) std::shared_ptr(); + new (&it->json_dict_current_batch) std::shared_ptr(); + it->json_dict_index = 0; +#ifdef DFTRACER_UTILS_ENABLE_ARROW + new (&it->arrow_state) std::shared_ptr(); +#endif + it->mode = IteratorMode::MEMORYVIEW; + return it; +} + +static TraceReaderIteratorObject *make_json_dict_iterator( + std::shared_ptr state) { + TraceReaderIteratorObject *it = + (TraceReaderIteratorObject *)TraceReaderIteratorType.tp_alloc( + &TraceReaderIteratorType, 0); + if (!it) return NULL; + new 
(&it->batch_state) std::shared_ptr(); + it->current_batch = NULL; + it->batch_index = 0; + new (&it->json_dict_state) + std::shared_ptr(std::move(state)); + new (&it->json_dict_current_batch) std::shared_ptr(); + it->json_dict_index = 0; +#ifdef DFTRACER_UTILS_ENABLE_ARROW + new (&it->arrow_state) std::shared_ptr(); +#endif + it->mode = IteratorMode::JSON_DICT; + return it; +} + +#ifdef DFTRACER_UTILS_ENABLE_ARROW +static TraceReaderIteratorObject *make_arrow_iterator( + std::shared_ptr state) { + TraceReaderIteratorObject *it = + (TraceReaderIteratorObject *)TraceReaderIteratorType.tp_alloc( + &TraceReaderIteratorType, 0); + if (!it) return NULL; + new (&it->batch_state) std::shared_ptr(); + it->current_batch = NULL; + it->batch_index = 0; + new (&it->json_dict_state) std::shared_ptr(); + new (&it->json_dict_current_batch) std::shared_ptr(); + it->json_dict_index = 0; + new (&it->arrow_state) + std::shared_ptr(std::move(state)); + it->mode = IteratorMode::ARROW; + return it; +} +#endif + +} // namespace static void TraceReader_dealloc(TraceReaderObject *self) { Py_XDECREF(self->file_path); @@ -764,8 +2246,6 @@ static PyObject *TraceReader_new(PyTypeObject *type, PyObject *args, self->index_dir = NULL; self->checkpoint_size = 32 * 1024 * 1024; self->auto_build_index = 0; - self->index_threshold = - dftracer::utils::constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD; self->has_index = 0; self->runtime_obj = NULL; } @@ -774,26 +2254,19 @@ static PyObject *TraceReader_new(PyTypeObject *type, PyObject *args, static int TraceReader_init(TraceReaderObject *self, PyObject *args, PyObject *kwds) { - static const char *kwlist[] = {"file_path", - "index_dir", - "checkpoint_size", - "auto_build_index", - "index_threshold", - "runtime", - NULL}; + static const char *kwlist[] = { + "path", "index_dir", "checkpoint_size", "auto_build_index", + "runtime", NULL}; const char *file_path; const char *index_dir = ""; std::size_t checkpoint_size = 32 * 1024 * 1024; int auto_build_index = 
0; - std::size_t index_threshold = - dftracer::utils::constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD; PyObject *runtime_arg = NULL; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|snpnO", (char **)kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|snpO", (char **)kwlist, &file_path, &index_dir, &checkpoint_size, - &auto_build_index, &index_threshold, - &runtime_arg)) { + &auto_build_index, &runtime_arg)) { return -1; } @@ -828,7 +2301,6 @@ static int TraceReader_init(TraceReaderObject *self, PyObject *args, self->checkpoint_size = checkpoint_size; self->auto_build_index = auto_build_index; - self->index_threshold = index_threshold; try { TraceReaderConfig cfg; @@ -836,7 +2308,6 @@ static int TraceReader_init(TraceReaderObject *self, PyObject *args, cfg.index_dir = index_dir; cfg.checkpoint_size = checkpoint_size; cfg.auto_build_index = auto_build_index != 0; - cfg.index_threshold = index_threshold; TraceReader probe(std::move(cfg)); self->has_index = probe.has_index() ? 1 : 0; } catch (const std::exception &e) { @@ -853,17 +2324,18 @@ static int TraceReader_init(TraceReaderObject *self, PyObject *args, static PyObject *TraceReader_iter_lines(TraceReaderObject *self, PyObject *args, PyObject *kwds) { - static const char *kwlist[] = {"start_line", "end_line", "start_byte", - "end_byte", "buffer_size", "query", - NULL}; + static const char *kwlist[] = {"start_line", "end_line", "start_byte", + "end_byte", "buffer_size", "query", + "memory_budget", NULL}; Py_ssize_t start_line = 0, end_line = 0; Py_ssize_t start_byte = 0, end_byte = 0; Py_ssize_t buffer_size = 4 * 1024 * 1024; const char *query_str = NULL; + Py_ssize_t memory_budget = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "|nnnnnz", (char **)kwlist, - &start_line, &end_line, &start_byte, - &end_byte, &buffer_size, &query_str)) { + if (!PyArg_ParseTupleAndKeywords( + args, kwds, "|nnnnnzn", (char **)kwlist, &start_line, &end_line, + &start_byte, &end_byte, &buffer_size, &query_str, 
&memory_budget)) { return NULL; } @@ -891,18 +2363,47 @@ static PyObject *TraceReader_iter_lines(TraceReaderObject *self, PyObject *args, rc.buffer_size = static_cast(buffer_size); if (query_str) rc.query = query_str; - auto state = std::make_shared(); + auto state = std::make_shared(); + state->memory_budget_bytes = dftracer::utils::compute_memory_budget( + static_cast(memory_budget)); Runtime *rt = get_runtime(self); + std::size_t max_workers = rt->threads(); + constexpr std::size_t LINE_BATCH_SIZE = 1024; + std::size_t capacity = dftracer::utils::compute_channel_capacity( + state->memory_budget_bytes, LINE_BATCH_SIZE * ESTIMATED_BYTES_PER_LINE, + max_workers); + state->channel = + dftracer::utils::coro::make_channel(capacity); + auto *sp = state.get(); + try { - auto handle = rt->submit(produce_lines(state, cfg, rc), "iter_lines"); - state->task_future = handle.future; + bool is_dir = fs::is_directory(cfg.file_path); + if (is_dir) { + auto handle = rt->scope( + "iter_lines_parallel", + [sp, dir_path = cfg.file_path, index_dir = cfg.index_dir, + checkpoint_size = cfg.checkpoint_size, + auto_build_index = cfg.auto_build_index, rc, + max_workers](CoroScope &scope) -> CoroTask { + co_await produce_lines_parallel( + scope, sp, dir_path, index_dir, checkpoint_size, + auto_build_index, rc, LINE_BATCH_SIZE, max_workers); + }); + state->task_future = handle.future; + } else { + auto handle = rt->submit( + produce_lines_batched(state, state->channel->producer(), cfg, + rc, LINE_BATCH_SIZE), + "iter_lines"); + state->task_future = handle.future; + } } catch (const std::exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL; } - TraceReaderIteratorObject *it = make_iterator(state, IteratorMode::LINES); + TraceReaderIteratorObject *it = make_memoryview_iterator(std::move(state)); return (PyObject *)it; } @@ -910,18 +2411,20 @@ static PyObject *TraceReader_iter_raw(TraceReaderObject *self, PyObject *args, PyObject *kwds) { static const char *kwlist[] = 
{"start_line", "end_line", "start_byte", "end_byte", "buffer_size", "line_aligned", - "multi_line", "query", NULL}; + "multi_line", "query", "memory_budget", + NULL}; Py_ssize_t start_line = 0, end_line = 0; Py_ssize_t start_byte = 0, end_byte = 0; Py_ssize_t buffer_size = 4 * 1024 * 1024; int line_aligned = 1; int multi_line = 1; const char *query_str = NULL; + Py_ssize_t memory_budget = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "|nnnnnppz", (char **)kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|nnnnnppzn", (char **)kwlist, &start_line, &end_line, &start_byte, &end_byte, &buffer_size, &line_aligned, - &multi_line, &query_str)) { + &multi_line, &query_str, &memory_budget)) { return NULL; } @@ -951,18 +2454,44 @@ static PyObject *TraceReader_iter_raw(TraceReaderObject *self, PyObject *args, rc.multi_line = multi_line != 0; if (query_str) rc.query = query_str; - auto state = std::make_shared(); + auto state = std::make_shared(); + state->memory_budget_bytes = dftracer::utils::compute_memory_budget( + static_cast(memory_budget)); Runtime *rt = get_runtime(self); + std::size_t max_workers = rt->threads(); + std::size_t capacity = dftracer::utils::compute_channel_capacity( + state->memory_budget_bytes, ESTIMATED_BYTES_PER_RAW_CHUNK, max_workers); + state->channel = + dftracer::utils::coro::make_channel(capacity); + auto *sp = state.get(); + try { - auto handle = rt->submit(produce_raw(state, cfg, rc), "iter_raw"); - state->task_future = handle.future; + bool is_dir = fs::is_directory(cfg.file_path); + if (is_dir) { + auto handle = rt->scope( + "iter_raw_parallel", + [sp, dir_path = cfg.file_path, index_dir = cfg.index_dir, + checkpoint_size = cfg.checkpoint_size, + auto_build_index = cfg.auto_build_index, rc, + max_workers](CoroScope &scope) -> CoroTask { + co_await produce_raw_parallel( + scope, sp, dir_path, index_dir, checkpoint_size, + auto_build_index, rc, max_workers); + }); + state->task_future = handle.future; + } else { + auto handle = 
rt->submit( + produce_raw_batched(state, state->channel->producer(), cfg, rc), + "iter_raw"); + state->task_future = handle.future; + } } catch (const std::exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL; } - TraceReaderIteratorObject *it = make_iterator(state, IteratorMode::RAW); + TraceReaderIteratorObject *it = make_memoryview_iterator(std::move(state)); return (PyObject *)it; } @@ -975,6 +2504,102 @@ static PyObject *TraceReader_read_lines(TraceReaderObject *self, PyObject *args, return list; } +static PyObject *TraceReader_iter_json(TraceReaderObject *self, PyObject *args, + PyObject *kwds) { + static const char *kwlist[] = {"start_line", "end_line", "start_byte", + "end_byte", "buffer_size", "query", + "batch_size", "memory_budget", NULL}; + Py_ssize_t start_line = 0, end_line = 0; + Py_ssize_t start_byte = 0, end_byte = 0; + Py_ssize_t buffer_size = 4 * 1024 * 1024; + const char *query_str = NULL; + Py_ssize_t batch_size = 1024; + Py_ssize_t memory_budget = 0; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|nnnnnznn", (char **)kwlist, + &start_line, &end_line, &start_byte, + &end_byte, &buffer_size, &query_str, + &batch_size, &memory_budget)) { + return NULL; + } + + if (start_line < 0 || end_line < 0 || start_byte < 0 || end_byte < 0 || + buffer_size <= 0 || batch_size <= 0) { + PyErr_SetString(PyExc_ValueError, + "range arguments must be >= 0; buffer_size and " + "batch_size must be > 0"); + return NULL; + } + + TraceReaderConfig cfg; + try { + cfg = build_config(self); + } catch (const std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return NULL; + } + + ReadConfig rc; + rc.start_line = static_cast(start_line); + rc.end_line = static_cast(end_line); + rc.start_byte = static_cast(start_byte); + rc.end_byte = static_cast(end_byte); + rc.buffer_size = static_cast(buffer_size); + if (query_str) rc.query = query_str; + + auto state = std::make_shared(); + state->memory_budget_bytes = 
dftracer::utils::compute_memory_budget( + static_cast(memory_budget)); + + Runtime *rt = get_runtime(self); + std::size_t max_workers = rt->threads(); + auto bs = static_cast(batch_size); + std::size_t capacity = dftracer::utils::compute_channel_capacity( + state->memory_budget_bytes, bs * ESTIMATED_BYTES_PER_JSON_EVENT, + max_workers); + state->channel = + dftracer::utils::coro::make_channel(capacity); + auto *sp = state.get(); + + try { + bool is_dir = fs::is_directory(cfg.file_path); + if (is_dir) { + auto handle = rt->scope( + "iter_json_parallel", + [sp, dir_path = cfg.file_path, index_dir = cfg.index_dir, + checkpoint_size = cfg.checkpoint_size, + auto_build_index = cfg.auto_build_index, rc, bs, + max_workers](CoroScope &scope) -> CoroTask { + co_await produce_json_dicts_parallel( + scope, sp, dir_path, index_dir, checkpoint_size, + auto_build_index, rc, bs, max_workers); + }); + state->task_future = handle.future; + } else { + auto handle = + rt->submit(produce_json_dicts(state, state->channel->producer(), + cfg, rc, bs), + "iter_json"); + state->task_future = handle.future; + } + } catch (const std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return NULL; + } + + TraceReaderIteratorObject *it = make_json_dict_iterator(std::move(state)); + return (PyObject *)it; +} + +static PyObject *TraceReader_read_json_py(TraceReaderObject *self, + PyObject *args, PyObject *kwds) { + PyObject *iter = TraceReader_iter_json(self, args, kwds); + if (!iter) return NULL; + PyObject *list = PySequence_List(iter); + Py_DECREF(iter); + return list; +} + static PyObject *TraceReader_read_raw(TraceReaderObject *self, PyObject *args, PyObject *kwds) { PyObject *iter = TraceReader_iter_raw(self, args, kwds); @@ -984,22 +2609,34 @@ static PyObject *TraceReader_read_raw(TraceReaderObject *self, PyObject *args, return list; } -static PyObject *TraceReader_iter_lines_json(TraceReaderObject *self, - PyObject *args, PyObject *kwds) { - static const char *kwlist[] = 
{"start_line", "end_line", "start_byte", - "end_byte", "buffer_size", "query", - NULL}; +#ifdef DFTRACER_UTILS_ENABLE_ARROW + +static PyObject *TraceReader_iter_arrow(TraceReaderObject *self, PyObject *args, + PyObject *kwds) { + static const char *kwlist[] = { + "batch_size", "start_line", "end_line", "start_byte", + "end_byte", "buffer_size", "query", "flatten_objects", + "normalize", "memory_budget", NULL}; + Py_ssize_t batch_size = 10000; Py_ssize_t start_line = 0, end_line = 0; Py_ssize_t start_byte = 0, end_byte = 0; Py_ssize_t buffer_size = 4 * 1024 * 1024; const char *query_str = NULL; + int flatten_objects = 1; // default: expand top-level objects + int normalize = 0; + Py_ssize_t memory_budget = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "|nnnnnz", (char **)kwlist, - &start_line, &end_line, &start_byte, - &end_byte, &buffer_size, &query_str)) { + if (!PyArg_ParseTupleAndKeywords( + args, kwds, "|nnnnnnzppn", (char **)kwlist, &batch_size, + &start_line, &end_line, &start_byte, &end_byte, &buffer_size, + &query_str, &flatten_objects, &normalize, &memory_budget)) { return NULL; } + if (batch_size <= 0) { + PyErr_SetString(PyExc_ValueError, "batch_size must be > 0"); + return NULL; + } if (start_line < 0 || end_line < 0 || start_byte < 0 || end_byte < 0 || buffer_size <= 0) { PyErr_SetString( @@ -1022,66 +2659,105 @@ static PyObject *TraceReader_iter_lines_json(TraceReaderObject *self, rc.start_byte = static_cast(start_byte); rc.end_byte = static_cast(end_byte); rc.buffer_size = static_cast(buffer_size); + rc.flatten_objects = flatten_objects != 0; if (query_str) rc.query = query_str; - auto state = std::make_shared(); + auto state = std::make_shared(); + state->memory_budget_bytes = dftracer::utils::compute_memory_budget( + static_cast(memory_budget)); Runtime *rt = get_runtime(self); + std::size_t max_workers = rt->threads(); + auto bs = static_cast(batch_size); + std::size_t capacity = dftracer::utils::compute_channel_capacity( + 
state->memory_budget_bytes, bs * ESTIMATED_BYTES_PER_ARROW_ROW, + max_workers); + state->channel = + dftracer::utils::coro::make_channel( + capacity); + auto *sp = state.get(); + try { - auto handle = - rt->submit(produce_lines(state, cfg, rc), "iter_lines_json"); - state->task_future = handle.future; + bool is_dir = fs::is_directory(cfg.file_path); + if (is_dir) { + auto handle = rt->scope( + "iter_arrow_parallel", + [sp, dir_path = cfg.file_path, index_dir = cfg.index_dir, + checkpoint_size = cfg.checkpoint_size, + auto_build_index = cfg.auto_build_index, rc, bs, + norm = normalize != 0, + max_workers](CoroScope &scope) -> CoroTask { + co_await produce_arrow_batches_parallel( + scope, sp, dir_path, index_dir, checkpoint_size, + auto_build_index, rc, bs, norm, max_workers); + }); + state->task_future = handle.future; + } else if (normalize) { + auto handle = rt->submit( + produce_arrow_batches(state, state->channel->producer(), cfg, + rc, static_cast(batch_size), + flatten_objects != 0, normalize != 0), + "iter_arrow"); + state->task_future = handle.future; + } else { + std::vector files_vec{cfg.file_path}; + auto handle = rt->scope( + "iter_arrow_parallel", + [sp, files = std::move(files_vec), index_dir = cfg.index_dir, + checkpoint_size = cfg.checkpoint_size, + auto_build_index = cfg.auto_build_index, rc, bs, + norm = normalize != 0, + max_workers](CoroScope &scope) mutable -> CoroTask { + co_await produce_arrow_batches_for_files( + scope, sp, std::move(files), index_dir, checkpoint_size, + auto_build_index, rc, bs, norm, max_workers); + }); + state->task_future = handle.future; + } } catch (const std::exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL; } - TraceReaderIteratorObject *it = make_iterator(state, IteratorMode::JSON); + TraceReaderIteratorObject *it = make_arrow_iterator(std::move(state)); return (PyObject *)it; } -static PyObject *TraceReader_read_lines_json(TraceReaderObject *self, - PyObject *args, PyObject *kwds) { - 
PyObject *iter = TraceReader_iter_lines_json(self, args, kwds); - if (!iter) return NULL; - PyObject *list = PySequence_List(iter); - Py_DECREF(iter); - return list; -} - -#ifdef DFTRACER_UTILS_ENABLE_ARROW - -static PyObject *TraceReader_iter_arrow(TraceReaderObject *self, PyObject *args, - PyObject *kwds) { +// Build ArrowIteratorState + spawn the producer task. Same plumbing as +// TraceReader_iter_arrow but returns the state so callers can wrap it as +// either a per-batch iterator or an ArrowArrayStream. +static std::shared_ptr spawn_arrow_producer( + TraceReaderObject *self, PyObject *args, PyObject *kwds) { static const char *kwlist[] = { - "batch_size", "start_line", "end_line", "start_byte", - "end_byte", "buffer_size", "query", "flatten_objects", - "normalize", NULL}; + "batch_size", "start_line", "end_line", "start_byte", + "end_byte", "buffer_size", "query", "flatten_objects", + "normalize", "memory_budget", NULL}; Py_ssize_t batch_size = 10000; Py_ssize_t start_line = 0, end_line = 0; Py_ssize_t start_byte = 0, end_byte = 0; Py_ssize_t buffer_size = 4 * 1024 * 1024; const char *query_str = NULL; - int flatten_objects = 0; + int flatten_objects = 1; // default: expand top-level objects int normalize = 0; + Py_ssize_t memory_budget = 0; if (!PyArg_ParseTupleAndKeywords( - args, kwds, "|nnnnnnzpp", (char **)kwlist, &batch_size, &start_line, - &end_line, &start_byte, &end_byte, &buffer_size, &query_str, - &flatten_objects, &normalize)) { - return NULL; + args, kwds, "|nnnnnnzppn", (char **)kwlist, &batch_size, + &start_line, &end_line, &start_byte, &end_byte, &buffer_size, + &query_str, &flatten_objects, &normalize, &memory_budget)) { + return nullptr; } if (batch_size <= 0) { PyErr_SetString(PyExc_ValueError, "batch_size must be > 0"); - return NULL; + return nullptr; } if (start_line < 0 || end_line < 0 || start_byte < 0 || end_byte < 0 || buffer_size <= 0) { PyErr_SetString( PyExc_ValueError, "range arguments must be >= 0; buffer_size must be > 0"); - 
return NULL; + return nullptr; } TraceReaderConfig cfg; @@ -1089,7 +2765,7 @@ static PyObject *TraceReader_iter_arrow(TraceReaderObject *self, PyObject *args, cfg = build_config(self); } catch (const std::exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); - return NULL; + return nullptr; } ReadConfig rc; @@ -1098,40 +2774,702 @@ static PyObject *TraceReader_iter_arrow(TraceReaderObject *self, PyObject *args, rc.start_byte = static_cast(start_byte); rc.end_byte = static_cast(end_byte); rc.buffer_size = static_cast(buffer_size); + rc.flatten_objects = flatten_objects != 0; if (query_str) rc.query = query_str; auto state = std::make_shared(); + state->memory_budget_bytes = dftracer::utils::compute_memory_budget( + static_cast(memory_budget)); Runtime *rt = get_runtime(self); + std::size_t max_workers = rt->threads(); + auto bs = static_cast(batch_size); + std::size_t capacity = dftracer::utils::compute_channel_capacity( + state->memory_budget_bytes, bs * ESTIMATED_BYTES_PER_ARROW_ROW, + max_workers); + state->channel = + dftracer::utils::coro::make_channel( + capacity); + auto *sp = state.get(); + try { - auto handle = - rt->submit(produce_arrow_batches( - state, cfg, rc, static_cast(batch_size), - flatten_objects != 0, normalize != 0), - "iter_arrow"); - state->task_future = handle.future; + bool is_dir = fs::is_directory(cfg.file_path); + if (is_dir) { + auto handle = rt->scope( + "iter_arrow_parallel", + [sp, dir_path = cfg.file_path, index_dir = cfg.index_dir, + checkpoint_size = cfg.checkpoint_size, + auto_build_index = cfg.auto_build_index, rc, bs, + norm = normalize != 0, + max_workers](CoroScope &scope) -> CoroTask { + co_await produce_arrow_batches_parallel( + scope, sp, dir_path, index_dir, checkpoint_size, + auto_build_index, rc, bs, norm, max_workers); + }); + state->task_future = handle.future; + } else { + auto handle = rt->submit( + produce_arrow_batches(state, state->channel->producer(), cfg, + rc, static_cast(batch_size), + flatten_objects 
!= 0, normalize != 0), + "iter_arrow"); + state->task_future = handle.future; + } } catch (const std::exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); - return NULL; + return nullptr; } - TraceReaderIteratorObject *it = make_arrow_iterator(std::move(state)); - return (PyObject *)it; + return state; +} + +static PyObject *TraceReader_iter_arrow_stream(TraceReaderObject *self, + PyObject *args, PyObject *kwds) { + auto state = spawn_arrow_producer(self, args, kwds); + if (!state) return NULL; + return make_arrow_batch_stream(std::move(state)); } static PyObject *TraceReader_read_arrow(TraceReaderObject *self, PyObject *args, PyObject *kwds) { - PyObject *iter = TraceReader_iter_arrow(self, args, kwds); - if (!iter) return NULL; - PyObject *list = PySequence_List(iter); - Py_DECREF(iter); - if (!list) return NULL; - - return wrap_arrow_table(list); + auto state = spawn_arrow_producer(self, args, kwds); + if (!state) return NULL; + PyObject *stream = make_arrow_batch_stream(std::move(state)); + if (!stream) return NULL; + return dftracer::utils::python::wrap_arrow_stream_table(stream); } #endif // DFTRACER_UTILS_ENABLE_ARROW +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + +static int parse_str_list_trace(PyObject *obj, std::vector &out, + const char *param_name) { + if (!obj || obj == Py_None) return 0; + if (!PyList_Check(obj)) { + PyErr_Format(PyExc_TypeError, "%s must be a list of str", param_name); + return -1; + } + Py_ssize_t n = PyList_Size(obj); + for (Py_ssize_t i = 0; i < n; i++) { + const char *s = PyUnicode_AsUTF8(PyList_GetItem(obj, i)); + if (!s) return -1; + out.emplace_back(s); + } + return 0; +} + +static PyObject *TraceReader_write_arrow(TraceReaderObject *self, + PyObject *args, PyObject *kwds) { + static const char *kwlist[] = {"path", "views", "chunk_size_mb", + "compression", "batch_size", NULL}; + const char *path = NULL; + PyObject *views_obj = Py_None; + int chunk_size_mb = 32; + const char *compression_str = "zstd"; + Py_ssize_t 
batch_size = 10000; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|Oisn", (char **)kwlist, + &path, &views_obj, &chunk_size_mb, + &compression_str, &batch_size)) { + return NULL; + } + + if (chunk_size_mb < 0) { + PyErr_SetString(PyExc_ValueError, "chunk_size_mb must be >= 0"); + return NULL; + } + + std::vector views; + if (views_obj && views_obj != Py_None) { + if (!PyList_Check(views_obj)) { + PyErr_SetString(PyExc_TypeError, "views must be a list or None"); + return NULL; + } + Py_ssize_t n = PyList_Size(views_obj); + for (Py_ssize_t i = 0; i < n; i++) { + PyObject *item = PyList_GetItem(views_obj, i); + ViewDefinition vd; + + if (PyUnicode_Check(item)) { + const char *name = PyUnicode_AsUTF8(item); + if (!name) return NULL; + std::string name_str(name); + if (name_str == "io") { + vd = ViewDefinition::io_view(); + } else if (name_str == "compute") { + vd = ViewDefinition::compute_view(); + } else if (name_str == "dlio") { + vd = ViewDefinition::dlio_view(); + } else { + vd.with_name(name_str); + } + } else if (PyDict_Check(item)) { + PyObject *name_obj = PyDict_GetItemString(item, "name"); + if (!name_obj || !PyUnicode_Check(name_obj)) { + PyErr_SetString(PyExc_ValueError, + "view dict must have 'name' string"); + return NULL; + } + vd.with_name(PyUnicode_AsUTF8(name_obj)); + + PyObject *query_obj = PyDict_GetItemString(item, "query"); + if (query_obj && query_obj != Py_None) { + if (!PyUnicode_Check(query_obj)) { + PyErr_SetString(PyExc_ValueError, + "view 'query' must be a string"); + return NULL; + } + vd.with_query(PyUnicode_AsUTF8(query_obj)); + } + + PyObject *meta_obj = + PyDict_GetItemString(item, "include_metadata"); + if (meta_obj && meta_obj != Py_None) { + vd.with_include_metadata(PyObject_IsTrue(meta_obj)); + } + } else { + PyErr_SetString(PyExc_TypeError, + "views list must contain strings or dicts"); + return NULL; + } + views.push_back(std::move(vd)); + } + } + + IpcCompression compression = IpcCompression::ZSTD; + if (compression_str) { + 
std::string comp_lower(compression_str); + for (auto &c : comp_lower) c = std::tolower(c); + if (comp_lower == "none") { + compression = IpcCompression::NONE; + } else if (comp_lower == "zstd") { +#ifdef DFTRACER_UTILS_ENABLE_ZSTD + compression = IpcCompression::ZSTD; +#else + PyErr_SetString( + PyExc_ValueError, + "ZSTD compression not available (built without ZSTD)"); + return NULL; +#endif + } else { + PyErr_Format(PyExc_ValueError, + "Unknown compression: %s (use 'none' or 'zstd')", + compression_str); + return NULL; + } + } + + int64_t chunk_size_bytes = + static_cast(chunk_size_mb) * 1024 * 1024; + + std::string file_path = PyUnicode_AsUTF8(self->file_path); + std::string index_path; + const char *idx = PyUnicode_AsUTF8(self->index_dir); + if (idx && idx[0] != '\0') { + index_path = idx; + } + std::size_t checkpoint_size = self->checkpoint_size; + + std::string output_path(path); + WriteArrowResult result; + std::string error_msg; + + Py_BEGIN_ALLOW_THREADS try { + Runtime *rt = get_runtime(self); + result = + rt->submit(write_arrow_pipeline( + file_path, index_path, checkpoint_size, + std::move(views), output_path, chunk_size_bytes, + compression, static_cast(batch_size)), + "write_arrow") + .get(); + } catch (const std::exception &e) { + error_msg = e.what(); + } + Py_END_ALLOW_THREADS + + if (!error_msg.empty()) { + PyErr_SetString(PyExc_RuntimeError, error_msg.c_str()); + return NULL; + } + + if (!result.error.empty()) { + PyErr_SetString(PyExc_RuntimeError, result.error.c_str()); + return NULL; + } + + // Build result dict + PyObject *dict = PyDict_New(); + if (!dict) return NULL; + + // Build files list per partition + PyObject *partitions_dict = PyDict_New(); + if (!partitions_dict) { + Py_DECREF(dict); + return NULL; + } + + for (const auto &[partition_name, partition_stats] : + result.stats.partitions) { + PyObject *partition_dict = PyDict_New(); + if (!partition_dict) { + Py_DECREF(partitions_dict); + Py_DECREF(dict); + return NULL; + } + + PyObject 
*files_list = PyList_New(0); + if (!files_list) { + Py_DECREF(partition_dict); + Py_DECREF(partitions_dict); + Py_DECREF(dict); + return NULL; + } + + for (const auto &f : partition_stats.files) { + PyObject *file_str = PyUnicode_FromString(f.c_str()); + if (!file_str || PyList_Append(files_list, file_str) < 0) { + Py_XDECREF(file_str); + Py_DECREF(files_list); + Py_DECREF(partition_dict); + Py_DECREF(partitions_dict); + Py_DECREF(dict); + return NULL; + } + Py_DECREF(file_str); + } + + PyDict_SetItemString(partition_dict, "files", files_list); + PyDict_SetItemString(partition_dict, "rows", + PyLong_FromLongLong(partition_stats.total_rows)); + PyDict_SetItemString( + partition_dict, "bytes", + PyLong_FromLongLong(partition_stats.total_uncompressed_bytes)); + Py_DECREF(files_list); + + PyObject *key = partition_name.empty() + ? PyUnicode_FromString("_default") + : PyUnicode_FromString(partition_name.c_str()); + PyDict_SetItem(partitions_dict, key, partition_dict); + Py_DECREF(key); + Py_DECREF(partition_dict); + } + + PyDict_SetItemString(dict, "partitions", partitions_dict); + PyDict_SetItemString(dict, "total_rows", + PyLong_FromLongLong(result.stats.total_rows)); + PyDict_SetItemString( + dict, "total_bytes", + PyLong_FromLongLong(result.stats.total_uncompressed_bytes)); + PyDict_SetItemString(dict, "chunks_scanned", + PyLong_FromUnsignedLongLong(result.chunks_scanned)); + PyDict_SetItemString(dict, "chunks_skipped", + PyLong_FromUnsignedLongLong(result.chunks_skipped)); + Py_DECREF(partitions_dict); + + return dict; +} + +static PyObject *TraceReader_get_view_chunks(TraceReaderObject *self, + PyObject *args, PyObject *kwds) { + static const char *kwlist[] = {"view", NULL}; + PyObject *view_obj = Py_None; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", (char **)kwlist, + &view_obj)) { + return NULL; + } + + ViewDefinition view; + if (view_obj && view_obj != Py_None) { + if (PyUnicode_Check(view_obj)) { + const char *name = PyUnicode_AsUTF8(view_obj); + if 
(!name) return NULL; + std::string name_str(name); + if (name_str == "io") { + view = ViewDefinition::io_view(); + } else if (name_str == "compute") { + view = ViewDefinition::compute_view(); + } else if (name_str == "dlio") { + view = ViewDefinition::dlio_view(); + } else { + view.with_name(name_str); + } + } else if (PyDict_Check(view_obj)) { + PyObject *name_obj = PyDict_GetItemString(view_obj, "name"); + if (name_obj && PyUnicode_Check(name_obj)) { + view.with_name(PyUnicode_AsUTF8(name_obj)); + } + PyObject *query_obj = PyDict_GetItemString(view_obj, "query"); + if (query_obj && query_obj != Py_None && + PyUnicode_Check(query_obj)) { + view.with_query(PyUnicode_AsUTF8(query_obj)); + } + } else { + PyErr_SetString(PyExc_TypeError, "view must be a string or dict"); + return NULL; + } + } + + std::string file_path = PyUnicode_AsUTF8(self->file_path); + std::string index_path; + const char *idx = PyUnicode_AsUTF8(self->index_dir); + if (idx && idx[0] != '\0') { + index_path = idx; + } + std::size_t checkpoint_size = self->checkpoint_size; + + GetViewChunksResult result; + std::string error_msg; + + Py_BEGIN_ALLOW_THREADS try { + Runtime *rt = get_runtime(self); + result = rt->submit(get_view_chunks_pipeline(file_path, index_path, + checkpoint_size, view), + "get_view_chunks") + .get(); + } catch (const std::exception &e) { + error_msg = e.what(); + } + Py_END_ALLOW_THREADS + + if (!error_msg.empty()) { + PyErr_SetString(PyExc_RuntimeError, error_msg.c_str()); + return NULL; + } + + if (!result.error.empty()) { + PyErr_SetString(PyExc_RuntimeError, result.error.c_str()); + return NULL; + } + + PyObject *dict = PyDict_New(); + if (!dict) return NULL; + + PyObject *chunks_list = PyList_New(result.chunks.size()); + if (!chunks_list) { + Py_DECREF(dict); + return NULL; + } + + for (std::size_t i = 0; i < result.chunks.size(); ++i) { + const auto &chunk = result.chunks[i]; + PyObject *chunk_dict = PyDict_New(); + if (!chunk_dict) { + Py_DECREF(chunks_list); + 
Py_DECREF(dict); + return NULL; + } + PyDict_SetItemString(chunk_dict, "checkpoint_idx", + PyLong_FromUnsignedLongLong(chunk.checkpoint_idx)); + PyDict_SetItemString(chunk_dict, "start_byte", + PyLong_FromSize_t(chunk.start_byte)); + PyDict_SetItemString(chunk_dict, "end_byte", + PyLong_FromSize_t(chunk.end_byte)); + PyList_SetItem(chunks_list, i, chunk_dict); + } + + PyDict_SetItemString(dict, "chunks", chunks_list); + PyDict_SetItemString(dict, "total_checkpoints", + PyLong_FromUnsignedLongLong(result.total_checkpoints)); + PyDict_SetItemString( + dict, "skipped_checkpoints", + PyLong_FromUnsignedLongLong(result.skipped_checkpoints)); + PyDict_SetItemString(dict, "file_may_match", + PyBool_FromLong(result.file_may_match ? 1 : 0)); + Py_DECREF(chunks_list); + + return dict; +} + +static PyObject *TraceReader_write_view_chunk(TraceReaderObject *self, + PyObject *args, PyObject *kwds) { + static const char *kwlist[] = { + "output_file", "checkpoint_idx", "start_byte", "end_byte", + "view", "compression", "batch_size", NULL}; + const char *output_file = NULL; + unsigned long long checkpoint_idx = 0; + Py_ssize_t start_byte = 0; + Py_ssize_t end_byte = 0; + PyObject *view_obj = Py_None; + const char *compression_str = "zstd"; + Py_ssize_t batch_size = 10000; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "sKnn|Osn", (char **)kwlist, + &output_file, &checkpoint_idx, &start_byte, + &end_byte, &view_obj, &compression_str, + &batch_size)) { + return NULL; + } + + IpcCompression compression = IpcCompression::ZSTD; + if (compression_str) { + std::string comp_lower(compression_str); + for (auto &c : comp_lower) c = std::tolower(c); + if (comp_lower == "none") { + compression = IpcCompression::NONE; + } else if (comp_lower == "zstd") { +#ifdef DFTRACER_UTILS_ENABLE_ZSTD + compression = IpcCompression::ZSTD; +#else + PyErr_SetString(PyExc_ValueError, "ZSTD compression not available"); + return NULL; +#endif + } + } + + ViewDefinition view; + if (view_obj && view_obj != 
Py_None) { + if (PyUnicode_Check(view_obj)) { + const char *name = PyUnicode_AsUTF8(view_obj); + if (!name) return NULL; + std::string name_str(name); + if (name_str == "io") { + view = ViewDefinition::io_view(); + } else if (name_str == "compute") { + view = ViewDefinition::compute_view(); + } else if (name_str == "dlio") { + view = ViewDefinition::dlio_view(); + } else { + view.with_name(name_str); + } + } else if (PyDict_Check(view_obj)) { + PyObject *name_obj = PyDict_GetItemString(view_obj, "name"); + if (name_obj && PyUnicode_Check(name_obj)) { + view.with_name(PyUnicode_AsUTF8(name_obj)); + } + PyObject *query_obj = PyDict_GetItemString(view_obj, "query"); + if (query_obj && query_obj != Py_None && + PyUnicode_Check(query_obj)) { + view.with_query(PyUnicode_AsUTF8(query_obj)); + } + } + } + + std::string file_path = PyUnicode_AsUTF8(self->file_path); + std::string index_path; + const char *idx = PyUnicode_AsUTF8(self->index_dir); + if (idx && idx[0] != '\0') { + index_path = idx; + } + std::size_t checkpoint_size = self->checkpoint_size; + + WriteViewChunkResult result; + std::string error_msg; + + Py_BEGIN_ALLOW_THREADS try { + Runtime *rt = get_runtime(self); + result = + rt->submit(write_view_chunk_pipeline( + file_path, index_path, checkpoint_size, view, + checkpoint_idx, static_cast(start_byte), + static_cast(end_byte), + std::string(output_file), compression, + static_cast(batch_size)), + "write_view_chunk") + .get(); + } catch (const std::exception &e) { + error_msg = e.what(); + } + Py_END_ALLOW_THREADS + + if (!error_msg.empty()) { + PyErr_SetString(PyExc_RuntimeError, error_msg.c_str()); + return NULL; + } + + if (!result.error.empty()) { + PyErr_SetString(PyExc_RuntimeError, result.error.c_str()); + return NULL; + } + + PyObject *dict = PyDict_New(); + if (!dict) return NULL; + + PyDict_SetItemString(dict, "output_file", + PyUnicode_FromString(result.output_file.c_str())); + PyDict_SetItemString(dict, "events_matched", + 
PyLong_FromUnsignedLongLong(result.events_matched)); + PyDict_SetItemString(dict, "events_scanned", + PyLong_FromUnsignedLongLong(result.events_scanned)); + PyDict_SetItemString(dict, "rows_written", + PyLong_FromLongLong(result.rows_written)); + PyDict_SetItemString(dict, "bytes_written", + PyLong_FromLongLong(result.bytes_written)); + + return dict; +} + +static PyObject *TraceReader_write_view_chunks(TraceReaderObject *self, + PyObject *args, PyObject *kwds) { + static const char *kwlist[] = {"chunks", "output_dir", "view", + "compression", "batch_size", NULL}; + PyObject *chunks_list = NULL; + const char *output_dir = NULL; + PyObject *view_obj = Py_None; + const char *compression_str = "zstd"; + Py_ssize_t batch_size = 10000; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "Os|Osn", (char **)kwlist, + &chunks_list, &output_dir, &view_obj, + &compression_str, &batch_size)) { + return NULL; + } + + if (!PyList_Check(chunks_list)) { + PyErr_SetString(PyExc_TypeError, "chunks must be a list"); + return NULL; + } + + IpcCompression compression = IpcCompression::ZSTD; + if (strcmp(compression_str, "none") == 0) { + compression = IpcCompression::NONE; + } else if (strcmp(compression_str, "zstd") != 0) { + PyErr_SetString(PyExc_ValueError, + "compression must be 'zstd' or 'none'"); + return NULL; + } + + ViewDefinition view; + if (view_obj && view_obj != Py_None) { + if (PyUnicode_Check(view_obj)) { + const char *name = PyUnicode_AsUTF8(view_obj); + if (!name) return NULL; + std::string name_str(name); + if (name_str == "io") { + view = ViewDefinition::io_view(); + } else if (name_str == "compute") { + view = ViewDefinition::compute_view(); + } else if (name_str == "dlio") { + view = ViewDefinition::dlio_view(); + } else { + view.with_name(name_str); + } + } else if (PyDict_Check(view_obj)) { + PyObject *name_obj = PyDict_GetItemString(view_obj, "name"); + if (name_obj && PyUnicode_Check(name_obj)) { + view.with_name(PyUnicode_AsUTF8(name_obj)); + } + PyObject 
*query_obj = PyDict_GetItemString(view_obj, "query"); + if (query_obj && query_obj != Py_None && + PyUnicode_Check(query_obj)) { + view.with_query(PyUnicode_AsUTF8(query_obj)); + } + } + } + + std::vector chunks; + Py_ssize_t num_chunks = PyList_Size(chunks_list); + chunks.reserve(static_cast(num_chunks)); + + for (Py_ssize_t i = 0; i < num_chunks; i++) { + PyObject *chunk_dict = PyList_GetItem(chunks_list, i); + if (!PyDict_Check(chunk_dict)) { + PyErr_SetString(PyExc_TypeError, "each chunk must be a dict"); + return NULL; + } + + ChunkDescriptor desc; + + PyObject *cp_idx = PyDict_GetItemString(chunk_dict, "checkpoint_idx"); + PyObject *start = PyDict_GetItemString(chunk_dict, "start_byte"); + PyObject *end = PyDict_GetItemString(chunk_dict, "end_byte"); + + if (!cp_idx || !start || !end) { + PyErr_SetString( + PyExc_KeyError, + "chunk must have checkpoint_idx, start_byte, end_byte"); + return NULL; + } + + desc.checkpoint_idx = + static_cast(PyLong_AsUnsignedLongLong(cp_idx)); + desc.start_byte = + static_cast(PyLong_AsUnsignedLongLong(start)); + desc.end_byte = + static_cast(PyLong_AsUnsignedLongLong(end)); + + char filename[64]; + snprintf(filename, sizeof(filename), "chunk-%05llu.arrow", + (unsigned long long)desc.checkpoint_idx); + desc.output_file = std::string(output_dir) + "/" + filename; + + chunks.push_back(std::move(desc)); + } + + std::string file_path = PyUnicode_AsUTF8(self->file_path); + std::string index_path; + const char *idx = PyUnicode_AsUTF8(self->index_dir); + if (idx && idx[0] != '\0') { + index_path = idx; + } + std::size_t checkpoint_size = self->checkpoint_size; + + WriteViewChunksResult result; + std::string error_msg; + + Py_BEGIN_ALLOW_THREADS try { + Runtime *rt = get_runtime(self); + result = rt->submit(write_view_chunks_pipeline( + file_path, index_path, checkpoint_size, view, + std::move(chunks), compression, + static_cast(batch_size)), + "write_view_chunks") + .get(); + } catch (const std::exception &e) { + error_msg = e.what(); 
+ } + Py_END_ALLOW_THREADS + + if (!error_msg.empty()) { + PyErr_SetString(PyExc_RuntimeError, error_msg.c_str()); + return NULL; + } + + PyObject *dict = PyDict_New(); + if (!dict) return NULL; + + PyObject *results_list = + PyList_New(static_cast(result.results.size())); + if (!results_list) { + Py_DECREF(dict); + return NULL; + } + + for (std::size_t i = 0; i < result.results.size(); i++) { + const auto &r = result.results[i]; + PyObject *item = PyDict_New(); + if (!item) { + Py_DECREF(results_list); + Py_DECREF(dict); + return NULL; + } + PyDict_SetItemString(item, "output_file", + PyUnicode_FromString(r.output_file.c_str())); + PyDict_SetItemString(item, "rows_written", + PyLong_FromLongLong(r.rows_written)); + PyDict_SetItemString(item, "events_matched", + PyLong_FromUnsignedLongLong(r.events_matched)); + if (!r.error.empty()) { + PyDict_SetItemString(item, "error", + PyUnicode_FromString(r.error.c_str())); + } + PyList_SetItem(results_list, static_cast(i), item); + } + + PyDict_SetItemString(dict, "results", results_list); + Py_DECREF(results_list); + PyDict_SetItemString(dict, "total_rows", + PyLong_FromLongLong(result.total_rows)); + PyDict_SetItemString(dict, "total_events_matched", + PyLong_FromLongLong(result.total_events_matched)); + + return dict; +} + +#endif // DFTRACER_UTILS_ENABLE_ARROW_IPC + static PyObject *TraceReader_enter(TraceReaderObject *self, PyObject *Py_UNUSED(ignored)) { Py_INCREF(self); @@ -1235,9 +3573,12 @@ static PyMethodDef TraceReader_methods[] = { " start_byte (int): First byte offset (0 = beginning).\n" " end_byte (int): Last byte offset (0 = end of file).\n" " buffer_size (int): Internal read buffer size in bytes.\n"}, - {"read_raw", (PyCFunction)TraceReader_read_raw, + {"iter_json", (PyCFunction)TraceReader_iter_json, METH_VARARGS | METH_KEYWORDS, - "Read all raw chunks and return as list.\n" + "Return an iterator over parsed JSON events as Python dicts.\n" + "\n" + "Each event is parsed once in C++ (single-pass simdjson 
ondemand)\n" + "and yielded as a Python dict. No double-parsing overhead.\n" "\n" "Args:\n" " start_line (int): First line (0 = beginning).\n" @@ -1245,28 +3586,25 @@ static PyMethodDef TraceReader_methods[] = { " start_byte (int): First byte offset (0 = beginning).\n" " end_byte (int): Last byte offset (0 = end of file).\n" " buffer_size (int): Internal read buffer size in bytes.\n" - " line_aligned (bool): Align chunks to line boundaries.\n" - " multi_line (bool): Allow multiple lines per chunk.\n"}, - {"iter_lines_json", (PyCFunction)TraceReader_iter_lines_json, + " query (str): Optional query filter.\n" + " batch_size (int): Events per internal batch (default 1024).\n"}, + {"read_json", (PyCFunction)TraceReader_read_json_py, METH_VARARGS | METH_KEYWORDS, - "Return an iterator over parsed JSON objects.\n" + "Read all events as parsed Python dicts (list).\n" "\n" - "Args:\n" - " start_line (int): First line (0 = beginning).\n" - " end_line (int): Last line (0 = end of file).\n" - " start_byte (int): First byte offset (0 = beginning).\n" - " end_byte (int): Last byte offset (0 = end of file).\n" - " buffer_size (int): Internal read buffer size in bytes.\n"}, - {"read_lines_json", (PyCFunction)TraceReader_read_lines_json, + "Equivalent to list(iter_json(...)).\n"}, + {"read_raw", (PyCFunction)TraceReader_read_raw, METH_VARARGS | METH_KEYWORDS, - "Read all lines as parsed JSON objects.\n" + "Read all raw chunks and return as list.\n" "\n" "Args:\n" " start_line (int): First line (0 = beginning).\n" " end_line (int): Last line (0 = end of file).\n" " start_byte (int): First byte offset (0 = beginning).\n" " end_byte (int): Last byte offset (0 = end of file).\n" - " buffer_size (int): Internal read buffer size in bytes.\n"}, + " buffer_size (int): Internal read buffer size in bytes.\n" + " line_aligned (bool): Align chunks to line boundaries.\n" + " multi_line (bool): Allow multiple lines per chunk.\n"}, #ifdef DFTRACER_UTILS_ENABLE_ARROW {"iter_arrow", 
(PyCFunction)TraceReader_iter_arrow, METH_VARARGS | METH_KEYWORDS, @@ -1279,6 +3617,12 @@ static PyMethodDef TraceReader_methods[] = { " start_byte (int): First byte offset (0 = beginning).\n" " end_byte (int): Last byte offset (0 = end of file).\n" " buffer_size (int): Internal read buffer size in bytes.\n"}, + {"iter_arrow_stream", (PyCFunction)TraceReader_iter_arrow_stream, + METH_VARARGS | METH_KEYWORDS, + "Return an _ArrowBatchStream that exposes Arrow record batches\n" + "via the Arrow C Data Interface stream protocol\n" + "(__arrow_c_stream__). PyArrow can drain the producer channel\n" + "with a single call, without per-batch Python iteration.\n"}, {"read_arrow", (PyCFunction)TraceReader_read_arrow, METH_VARARGS | METH_KEYWORDS, "Read all events as a materialized ArrowTable.\n" @@ -1290,6 +3634,64 @@ static PyMethodDef TraceReader_methods[] = { " start_byte (int): First byte offset (0 = beginning).\n" " end_byte (int): Last byte offset (0 = end of file).\n" " buffer_size (int): Internal read buffer size in bytes.\n"}, +#endif +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + {"write_arrow", (PyCFunction)TraceReader_write_arrow, + METH_VARARGS | METH_KEYWORDS, + "Write trace data to partitioned Arrow IPC files.\n" + "\n" + "Args:\n" + " path (str): Output directory path.\n" + " partition_by (list[str] or None): Column names to partition by.\n" + " num_buckets (int): Number of hash buckets (0 = no bucketing).\n" + " chunk_size_mb (int): Max uncompressed MB per file (default 32).\n" + " compression (str): 'zstd' or 'none' (default 'zstd').\n" + " batch_size (int): Rows per internal batch (default 10000).\n" + " normalize (bool): Use normalized schema (default False).\n" + "\n" + "Returns:\n" + " dict: Statistics including partitions, total_rows, total_bytes.\n"}, + {"get_view_chunks", (PyCFunction)TraceReader_get_view_chunks, + METH_VARARGS | METH_KEYWORDS, + "Get candidate chunks for a view after bloom filter pruning.\n" + "\n" + "Args:\n" + " view (str or dict): View 
name ('io', 'compute', 'dlio') or\n" + " dict with 'name' and optional 'query'.\n" + "\n" + "Returns:\n" + " dict: chunks list, total_checkpoints, skipped_checkpoints.\n"}, + {"write_view_chunk", (PyCFunction)TraceReader_write_view_chunk, + METH_VARARGS | METH_KEYWORDS, + "Write a single chunk to an Arrow IPC file.\n" + "\n" + "Args:\n" + " output_file (str): Path to output Arrow IPC file.\n" + " checkpoint_idx (int): Checkpoint index.\n" + " start_byte (int): Start byte offset.\n" + " end_byte (int): End byte offset.\n" + " view (str or dict): View definition.\n" + " compression (str): 'zstd' or 'none' (default 'zstd').\n" + " batch_size (int): Events per batch (default 10000).\n" + "\n" + "Returns:\n" + " dict: output_file, events_matched, rows_written, bytes_written.\n"}, + {"write_view_chunks", (PyCFunction)TraceReader_write_view_chunks, + METH_VARARGS | METH_KEYWORDS, + "Write multiple chunks to Arrow IPC files in parallel.\n" + "\n" + "All chunks are processed concurrently on the Runtime thread pool.\n" + "\n" + "Args:\n" + " chunks (list): List of dicts with checkpoint_idx, start_byte, " + "end_byte.\n" + " output_dir (str): Directory for output Arrow IPC files.\n" + " view (str or dict): View definition.\n" + " compression (str): 'zstd' or 'none' (default 'zstd').\n" + " batch_size (int): Events per batch (default 10000).\n" + "\n" + "Returns:\n" + " dict: results list, total_rows, total_events_matched.\n"}, #endif {"get_max_bytes", (PyCFunction)TraceReader_get_max_bytes, METH_NOARGS, "Get the maximum byte position (0 if unknown for compressed\n" @@ -1307,8 +3709,8 @@ static PyMethodDef TraceReader_methods[] = { {NULL}}; static PyGetSetDef TraceReader_getsetters[] = { - {"file_path", (getter)TraceReader_get_file_path, NULL, - "Path to the trace file", NULL}, + {"path", (getter)TraceReader_get_file_path, NULL, + "Path to the trace file or directory", NULL}, {"index_dir", (getter)TraceReader_get_index_dir, NULL, "Directory for index files", NULL}, 
{"has_index", (getter)TraceReader_get_has_index, NULL, @@ -1340,7 +3742,6 @@ PyTypeObject TraceReaderType = { "TraceReader(file_path: str, index_dir: str = '',\n" " checkpoint_size: int = 33554432,\n" " auto_build_index: bool = False,\n" - " index_threshold: int = 1048576,\n" " runtime: Runtime | None = None)\n" "--\n" "\n" @@ -1357,9 +3758,7 @@ PyTypeObject TraceReaderType = { " building (default 32 MB).\n" " auto_build_index (bool): If True, automatically build an " "index\n" - " when none exists and the file exceeds *index_threshold*.\n" - " index_threshold (int): Minimum file size in bytes before\n" - " auto-indexing is triggered (default 8 MB).\n" + " when none exists.\n" " runtime (Runtime or None): Runtime instance for thread pool " "control.\n" " If None, uses the default global Runtime.\n" diff --git a/src/dftracer/utils/python/trace_reader.h b/src/dftracer/utils/python/trace_reader.h index f1dcddcb..ca2d3fbb 100644 --- a/src/dftracer/utils/python/trace_reader.h +++ b/src/dftracer/utils/python/trace_reader.h @@ -10,7 +10,6 @@ typedef struct { PyObject *index_dir; std::size_t checkpoint_size; int auto_build_index; - std::size_t index_threshold; int has_index; PyObject *runtime_obj; // RuntimeObject* or NULL (uses default) } TraceReaderObject; diff --git a/src/dftracer/utils/python/trace_reader_iterator.cpp b/src/dftracer/utils/python/trace_reader_iterator.cpp index 87bf54a5..36e3fba9 100644 --- a/src/dftracer/utils/python/trace_reader_iterator.cpp +++ b/src/dftracer/utils/python/trace_reader_iterator.cpp @@ -1,9 +1,9 @@ #define PY_SSIZE_T_CLEAN #include -#include +#include +#include #include #include - #ifdef DFTRACER_UTILS_ENABLE_ARROW #include @@ -135,41 +135,41 @@ PyTypeObject ArrowBatchCapsuleType = { #endif // DFTRACER_UTILS_ENABLE_ARROW +static void cancel_and_wait_batch_state(MemoryViewBatchIteratorState *bs) { + bs->cancelled.store(true, std::memory_order_release); + if (bs->channel) bs->channel->close(); + if (bs->task_future.valid()) 
bs->task_future.wait(); +} + +static void cancel_and_wait_json_dict_state(JsonDictIteratorState *js) { + js->cancelled.store(true, std::memory_order_release); + if (js->channel) js->channel->close(); + if (js->task_future.valid()) js->task_future.wait(); +} + static void TraceReaderIterator_dealloc(TraceReaderIteratorObject *self) { #ifdef DFTRACER_UTILS_ENABLE_ARROW if (self->arrow_state) { - auto task_future = self->arrow_state->task_future; self->arrow_state->cancelled.store(true, std::memory_order_release); - self->arrow_state->cv_producer.notify_all(); - self->arrow_state->cv_consumer.notify_all(); // wake blocked __next__ - Py_BEGIN_ALLOW_THREADS { - std::unique_lock lock(self->arrow_state->mtx); - self->arrow_state->cv_consumer.wait(lock, [self] { - return self->arrow_state->done.load(std::memory_order_acquire); - }); - } - if (task_future.valid()) { - task_future.wait(); + if (self->arrow_state->channel) self->arrow_state->channel->close(); + Py_BEGIN_ALLOW_THREADS if (self->arrow_state->task_future.valid()) { + self->arrow_state->task_future.wait(); } Py_END_ALLOW_THREADS self->arrow_state.reset(); } #endif - if (self->state) { - auto task_future = self->state->task_future; - self->state->cancelled.store(true, std::memory_order_release); - self->state->cv_producer.notify_all(); - self->state->cv_consumer.notify_all(); // wake blocked __next__ - Py_BEGIN_ALLOW_THREADS { - std::unique_lock lock(self->state->mtx); - self->state->cv_consumer.wait(lock, [self] { - return self->state->done.load(std::memory_order_acquire); - }); - } - if (task_future.valid()) { - task_future.wait(); - } - Py_END_ALLOW_THREADS self->state.reset(); + if (self->json_dict_state) { + Py_BEGIN_ALLOW_THREADS cancel_and_wait_json_dict_state( + self->json_dict_state.get()); + Py_END_ALLOW_THREADS self->json_dict_state.reset(); + } + if (self->batch_state) { + Py_BEGIN_ALLOW_THREADS cancel_and_wait_batch_state( + self->batch_state.get()); + Py_END_ALLOW_THREADS self->batch_state.reset(); } 
+ Py_XDECREF(self->current_batch); + self->current_batch = NULL; Py_TYPE(self)->tp_free((PyObject *)self); } @@ -179,31 +179,68 @@ static PyObject *TraceReaderIterator_iter(TraceReaderIteratorObject *self) { } static PyObject *TraceReaderIterator_next(TraceReaderIteratorObject *self) { -#ifdef DFTRACER_UTILS_ENABLE_ARROW - if (self->mode == IteratorMode::ARROW) { - auto *astate = self->arrow_state.get(); - ArrowIteratorState::BatchItem batch; - bool cancelled = false; - { - Py_BEGIN_ALLOW_THREADS std::unique_lock lock( - astate->mtx); - astate->cv_consumer.wait(lock, [astate] { - return !astate->queue.empty() || - astate->cancelled.load(std::memory_order_acquire) || - astate->done.load(std::memory_order_acquire); - }); - cancelled = astate->cancelled.load(std::memory_order_acquire) && - astate->queue.empty(); - if (!cancelled) { - batch = std::move(astate->queue.front()); - astate->queue.pop(); + if (self->mode == IteratorMode::JSON_DICT) { + while (true) { + if (self->json_dict_current_batch) { + auto &events = self->json_dict_current_batch->events; + Py_ssize_t n = static_cast(events.size()); + if (self->json_dict_index < n) { + JsonDictValueObject *obj = + (JsonDictValueObject *)JsonDictValueType.tp_alloc( + &JsonDictValueType, 0); + if (!obj) return NULL; + new (&obj->batch) std::shared_ptr( + self->json_dict_current_batch); + obj->event_index = + static_cast(self->json_dict_index); + obj->is_args = false; + self->json_dict_index++; + return (PyObject *)obj; + } + self->json_dict_current_batch.reset(); + self->json_dict_index = 0; } + + auto *js = self->json_dict_state.get(); + std::optional batch; + Py_BEGIN_ALLOW_THREADS batch = js->channel->blocking_receive(); Py_END_ALLOW_THREADS + + if (!batch.has_value()) { + std::lock_guard lock(js->error_mtx); + if (js->error) { + try { + std::rethrow_exception(js->error); + } catch (const std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return NULL; + } catch (...) 
{ + PyErr_SetString(PyExc_RuntimeError, + "Unknown error in json dict iterator"); + return NULL; + } + } + return NULL; + } + + auto dequeued_bytes = dftracer::utils::python::byte_size(*batch); + js->bytes_in_queue.fetch_sub(dequeued_bytes, + std::memory_order_acq_rel); + self->json_dict_current_batch = + std::make_shared(std::move(*batch)); + self->json_dict_index = 0; } - if (cancelled) return NULL; // StopIteration - astate->cv_producer.notify_one(); + } + +#ifdef DFTRACER_UTILS_ENABLE_ARROW + if (self->mode == IteratorMode::ARROW) { + auto *astate = self->arrow_state.get(); + std::optional batch; + Py_BEGIN_ALLOW_THREADS batch = astate->channel->blocking_receive(); + Py_END_ALLOW_THREADS - if (!batch.has_value()) { + if (!batch.has_value()) { + std::lock_guard lock(astate->error_mtx); if (astate->error) { try { std::rethrow_exception(astate->error); @@ -216,9 +253,13 @@ static PyObject *TraceReaderIterator_next(TraceReaderIteratorObject *self) { return NULL; } } - return NULL; // StopIteration + return NULL; } + auto dequeued_bytes = dftracer::utils::python::byte_size(*batch); + astate->bytes_in_queue.fetch_sub(dequeued_bytes, + std::memory_order_acq_rel); + ArrowBatchCapsuleObject *obj = (ArrowBatchCapsuleObject *)ArrowBatchCapsuleType.tp_alloc( &ArrowBatchCapsuleType, 0); @@ -228,72 +269,54 @@ static PyObject *TraceReaderIterator_next(TraceReaderIteratorObject *self) { } #endif - auto *state = self->state.get(); - - // Loop to skip non-JSON lines without recursion (avoids stack overflow - // on files with many delimiter lines like "[" and "]"). 
+ using namespace dftracer::utils::python; while (true) { - std::optional item; - bool cancelled = false; - - { - Py_BEGIN_ALLOW_THREADS std::unique_lock lock( - state->mtx); - state->cv_consumer.wait(lock, [state] { - return !state->queue.empty() || - state->cancelled.load(std::memory_order_acquire) || - state->done.load(std::memory_order_acquire); - }); - cancelled = state->cancelled.load(std::memory_order_acquire) && - state->queue.empty(); - if (!cancelled) { - item = std::move(state->queue.front()); - state->queue.pop(); + if (self->current_batch) { + auto *batch_obj = (MemoryViewBatchObject *)self->current_batch; + Py_ssize_t n = + static_cast(batch_obj->data->num_entries()); + if (self->batch_index < n) { + PyObject *mv = + MemoryViewBatch_item(batch_obj, self->batch_index); + self->batch_index++; + return mv; } - Py_END_ALLOW_THREADS + Py_DECREF(self->current_batch); + self->current_batch = NULL; + self->batch_index = 0; } - if (cancelled) return NULL; // StopIteration - state->cv_producer.notify_one(); - if (!item.has_value()) { - if (state->error) { + auto *bs = self->batch_state.get(); + std::optional batch_data; + Py_BEGIN_ALLOW_THREADS batch_data = bs->channel->blocking_receive(); + Py_END_ALLOW_THREADS + + if (!batch_data.has_value()) { + std::lock_guard lock(bs->error_mtx); + if (bs->error) { try { - std::rethrow_exception(state->error); + std::rethrow_exception(bs->error); } catch (const std::exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL; } catch (...) 
{ PyErr_SetString(PyExc_RuntimeError, - "Unknown error in TraceReaderIterator"); + "Unknown error in batch iterator"); return NULL; } } - return NULL; // StopIteration + return NULL; } - switch (self->mode) { - case IteratorMode::LINES: - return PyUnicode_FromStringAndSize( - item->data(), static_cast(item->size())); - case IteratorMode::JSON: { - const char *trimmed; - std::size_t trimmed_length; - if (!dftracer::utils::json_trim_and_validate( - item->data(), item->size(), trimmed, trimmed_length)) { - continue; // skip non-JSON delimiter lines - } - PyObject *json_obj = JSON_from_data(trimmed, trimmed_length); - if (!json_obj) { - PyErr_Clear(); - continue; // skip unparseable lines - } - return json_obj; - } - case IteratorMode::RAW: - default: - return PyBytes_FromStringAndSize( - item->data(), static_cast(item->size())); - } + auto dequeued_bytes = dftracer::utils::python::byte_size(*batch_data); + bs->bytes_in_queue.fetch_sub(dequeued_bytes, std::memory_order_acq_rel); + + auto *obj = (MemoryViewBatchObject *)MemoryViewBatchType.tp_alloc( + &MemoryViewBatchType, 0); + if (!obj) return NULL; + obj->data = new MemoryViewBatchData(std::move(*batch_data)); + self->current_batch = (PyObject *)obj; + self->batch_index = 0; } } @@ -317,24 +340,24 @@ PyTypeObject TraceReaderIteratorType = { 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT, /* tp_flags */ - "Lazy iterator over TraceReader lines or raw chunks", /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - (getiterfunc)TraceReaderIterator_iter, /* tp_iter */ - (iternextfunc)TraceReaderIterator_next, /* tp_iternext */ - 0, /* tp_methods */ - 0, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ - 0, /* tp_new */ + "Lazy iterator over TraceReader lines or raw chunks", + 0, /* tp_traverse */ + 0, /* tp_clear 
*/ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + (getiterfunc)TraceReaderIterator_iter, /* tp_iter */ + (iternextfunc)TraceReaderIterator_next, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ }; int init_trace_reader_iterator(PyObject *m) { diff --git a/src/dftracer/utils/python/trace_reader_iterator.h b/src/dftracer/utils/python/trace_reader_iterator.h index 11941fd8..6f9c4654 100644 --- a/src/dftracer/utils/python/trace_reader_iterator.h +++ b/src/dftracer/utils/python/trace_reader_iterator.h @@ -2,16 +2,14 @@ #define DFTRACER_UTILS_PYTHON_TRACE_READER_ITERATOR_H #include +#include #include +#include +#include -#include -#include #include -#include -#include -#include #include - +#include #ifdef DFTRACER_UTILS_ENABLE_ARROW #include @@ -24,44 +22,70 @@ extern PyTypeObject ArrowBatchCapsuleType; #endif enum class IteratorMode { - LINES, - RAW, - JSON, + MEMORYVIEW, + JSON_DICT, #ifdef DFTRACER_UTILS_ENABLE_ARROW ARROW, #endif }; -struct IteratorState { - std::queue> queue; - std::mutex mtx; - std::condition_variable cv_producer; - std::condition_variable cv_consumer; +#ifdef DFTRACER_UTILS_ENABLE_ARROW +struct ArrowIteratorState { + using BatchType = + dftracer::utils::utilities::common::arrow::ArrowExportResult; + std::shared_ptr> channel; + std::mutex error_mtx; std::exception_ptr error; std::atomic cancelled{false}; - std::atomic done{false}; - std::size_t max_queue_size = 64; + std::size_t memory_budget_bytes = 0; + std::atomic bytes_in_queue{0}; std::shared_future task_future; + + void set_error(std::exception_ptr e) { + std::lock_guard lock(error_mtx); + if (!error) error = e; + } }; +#endif -#ifdef DFTRACER_UTILS_ENABLE_ARROW -struct ArrowIteratorState { - using BatchItem = std::optional< - 
dftracer::utils::utilities::common::arrow::ArrowExportResult>; - std::queue queue; - std::mutex mtx; - std::condition_variable cv_producer; - std::condition_variable cv_consumer; +using ArgsValue = dftracer::utils::utilities::composites::dft::ArgsValue; +using ArgsMap = dftracer::utils::utilities::composites::dft::ArgsMap; + +struct JsonDictEvent { + ArgsMap top; + ArgsMap args; +}; + +struct JsonDictBatch { + std::vector events; +}; + +struct JsonDictIteratorState { + std::shared_ptr> channel; + std::mutex error_mtx; std::exception_ptr error; std::atomic cancelled{false}; - std::atomic done{false}; - std::size_t max_queue_size = 8; + std::size_t memory_budget_bytes = 0; + std::atomic bytes_in_queue{0}; std::shared_future task_future; + + void set_error(std::exception_ptr e) { + std::lock_guard lock(error_mtx); + if (!error) error = e; + } }; -#endif + +using dftracer::utils::python::MemoryViewBatchIteratorState; +using dftracer::utils::python::MemoryViewBatchObject; +using dftracer::utils::python::MemoryViewBatchType; typedef struct { - PyObject_HEAD std::shared_ptr state; + PyObject_HEAD std::shared_ptr batch_state; + PyObject *current_batch; + Py_ssize_t batch_index; + std::shared_ptr json_dict_state; + std::shared_ptr json_dict_current_batch; + Py_ssize_t json_dict_index; #ifdef DFTRACER_UTILS_ENABLE_ARROW std::shared_ptr arrow_state; #endif diff --git a/src/dftracer/utils/python/utilities/aggregator.cpp b/src/dftracer/utils/python/utilities/aggregator.cpp index 6f7799d4..204f9e49 100644 --- a/src/dftracer/utils/python/utilities/aggregator.cpp +++ b/src/dftracer/utils/python/utilities/aggregator.cpp @@ -1,4 +1,6 @@ #define PY_SSIZE_T_CLEAN +#include +#include #include #include #include @@ -7,9 +9,23 @@ #include #include +#ifdef DFTRACER_UTILS_ENABLE_ARROW +#include +#include +#endif +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC +#include +#include +#include +#endif + +#include +#include +#include #include #include +using dftracer::utils::CoroScope; using 
dftracer::utils::Runtime; using dftracer::utils::coro::CoroTask; using namespace dftracer::utils::utilities::composites::dft::aggregators; @@ -18,8 +34,17 @@ using dftracer::utils::python::wrap_arrow_result; using dftracer::utils::python::wrap_arrow_table; #ifdef DFTRACER_UTILS_ENABLE_ARROW +using dftracer::utils::python::ArrowStreamingIteratorObject; +using dftracer::utils::python::ArrowStreamingIteratorType; +using dftracer::utils::python::StreamingState; using dftracer::utils::utilities::common::arrow::ArrowExportResult; #endif +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC +using dftracer::utils::utilities::common::arrow::IpcCompression; +using dftracer::utils::utilities::common::arrow::PartitionWriter; +using dftracer::utils::utilities::common::arrow::PartitionWriteStats; +using dftracer::utils::utilities::common::query::Query; +#endif static Runtime *get_runtime(AggregatorObject *self) { if (self->runtime_obj) @@ -91,8 +116,30 @@ static int parse_str_list(PyObject *obj, std::vector &out, return 0; } +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC +// Parse a view query string into an optional Query +static int parse_view_query(PyObject *query_obj, std::optional &out) { + if (!query_obj || query_obj == Py_None) { + out = std::nullopt; + return 0; + } + const char *query_str = PyUnicode_AsUTF8(query_obj); + if (!query_str) return -1; + auto parsed = Query::from_string(query_str); + if (!parsed) { + PyErr_Format(PyExc_ValueError, "Invalid query: %s", + parsed.error().format().c_str()); + return -1; + } + out = std::move(*parsed); + return 0; +} +#endif + static int parse_aggregator_args(PyObject *args, PyObject *kwds, - AggregatorInput &input) { + AggregatorInput &input, + std::size_t *buffer_size_out = nullptr, + std::optional *query_out = nullptr) { static const char *kwlist[] = {"directory", "time_interval_ms", "group_keys", @@ -101,11 +148,12 @@ static int parse_aggregator_args(PyObject *args, PyObject *kwds, "index_dir", "checkpoint_size", "force_rebuild", - 
"chunk_size_mb", - "batch_size_mb", + "parallelism", "event_batch_size", "custom_metric_fields", "compute_percentiles", + "buffer_size", + "query", NULL}; const char *directory = NULL; @@ -116,28 +164,40 @@ static int parse_aggregator_args(PyObject *args, PyObject *kwds, const char *index_dir = ""; Py_ssize_t checkpoint_size = 32 * 1024 * 1024; int force_rebuild = 0; - Py_ssize_t chunk_size_mb = 64; - Py_ssize_t batch_size_mb = 4; + Py_ssize_t parallelism = 0; Py_ssize_t event_batch_size = 10000; PyObject *custom_metrics_obj = Py_None; int compute_percentiles = 0; + Py_ssize_t buffer_size = 8; + PyObject *query_obj = Py_None; if (!PyArg_ParseTupleAndKeywords( - args, kwds, "s|dOOOsnpnnnOp", (char **)kwlist, &directory, + args, kwds, "s|dOOOsnpnnOpnO", (char **)kwlist, &directory, &time_interval_ms, &group_keys_obj, &categories_obj, &names_obj, - &index_dir, &checkpoint_size, &force_rebuild, &chunk_size_mb, - &batch_size_mb, &event_batch_size, &custom_metrics_obj, - &compute_percentiles)) + &index_dir, &checkpoint_size, &force_rebuild, ¶llelism, + &event_batch_size, &custom_metrics_obj, &compute_percentiles, + &buffer_size, &query_obj)) return -1; + if (buffer_size_out) { + *buffer_size_out = static_cast(buffer_size); + } + +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + if (query_out) { + if (parse_view_query(query_obj, *query_out) < 0) return -1; + } +#else + (void)query_obj; +#endif + input.directory = directory; input.config.time_interval_us = static_cast(time_interval_ms * 1000.0); input.index_dir = index_dir; input.checkpoint_size = static_cast(checkpoint_size); input.force_rebuild = force_rebuild != 0; - input.chunk_size_mb = static_cast(chunk_size_mb); - input.batch_size_mb = static_cast(batch_size_mb); + input.parallelism = static_cast(parallelism); input.event_batch_size = static_cast(event_batch_size); input.config.compute_percentiles = compute_percentiles != 0; @@ -151,23 +211,49 @@ static int parse_aggregator_args(PyObject *args, PyObject *kwds, return 0; } 
-static int run_aggregator_pipeline(AggregatorObject *self, - const AggregatorInput &input, - std::vector &batches, - std::string &error_msg) { - auto *bp = &batches; +#ifdef DFTRACER_UTILS_ENABLE_ARROW +static int run_aggregator_pipeline( + AggregatorObject *self, const AggregatorInput &input, + std::vector &results, std::string &error_msg, + const std::optional *query = nullptr) { + auto *rp = &results; AggregatorInput input_copy = input; + std::optional query_copy; + if (query) query_copy = *query; Py_BEGIN_ALLOW_THREADS try { Runtime *rt = get_runtime(self); - auto task = [bp, input_copy]() -> CoroTask { - AggregatorUtility util; - auto gen = util.process(input_copy); - while (auto batch = co_await gen.next()) { - bp->push_back(std::move(*batch)); - } - }; - rt->submit(task(), "aggregator").get(); + rt->submit(run_coro_scope( + rt->executor(), + [](CoroScope &scope, std::vector *out, + AggregatorInput input, + std::optional query) -> CoroTask { + AggregatorUtility util; + util.bind_context(scope); + try { + auto gen = util.process(input); + while (auto batch = co_await gen.next()) { + if (batch->entries.empty()) continue; + AggregationBatch filtered; + if (query) { + filtered = batch->filter(*query); + if (filtered.entries.empty()) continue; + } else { + filtered = std::move(*batch); + } + auto arrow_result = filtered.to_arrow(); + if (!arrow_result.valid()) continue; + out->push_back(std::move(arrow_result)); + } + util.unbind_context(); + } catch (...) { + util.unbind_context(); + throw; + } + }, + rp, std::move(input_copy), std::move(query_copy)), + "aggregator") + .get(); } catch (const std::exception &e) { error_msg = e.what(); } @@ -177,38 +263,88 @@ static int run_aggregator_pipeline(AggregatorObject *self, ? 
0 : -1; } +#endif // DFTRACER_UTILS_ENABLE_ARROW #ifdef DFTRACER_UTILS_ENABLE_ARROW +static CoroTask run_aggregator_stream( + CoroScope &scope, std::shared_ptr> state, + AggregatorInput input, std::optional query) { + if (state->cancelled()) { + state->complete(); + co_return; + } + + try { + AggregatorUtility util; + util.bind_context(scope); + auto gen = util.process(input); + + while (auto batch = co_await gen.next()) { + if (state->cancelled()) break; + if (batch->entries.empty()) continue; + + AggregationBatch filtered; + if (query) { + filtered = batch->filter(*query); + if (filtered.entries.empty()) continue; + } else { + filtered = std::move(*batch); + } + + auto arrow_result = filtered.to_arrow(); + if (!arrow_result.valid()) continue; + + auto result_bytes = + dftracer::utils::python::byte_size(arrow_result); + if (!state->push(std::move(arrow_result), result_bytes)) { + break; + } + } + + util.unbind_context(); + state->complete(); + } catch (const std::exception &e) { + state->fail(std::current_exception()); + } catch (...) 
{ + state->fail(std::current_exception()); + } +} + #endif // DFTRACER_UTILS_ENABLE_ARROW // --------------------------------------------------------------------------- -// process() — returns ArrowTable (materialized) +// process() - returns ArrowTable (materialized) // --------------------------------------------------------------------------- static PyObject *Aggregator_process(AggregatorObject *self, PyObject *args, PyObject *kwds) { AggregatorInput input; +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + std::optional query; + if (parse_aggregator_args(args, kwds, input, nullptr, &query) < 0) + return NULL; +#else if (parse_aggregator_args(args, kwds, input) < 0) return NULL; +#endif - std::vector batches; +#ifdef DFTRACER_UTILS_ENABLE_ARROW + std::vector results; std::string error_msg; - if (run_aggregator_pipeline(self, input, batches, error_msg) < 0) { +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + if (run_aggregator_pipeline(self, input, results, error_msg, &query) < 0) { +#else + if (run_aggregator_pipeline(self, input, results, error_msg) < 0) { +#endif PyErr_SetString(PyExc_RuntimeError, error_msg.c_str()); return NULL; } -#ifdef DFTRACER_UTILS_ENABLE_ARROW PyObject *batch_list = PyList_New(0); if (!batch_list) return NULL; - for (const auto &batch : batches) { - if (batch.entries.empty()) continue; - - auto arrow_result = batch.to_arrow(); - if (!arrow_result.valid()) continue; - - PyObject *cap = wrap_arrow_result(std::move(arrow_result)); + for (auto &result : results) { + PyObject *cap = wrap_arrow_result(std::move(result)); if (!cap) { Py_DECREF(batch_list); return NULL; @@ -230,55 +366,402 @@ static PyObject *Aggregator_process(AggregatorObject *self, PyObject *args, } // --------------------------------------------------------------------------- -// iter_arrow() — returns list iterator of ArrowBatch capsules +// iter_arrow() - returns true streaming iterator // --------------------------------------------------------------------------- static PyObject 
*Aggregator_iter_arrow(AggregatorObject *self, PyObject *args, PyObject *kwds) { AggregatorInput input; - if (parse_aggregator_args(args, kwds, input) < 0) return NULL; + std::size_t buffer_size = 8; +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + std::optional query; + if (parse_aggregator_args(args, kwds, input, &buffer_size, &query) < 0) + return NULL; +#else + if (parse_aggregator_args(args, kwds, input, &buffer_size) < 0) return NULL; +#endif + +#ifdef DFTRACER_UTILS_ENABLE_ARROW + auto state = std::make_shared>( + dftracer::utils::compute_memory_budget(0)); + + ArrowStreamingIteratorObject *iter_obj = + (ArrowStreamingIteratorObject *)ArrowStreamingIteratorType.tp_new( + &ArrowStreamingIteratorType, NULL, NULL); + if (!iter_obj) { + return NULL; + } + + iter_obj->cpp_state->state = state; + iter_obj->cpp_state->pull_next = + [state]() -> std::optional { return state->pull(); }; + iter_obj->cpp_state->get_error = [state]() -> std::exception_ptr { + return state->error(); + }; + iter_obj->cpp_state->cancel = [state]() { state->cancel(); }; + + Runtime *rt = get_runtime(self); + AggregatorInput input_copy = input; +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + std::optional query_copy = std::move(query); + Py_BEGIN_ALLOW_THREADS rt->submit( + run_coro_scope(rt->executor(), run_aggregator_stream, state, + std::move(input_copy), std::move(query_copy)), + "aggregator_stream"); +#else + Py_BEGIN_ALLOW_THREADS rt->submit( + run_coro_scope(rt->executor(), run_aggregator_stream, state, + std::move(input_copy), std::nullopt), + "aggregator_stream"); +#endif + Py_END_ALLOW_THREADS + + return (PyObject *)iter_obj; +#else + PyErr_SetString(PyExc_RuntimeError, + "dftracer-utils was built without Arrow support"); + return NULL; +#endif +} + +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + +struct AggregatorViewDef { + std::string name; + std::optional query; +}; + +struct AggregatorWriteArrowResult { + std::unordered_map view_stats; + int64_t total_rows = 0; + int64_t total_bytes = 0; + 
std::string error; +}; + +static CoroTask run_aggregator_write_arrow( + CoroScope &scope, AggregatorWriteArrowResult *out, AggregatorInput input, + std::string output_path, std::vector views, + int64_t chunk_size_bytes, IpcCompression compression) { + try { + // If no views specified, create a default "all" view + if (views.empty()) { + views.push_back({"all", std::nullopt}); + } + + // Open a writer for each view + std::vector writers(views.size()); + for (std::size_t i = 0; i < views.size(); ++i) { + std::string view_path = output_path; + if (views.size() > 1 || views[i].name != "all") { + view_path = output_path + "/" + views[i].name; + } + int rc = co_await writers[i].open(view_path, chunk_size_bytes, + compression); + if (rc != 0) { + out->error = "Failed to open writer for view: " + views[i].name; + co_return; + } + } + + AggregatorUtility util; + util.bind_context(scope); + auto gen = util.process(input); + + while (auto batch = co_await gen.next()) { + if (batch->entries.empty()) continue; + + // Write to each view (with optional filtering) + for (std::size_t i = 0; i < views.size(); ++i) { + AggregationBatch filtered_batch; + if (views[i].query) { + filtered_batch = batch->filter(*views[i].query); + if (filtered_batch.entries.empty()) continue; + } else { + filtered_batch = *batch; + } + + auto arrow_result = filtered_batch.to_arrow(); + if (!arrow_result.valid()) continue; + + int rc = co_await writers[i].write_batch(arrow_result); + if (rc != 0) { + util.unbind_context(); + out->error = + "Failed to write batch for view: " + views[i].name; + co_return; + } + } + } + + util.unbind_context(); + + // Close writers and collect stats + for (std::size_t i = 0; i < views.size(); ++i) { + auto stats = co_await writers[i].close(); + out->view_stats[views[i].name] = std::move(stats); + out->total_rows += out->view_stats[views[i].name].total_rows; + out->total_bytes += + out->view_stats[views[i].name].total_uncompressed_bytes; + } + } catch (const std::exception 
&e) { + out->error = e.what(); + } +} + +static PyObject *Aggregator_write_arrow(AggregatorObject *self, PyObject *args, + PyObject *kwds) { + static const char *kwlist[] = {"directory", + "path", + "time_interval_ms", + "group_keys", + "categories", + "names", + "index_dir", + "checkpoint_size", + "force_rebuild", + "parallelism", + "event_batch_size", + "custom_metric_fields", + "compute_percentiles", + "views", + "chunk_size_mb", + "compression", + NULL}; + + const char *directory = NULL; + const char *output_path = NULL; + double time_interval_ms = 5000.0; + PyObject *group_keys_obj = Py_None; + PyObject *categories_obj = Py_None; + PyObject *names_obj = Py_None; + const char *index_dir = ""; + Py_ssize_t checkpoint_size = 32 * 1024 * 1024; + int force_rebuild = 0; + Py_ssize_t parallelism = 0; + Py_ssize_t event_batch_size = 10000; + PyObject *custom_metrics_obj = Py_None; + int compute_percentiles = 0; + PyObject *views_obj = Py_None; + int chunk_size_mb = 32; + const char *compression_str = "zstd"; + + if (!PyArg_ParseTupleAndKeywords( + args, kwds, "ss|dOOOsnpnnOpOis", (char **)kwlist, &directory, + &output_path, &time_interval_ms, &group_keys_obj, &categories_obj, + &names_obj, &index_dir, &checkpoint_size, &force_rebuild, + ¶llelism, &event_batch_size, &custom_metrics_obj, + &compute_percentiles, &views_obj, &chunk_size_mb, &compression_str)) + return NULL; + + // Parse views + std::vector views; + if (views_obj && views_obj != Py_None) { + if (!PyList_Check(views_obj)) { + PyErr_SetString(PyExc_TypeError, + "views must be a list of dicts with 'name' and " + "optional 'query' keys"); + return NULL; + } + Py_ssize_t n = PyList_Size(views_obj); + for (Py_ssize_t i = 0; i < n; i++) { + PyObject *item = PyList_GetItem(views_obj, i); + if (!PyDict_Check(item)) { + PyErr_SetString(PyExc_TypeError, + "each view must be a dict with 'name' key"); + return NULL; + } + AggregatorViewDef view; + PyObject *name_obj = PyDict_GetItemString(item, "name"); + if 
(!name_obj) { + PyErr_SetString(PyExc_ValueError, + "each view must have a 'name' key"); + return NULL; + } + const char *name_str = PyUnicode_AsUTF8(name_obj); + if (!name_str) return NULL; + view.name = name_str; + + PyObject *query_obj = PyDict_GetItemString(item, "query"); + if (query_obj && query_obj != Py_None) { + const char *query_str = PyUnicode_AsUTF8(query_obj); + if (!query_str) return NULL; + auto parsed = Query::from_string(query_str); + if (!parsed) { + PyErr_Format(PyExc_ValueError, + "Invalid query for view '%s': %s", name_str, + parsed.error().format().c_str()); + return NULL; + } + view.query = std::move(*parsed); + } + views.push_back(std::move(view)); + } + } + + // Parse compression + IpcCompression compression = IpcCompression::ZSTD; + if (compression_str) { + std::string comp_lower(compression_str); + for (auto &c : comp_lower) c = std::tolower(c); + if (comp_lower == "none") { + compression = IpcCompression::NONE; + } else if (comp_lower == "zstd") { +#ifdef DFTRACER_UTILS_ENABLE_ZSTD + compression = IpcCompression::ZSTD; +#else + PyErr_SetString(PyExc_ValueError, "ZSTD compression not available"); + return NULL; +#endif + } else { + PyErr_Format(PyExc_ValueError, + "Unknown compression: %s (use 'none' or 'zstd')", + compression_str); + return NULL; + } + } + + int64_t chunk_size_bytes = + static_cast(chunk_size_mb) * 1024 * 1024; + + // Parse group_keys + std::vector group_keys; + if (group_keys_obj && group_keys_obj != Py_None) { + if (!PyList_Check(group_keys_obj)) { + PyErr_SetString(PyExc_TypeError, + "group_keys must be a list of str"); + return NULL; + } + Py_ssize_t n = PyList_Size(group_keys_obj); + for (Py_ssize_t i = 0; i < n; i++) { + const char *s = PyUnicode_AsUTF8(PyList_GetItem(group_keys_obj, i)); + if (!s) return NULL; + group_keys.emplace_back(s); + } + } + + // Parse custom_metric_fields + std::vector custom_metrics; + if (custom_metrics_obj && custom_metrics_obj != Py_None) { + if (!PyList_Check(custom_metrics_obj)) { + 
PyErr_SetString(PyExc_TypeError, + "custom_metric_fields must be a list of str"); + return NULL; + } + Py_ssize_t n = PyList_Size(custom_metrics_obj); + for (Py_ssize_t i = 0; i < n; i++) { + const char *s = + PyUnicode_AsUTF8(PyList_GetItem(custom_metrics_obj, i)); + if (!s) return NULL; + custom_metrics.emplace_back(s); + } + } + + AggregatorInput input; + input.directory = directory; + input.config.time_interval_us = + static_cast(time_interval_ms * 1000.0); + input.config.extra_group_keys = std::move(group_keys); + input.config.custom_metric_fields = std::move(custom_metrics); + input.config.compute_percentiles = compute_percentiles != 0; + input.index_dir = index_dir; + input.checkpoint_size = static_cast(checkpoint_size); + input.force_rebuild = force_rebuild != 0; + input.parallelism = static_cast(parallelism); + input.event_batch_size = static_cast(event_batch_size); - std::vector batches; + std::string output_path_str(output_path); + AggregatorWriteArrowResult result; + auto *rp = &result; std::string error_msg; - if (run_aggregator_pipeline(self, input, batches, error_msg) < 0) { + + Py_BEGIN_ALLOW_THREADS try { + Runtime *rt = get_runtime(self); + rt->submit( + run_coro_scope(rt->executor(), run_aggregator_write_arrow, rp, + std::move(input), output_path_str, + std::move(views), chunk_size_bytes, compression), + "aggregator_write_arrow") + .get(); + } catch (const std::exception &e) { + error_msg = e.what(); + } + Py_END_ALLOW_THREADS + + if (!error_msg.empty()) { PyErr_SetString(PyExc_RuntimeError, error_msg.c_str()); return NULL; } -#ifdef DFTRACER_UTILS_ENABLE_ARROW - PyObject *batch_list = PyList_New(0); - if (!batch_list) return NULL; + if (!result.error.empty()) { + PyErr_SetString(PyExc_RuntimeError, result.error.c_str()); + return NULL; + } - for (const auto &batch : batches) { - if (batch.entries.empty()) continue; + // Build result dict + PyObject *dict = PyDict_New(); + if (!dict) return NULL; - auto arrow_result = batch.to_arrow(); - if 
(!arrow_result.valid()) continue; + PyObject *views_dict = PyDict_New(); + if (!views_dict) { + Py_DECREF(dict); + return NULL; + } - PyObject *cap = wrap_arrow_result(std::move(arrow_result)); - if (!cap) { - Py_DECREF(batch_list); + for (const auto &[view_name, view_stats] : result.view_stats) { + PyObject *view_dict = PyDict_New(); + if (!view_dict) { + Py_DECREF(views_dict); + Py_DECREF(dict); return NULL; } - int rc = PyList_Append(batch_list, cap); - Py_DECREF(cap); - if (rc < 0) { - Py_DECREF(batch_list); + PyObject *files_list = PyList_New(0); + if (!files_list) { + Py_DECREF(view_dict); + Py_DECREF(views_dict); + Py_DECREF(dict); return NULL; } + + for (const auto &f : view_stats.files) { + PyObject *file_str = PyUnicode_FromString(f.c_str()); + if (!file_str || PyList_Append(files_list, file_str) < 0) { + Py_XDECREF(file_str); + Py_DECREF(files_list); + Py_DECREF(view_dict); + Py_DECREF(views_dict); + Py_DECREF(dict); + return NULL; + } + Py_DECREF(file_str); + } + + PyDict_SetItemString(view_dict, "files", files_list); + PyDict_SetItemString(view_dict, "rows", + PyLong_FromLongLong(view_stats.total_rows)); + PyDict_SetItemString( + view_dict, "bytes", + PyLong_FromLongLong(view_stats.total_uncompressed_bytes)); + Py_DECREF(files_list); + + PyObject *key = PyUnicode_FromString(view_name.c_str()); + PyDict_SetItem(views_dict, key, view_dict); + Py_DECREF(key); + Py_DECREF(view_dict); } - PyObject *it = PyObject_GetIter(batch_list); - Py_DECREF(batch_list); - return it; -#else - PyErr_SetString(PyExc_RuntimeError, - "dftracer-utils was built without Arrow support"); - return NULL; -#endif + PyDict_SetItemString(dict, "views", views_dict); + PyDict_SetItemString(dict, "total_rows", + PyLong_FromLongLong(result.total_rows)); + PyDict_SetItemString(dict, "total_bytes", + PyLong_FromLongLong(result.total_bytes)); + Py_DECREF(views_dict); + + return dict; } +#endif // DFTRACER_UTILS_ENABLE_ARROW_IPC + static PyObject *Aggregator_call(PyObject *self, PyObject 
*args, PyObject *kwds) { return Aggregator_process((AggregatorObject *)self, args, kwds); @@ -289,12 +772,14 @@ static PyMethodDef Aggregator_methods[] = { "process(directory, time_interval_ms=5000.0, group_keys=None,\n" " categories=None, names=None, index_dir='',\n" " checkpoint_size=33554432, force_rebuild=False,\n" - " chunk_size_mb=64, batch_size_mb=4, event_batch_size=10000,\n" + " parallelism=0, event_batch_size=10000,\n" " custom_metric_fields=None, compute_percentiles=False)\n" "--\n" "\n" "Run aggregation pipeline, return materialized ArrowTable.\n" "\n" + "Uses parallel, RocksDB-backed, fused indexing and aggregation.\n" + "\n" "Args:\n" " directory (str): Directory containing .pfw/.pfw.gz files.\n" " time_interval_ms (float): Time bucket in milliseconds (default " @@ -305,12 +790,11 @@ static PyMethodDef Aggregator_methods[] = { " index_dir (str): Directory for .dftindex stores (default '').\n" " checkpoint_size (int): Checkpoint size (default 33554432).\n" " force_rebuild (bool): Force index rebuild (default False).\n" - " chunk_size_mb (int): Target chunk size in MB (default 64).\n" - " batch_size_mb (int): Batch read size in MB (default 4).\n" + " parallelism (int): Number of parallel workers (0 = all cores).\n" " event_batch_size (int): Entries per batch (default 10000).\n" " custom_metric_fields (list[str] or None): Extra numeric args\n" - " fields to aggregate into *_total/*_min/*_max/*_mean/*_std\n" - " columns (default None).\n" + " fields to aggregate into ``*_total``/``*_min``/``*_max``/\n" + " ``*_mean``/``*_std`` columns (default None).\n" " compute_percentiles (bool): Enable percentile sketch collection\n" " during aggregation (default False).\n" "\n" @@ -321,12 +805,19 @@ static PyMethodDef Aggregator_methods[] = { "iter_arrow(directory, time_interval_ms=5000.0, group_keys=None,\n" " categories=None, names=None, index_dir='',\n" " checkpoint_size=33554432, force_rebuild=False,\n" - " chunk_size_mb=64, batch_size_mb=4, 
event_batch_size=10000,\n" - " custom_metric_fields=None, compute_percentiles=False)\n" + " parallelism=0, event_batch_size=10000,\n" + " custom_metric_fields=None, compute_percentiles=False,\n" + " buffer_size=8)\n" "--\n" "\n" "Run aggregation pipeline, stream Arrow batches.\n" "\n" + "Returns immediately with a streaming iterator. Batches are produced\n" + "in the background with a bounded buffer. GIL is released while waiting\n" + "for the next batch, allowing other Python threads to run.\n" + "\n" + "Uses parallel, RocksDB-backed, fused indexing and aggregation.\n" + "\n" "Args:\n" " directory (str): Directory containing .pfw/.pfw.gz files.\n" " time_interval_ms (float): Time bucket in milliseconds (default " @@ -337,17 +828,56 @@ static PyMethodDef Aggregator_methods[] = { " index_dir (str): Directory for .dftindex stores (default '').\n" " checkpoint_size (int): Checkpoint size (default 33554432).\n" " force_rebuild (bool): Force index rebuild (default False).\n" - " chunk_size_mb (int): Target chunk size in MB (default 64).\n" - " batch_size_mb (int): Batch read size in MB (default 4).\n" + " parallelism (int): Number of parallel workers (0 = all cores).\n" " event_batch_size (int): Entries per batch (default 10000).\n" " custom_metric_fields (list[str] or None): Extra numeric args\n" - " fields to aggregate into *_total/*_min/*_max/*_mean/*_std\n" - " columns (default None).\n" + " fields to aggregate into ``*_total``/``*_min``/``*_max``/\n" + " ``*_mean``/``*_std`` columns (default None).\n" " compute_percentiles (bool): Enable percentile sketch collection\n" " during aggregation (default False).\n" + " buffer_size (int): Max batches to buffer (default 8).\n" + "\n" + "Returns:\n" + " _ArrowStreamingIterator: Streaming iterator yielding Arrow record\n" + " batches. 
Supports cancel() to stop early.\n"}, +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + {"write_arrow", (PyCFunction)Aggregator_write_arrow, + METH_VARARGS | METH_KEYWORDS, + "write_arrow(directory, path, time_interval_ms=5000.0, ..., views=None)\n" + "--\n" + "\n" + "Run aggregation and write results to Arrow IPC files with optional " + "views.\n" + "\n" + "Views allow filtering aggregated entries before writing. Each view\n" + "writes to a separate subdirectory. Query syntax supports: cat, name,\n" + "pid, tid, hhash, fhash, time_bucket, extra group keys, and aggregation\n" + "metrics (count, dur_total, dur_min, dur_max, size_total, etc.).\n" + "\n" + "Args:\n" + " directory (str): Directory containing .pfw/.pfw.gz files.\n" + " path (str): Output directory for Arrow files.\n" + " time_interval_ms (float): Time bucket in milliseconds.\n" + " group_keys (list[str] or None): Extra grouping dims.\n" + " categories (list[str] or None): Category filter.\n" + " names (list[str] or None): Name filter.\n" + " index_dir (str): Directory for .dftindex stores.\n" + " checkpoint_size (int): Checkpoint size.\n" + " force_rebuild (bool): Force index rebuild.\n" + " parallelism (int): Number of parallel workers.\n" + " event_batch_size (int): Entries per batch.\n" + " custom_metric_fields (list[str] or None): Extra numeric fields.\n" + " compute_percentiles (bool): Enable percentile collection.\n" + " views (list[dict] or None): View definitions, each with 'name' and\n" + " optional 'query' keys. If None, writes all entries to path.\n" + " Example: [{'name': 'io', 'query': 'cat == \"POSIX\"'}]\n" + " chunk_size_mb (int): Max uncompressed MB per file (default 32).\n" + " compression (str): 'zstd' or 'none' (default 'zstd').\n" "\n" "Returns:\n" - " Iterator[ArrowBatch]: Arrow record batches.\n"}, + " dict: Statistics with 'views' (per-view stats), 'total_rows',\n" + " 'total_bytes'. 
Each view has 'files', 'rows', 'bytes'.\n"}, +#endif {NULL}}; PyTypeObject AggregatorType = { diff --git a/src/dftracer/utils/python/utilities/comparator.cpp b/src/dftracer/utils/python/utilities/comparator.cpp index c377fac7..27ead4bb 100644 --- a/src/dftracer/utils/python/utilities/comparator.cpp +++ b/src/dftracer/utils/python/utilities/comparator.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -29,8 +30,6 @@ #include #include #include -#include -#include #include using dftracer::utils::Runtime; @@ -40,6 +39,7 @@ using namespace dftracer::utils::utilities; using namespace dftracer::utils::utilities::composites::dft::aggregators; using namespace dftracer::utils::utilities::composites::dft::comparator; +#include #ifdef DFTRACER_UTILS_ENABLE_ARROW using dftracer::utils::python::arrow_result_to_table; using dftracer::utils::utilities::common::arrow::ArrowExportResult; @@ -108,17 +108,27 @@ struct ComparatorArgs { double time_interval_ms = 5000.0; double threshold = 0.0; std::size_t executor_threads = 0; - std::string index_dir; + std::string baseline_index_dir; + std::string variant_index_dir; bool force_rebuild = false; std::string config_path; }; static int parse_comparator_args(PyObject *args, PyObject *kwds, ComparatorArgs &out) { - static const char *kwlist[] = { - "baseline", "variant", "query", "group_by", - "format", "time_interval_ms", "threshold", "executor_threads", - "index_dir", "force_rebuild", "config", NULL}; + static const char *kwlist[] = {"baseline", + "variant", + "query", + "group_by", + "format", + "time_interval_ms", + "threshold", + "executor_threads", + "baseline_index_dir", + "variant_index_dir", + "force_rebuild", + "config", + NULL}; const char *baseline = NULL; const char *variant = NULL; @@ -128,14 +138,16 @@ static int parse_comparator_args(PyObject *args, PyObject *kwds, double time_interval_ms = 5000.0; double threshold = 0.0; Py_ssize_t executor_threads = 0; - const char *index_dir = ""; + 
const char *baseline_index_dir = ""; + const char *variant_index_dir = ""; int force_rebuild = 0; const char *config = ""; if (!PyArg_ParseTupleAndKeywords( - args, kwds, "ss|sssddnsps", (char **)kwlist, &baseline, &variant, + args, kwds, "ss|sssddnssps", (char **)kwlist, &baseline, &variant, &query, &group_by, &format, &time_interval_ms, &threshold, - &executor_threads, &index_dir, &force_rebuild, &config)) + &executor_threads, &baseline_index_dir, &variant_index_dir, + &force_rebuild, &config)) return -1; out.baseline = baseline; @@ -146,7 +158,8 @@ static int parse_comparator_args(PyObject *args, PyObject *kwds, out.time_interval_ms = time_interval_ms; out.threshold = threshold; out.executor_threads = static_cast(executor_threads); - out.index_dir = index_dir; + out.baseline_index_dir = baseline_index_dir; + out.variant_index_dir = variant_index_dir; out.force_rebuild = force_rebuild != 0; out.config_path = config; @@ -163,7 +176,7 @@ void flatten_nodes(const ComparisonNode &node, } } -CoroTask run_aggregation( +CoroTask run_aggregation( std::vector input_files, AggregationConfig agg_config, std::optional query, std::string index_dir, std::size_t checkpoint_size, bool force_rebuild, @@ -177,7 +190,7 @@ CoroTask run_aggregation( .with_watchdog(false); Pipeline pipeline(pipeline_config); - EventAggregatorUtility merger; + EventAggregator merger; std::atomic global_chunk_idx{0}; auto streaming_task = make_task( @@ -274,7 +287,7 @@ CoroTask run_aggregation( }, "StreamingAggregate"); - EventAggregatorUtilityOutput result; + EventAggregatorOutput result; auto post_task = make_task( [&](CoroScope & /*ctx*/) -> CoroTask { result = merger.finalize(); @@ -320,8 +333,10 @@ static int run_comparison_pipeline(ComparatorObject *self, config.no_color = true; if (args_copy.executor_threads > 0) config.executor_threads = args_copy.executor_threads; - if (!args_copy.index_dir.empty()) - config.index_dir = args_copy.index_dir; + if (!args_copy.baseline_index_dir.empty()) + 
config.baseline_index_dir = args_copy.baseline_index_dir; + if (!args_copy.variant_index_dir.empty()) + config.variant_index_dir = args_copy.variant_index_dir; if (args_copy.force_rebuild) config.force_rebuild = args_copy.force_rebuild; if (args_copy.threshold > 0.0) @@ -339,45 +354,87 @@ static int run_comparison_pipeline(ComparatorObject *self, indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE; } - // Create temp index dir if needed - std::string temp_index_dir; - if (config.index_dir.empty()) { - auto temp_path = fs::temp_directory_path(); - temp_path /= - "dftracer_cmp_py_" + std::to_string(std::time(nullptr)) + "_" + - std::to_string(static_cast( - std::hash{}(std::this_thread::get_id()))); - temp_index_dir = temp_path.string(); - fs::create_directories(temp_index_dir); - config.index_dir = temp_index_dir; - } - - // Enumerate files - auto enumerate_files = - [](std::string path) -> CoroTask> { - std::vector files; - if (fs::is_regular_file(path)) { - files.push_back(path); - co_return files; - } - filesystem::PatternDirectoryScannerUtility scanner; - filesystem::PatternDirectoryScannerUtilityInput scan_input{ - path, {".pfw", ".pfw.gz"}, false}; - auto entries = co_await scanner.process(scan_input); - files.reserve(entries.size()); - for (const auto &e : entries) { - files.push_back(e.path.string()); - } - co_return files; - }; + using composites::dft::indexing::IndexResolverUtility; + using composites::dft::indexing::ResolverInput; + using indexer::IndexBatchBuilderUtility; + using indexer::IndexBuildBatchConfig; Runtime *rt = get_runtime(self); auto *error_msg_ptr = &error_msg; - auto task = [config, output_ptr, enumerate_files, - error_msg_ptr]() -> CoroTask { - auto baseline_files = co_await enumerate_files(config.baseline); - auto variant_files = co_await enumerate_files(config.variant); + auto task = [config, output_ptr, error_msg_ptr, + rt]() -> CoroTask { + auto resolve_and_build = + [&config]( + CoroScope &scope, const std::string &path, + const 
std::string &index_dir, + std::vector &out_files) -> CoroTask { + IndexResolverUtility resolver; + ResolverInput resolve_input; + resolve_input.index_dir = index_dir; + resolve_input.require_checkpoints = !config.force_rebuild; + if (fs::is_regular_file(path)) { + resolve_input.files = {path}; + } else { + resolve_input.directory = path; + } + + auto result = co_await resolver.process(resolve_input); + out_files = std::move(result.all_files); + + if (out_files.empty() || result.needs_checkpoint.empty()) { + co_return; + } + + auto batch_cfg = std::make_shared(); + batch_cfg->file_paths.reserve(result.needs_checkpoint.size()); + for (const auto &item : result.needs_checkpoint) { + batch_cfg->file_paths.push_back(item.file_path); + } + batch_cfg->index_dir = index_dir; + batch_cfg->checkpoint_size = config.checkpoint_size; + batch_cfg->parallelism = config.executor_threads; + batch_cfg->force_rebuild = config.force_rebuild; + batch_cfg->use_batch_write = true; + batch_cfg->rebuild_root_summaries = true; + + co_await IndexBatchBuilderUtility::process( + &scope, std::move(batch_cfg)); + }; + + std::vector baseline_files; + std::vector variant_files; + + bool shared_index = + composites::dft::internal::determine_index_path( + config.baseline, config.baseline_index_dir) == + composites::dft::internal::determine_index_path( + config.variant, config.variant_index_dir); + + co_await run_coro_scope( + rt->executor(), [&](CoroScope &scope) -> CoroTask { + if (shared_index) { + co_await resolve_and_build(scope, config.baseline, + config.baseline_index_dir, + baseline_files); + if (config.baseline == config.variant) { + variant_files = baseline_files; + } else { + co_await resolve_and_build(scope, config.variant, + config.variant_index_dir, + variant_files); + } + } else { + scope.spawn([&](CoroScope &s) -> CoroTask { + co_await resolve_and_build( + s, config.baseline, config.baseline_index_dir, + baseline_files); + }); + co_await resolve_and_build(scope, config.variant, + 
config.variant_index_dir, + variant_files); + } + }); if (baseline_files.empty()) { *error_msg_ptr = @@ -390,42 +447,6 @@ static int run_comparison_pipeline(ComparatorObject *self, co_return; } - // Build indexes upfront - { - if (config.force_rebuild && !baseline_files.empty()) { - const std::string shared_index_path = - composites::dft::internal::determine_index_path( - baseline_files.front(), config.index_dir); - if (fs::exists(shared_index_path)) { - fs::remove_all(shared_index_path); - } - } - std::unordered_set seen; - std::vector all_files; - for (const auto &f : baseline_files) { - if (seen.insert(f).second) all_files.push_back(f); - } - for (const auto &f : variant_files) { - if (seen.insert(f).second) all_files.push_back(f); - } - std::vector idx_configs; - idx_configs.reserve(all_files.size()); - for (const auto &file_path : all_files) { - idx_configs.push_back( - indexer::IndexBuildConfig::for_file(file_path) - .with_checkpoint_size(config.checkpoint_size) - .with_force_rebuild(false) - .with_index_dir(config.index_dir)); - } - std::vector> idx_tasks; - idx_tasks.reserve(idx_configs.size()); - for (const auto &cfg : idx_configs) { - idx_tasks.push_back( - indexer::IndexBuilderUtility{}.process(cfg)); - } - co_await coro::when_all(std::move(idx_tasks)); - } - output_ptr->baseline_path = config.baseline; output_ptr->variant_path = config.variant; output_ptr->baseline_file_count = baseline_files.size(); @@ -466,13 +487,13 @@ static int run_comparison_pipeline(ComparatorObject *self, auto [base_result, var_result] = co_await coro::when_all( run_aggregation( - baseline_files, agg_cfg, query, config.index_dir, - config.checkpoint_size, config.force_rebuild, - config.executor_threads), + baseline_files, agg_cfg, query, + config.baseline_index_dir, config.checkpoint_size, + config.force_rebuild, config.executor_threads), run_aggregation( - variant_files, agg_cfg, query, config.index_dir, - config.checkpoint_size, config.force_rebuild, - 
config.executor_threads)); + variant_files, agg_cfg, query, + config.variant_index_dir, config.checkpoint_size, + config.force_rebuild, config.executor_threads)); if (pairs.empty()) { output_ptr->baseline_meta = extract_metadata( @@ -514,11 +535,6 @@ static int run_comparison_pipeline(ComparatorObject *self, }; rt->submit(task(), "comparator").get(); - - // Clean up temp index dir - if (!temp_index_dir.empty() && fs::exists(temp_index_dir)) { - fs::remove_all(temp_index_dir); - } } catch (const std::exception &e) { error_msg = e.what(); } diff --git a/src/dftracer/utils/python/utilities/reorganization_planner.cpp b/src/dftracer/utils/python/utilities/reorganization_planner.cpp index 929bfe79..5178087d 100644 --- a/src/dftracer/utils/python/utilities/reorganization_planner.cpp +++ b/src/dftracer/utils/python/utilities/reorganization_planner.cpp @@ -1,4 +1,7 @@ #include +#include +#include +#include #include #include #include @@ -6,7 +9,11 @@ #include #include +using dftracer::utils::CoroScope; using dftracer::utils::Runtime; +using dftracer::utils::utilities::behaviors::BehaviorChain; +using dftracer::utils::utilities::behaviors::UtilityExecutor; +namespace tags = dftracer::utils::utilities::tags; using namespace dftracer::utils::utilities::composites::dft::reorganize; static Runtime *get_runtime(ReorganizationPlannerObject *self) { @@ -129,11 +136,17 @@ static PyObject *ReorganizationPlanner_plan(ReorganizationPlannerObject *self, Py_BEGIN_ALLOW_THREADS try { Runtime *rt = get_runtime(self); - auto task = [plan_p, input_copy]() -> CoroTask { - ReorganizationPlannerUtility util; - *plan_p = co_await util.process(input_copy); - }; - rt->submit(task(), "reorganization-planner").get(); + auto task = run_coro_scope( + rt->executor(), + [plan_p, input_copy](CoroScope &scope) -> CoroTask { + auto planner = std::make_shared(); + UtilityExecutor + exec(planner, BehaviorChain{}); + *plan_p = co_await exec.execute_with_context(scope, input_copy); + }); + 
rt->submit(std::move(task), "reorganization-planner").wait(); } catch (const std::exception &e) { error_msg = e.what(); } diff --git a/src/dftracer/utils/server/cursor.cpp b/src/dftracer/utils/server/cursor.cpp index cfcefd42..990a9aa5 100644 --- a/src/dftracer/utils/server/cursor.cpp +++ b/src/dftracer/utils/server/cursor.cpp @@ -8,7 +8,7 @@ namespace dftracer::utils::server { namespace { // Minimal base64 encode/decode for cursor serialization. -static constexpr char kBase64Chars[] = +static constexpr char BASE64_CHARS[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; std::string base64_encode(const void* data, std::size_t len) { @@ -19,10 +19,10 @@ std::string base64_encode(const void* data, std::size_t len) { unsigned val = static_cast(bytes[i]) << 16; if (i + 1 < len) val |= static_cast(bytes[i + 1]) << 8; if (i + 2 < len) val |= static_cast(bytes[i + 2]); - out.push_back(kBase64Chars[(val >> 18) & 0x3F]); - out.push_back(kBase64Chars[(val >> 12) & 0x3F]); - out.push_back((i + 1 < len) ? kBase64Chars[(val >> 6) & 0x3F] : '='); - out.push_back((i + 2 < len) ? kBase64Chars[val & 0x3F] : '='); + out.push_back(BASE64_CHARS[(val >> 18) & 0x3F]); + out.push_back(BASE64_CHARS[(val >> 12) & 0x3F]); + out.push_back((i + 1 < len) ? BASE64_CHARS[(val >> 6) & 0x3F] : '='); + out.push_back((i + 2 < len) ? 
BASE64_CHARS[val & 0x3F] : '='); } return out; } diff --git a/src/dftracer/utils/server/trace_api.cpp b/src/dftracer/utils/server/trace_api.cpp index 1b8f3ca0..668d83d5 100644 --- a/src/dftracer/utils/server/trace_api.cpp +++ b/src/dftracer/utils/server/trace_api.cpp @@ -13,14 +13,15 @@ #include #include #include +#include #include +#include #include #include #include #include #include #include -#include #include #include @@ -76,105 +77,6 @@ static const std::unordered_set HASH_METADATA_NAMES = {"FH", "HH", using dftracer::utils::utilities::common::json::JsonDocGuard; using dftracer::utils::utilities::common::query::Query; -/// Direct-scan a small file without any `.dftindex` store. -/// Streams via async_streaming_gz_lines(), parses JSON, applies -/// predicate filters, collects matching events as raw JSON strings. -static coro::CoroTask direct_scan_events( - const TraceIndex::FileInfo* file_info, const Query* query, - bool include_metadata, std::vector* collected_events, - std::uint64_t* total_scanned, std::uint64_t* total_matched, int limit) { - using dftracer::utils::utilities::fileio::lines::sources:: - async_streaming_gz_lines; - - try { - auto gen = async_streaming_gz_lines(file_info->path); - - std::unordered_map pending_metadata; - std::unordered_set emitted_hashes; - - while (auto line = co_await gen.next()) { - if (limit > 0 && - collected_events->size() >= static_cast(limit)) { - co_return; - } - if (line->content.empty()) continue; - - JsonDocGuard guard{yyjson_read_opts( - const_cast(line->content.data()), line->content.size(), - YYJSON_READ_NOFLAG, nullptr, nullptr)}; - if (!guard.doc) continue; - - yyjson_val* root = yyjson_doc_get_root(guard.doc); - if (root && yyjson_is_obj(root)) { - JsonValue json(root); - // line->content is a string_view valid only for this - // iteration. All storage into collected_events and - // pending_metadata must copy to owning std::string. 
- std::string_view ph = json["ph"].get(); - - if (ph == "M" && include_metadata) { - std::string name_str = json["name"].get(); - - if (HASH_METADATA_NAMES.count(name_str)) { - auto args = json["args"]; - if (args.exists()) { - auto val = args["value"]; - if (val.exists()) { - std::string hash_val = val.get(); - if (!emitted_hashes.count(hash_val)) { - pending_metadata[hash_val] = - std::string(line->content.data(), - line->content.size()); - } - } - } - } else { - collected_events->emplace_back(line->content.data(), - line->content.size()); - (*total_matched)++; - } - } else if (ph != "M") { - (*total_scanned)++; - if (!query || query->evaluate(json)) { - // Flush referenced hash metadata first - if (include_metadata) { - auto args = json["args"]; - if (args.exists()) { - static const char* hash_fields[] = { - "hhash", "fhash", "shash"}; - for (const char* field : hash_fields) { - auto val = args[field]; - if (!val.exists()) continue; - std::string hash_val = - val.get(); - if (emitted_hashes.count(hash_val)) - continue; - auto it = pending_metadata.find(hash_val); - if (it != pending_metadata.end()) { - collected_events->push_back( - std::move(it->second)); - (*total_matched)++; - emitted_hashes.insert(hash_val); - pending_metadata.erase(it); - } - } - } - } - collected_events->emplace_back(line->content.data(), - line->content.size()); - (*total_matched)++; - } - } - } - } - } catch (const std::exception& e) { - DFTRACER_UTILS_LOG_WARN("Direct scan failed for %s: %s", - file_info->path.c_str(), e.what()); - } - - co_return; -} - // --- GET /api/v1/files --- static coro::CoroTask handle_files(const HttpRequest& /*req*/, const QueryParams& /*params*/, @@ -192,8 +94,6 @@ static coro::CoroTask handle_files(const HttpRequest& /*req*/, body += f.has_bloom_data ? "true" : "false"; body += ",\"has_checkpoint_index\":"; body += f.has_checkpoint_index ? "true" : "false"; - body += ",\"is_small\":"; - body += f.is_small ? 
"true" : "false"; body += '}'; } body += "],\"count\":"; @@ -225,21 +125,16 @@ static coro::CoroTask handle_file_info(const HttpRequest& /*req*/, body += info->has_bloom_data ? "true" : "false"; body += ",\"has_checkpoint_index\":"; body += info->has_checkpoint_index ? "true" : "false"; - body += ",\"is_small\":"; - body += info->is_small ? "true" : "false"; - body += ",\"size_mb\":"; body += std::to_string(info->size_mb); body += ",\"compressed_size\":"; body += std::to_string(info->compressed_size); - if (!info->is_small) { - body += ",\"num_lines\":"; - body += std::to_string(info->num_lines); - body += ",\"num_checkpoints\":"; - body += std::to_string(info->num_checkpoints); - body += ",\"uncompressed_size\":"; - body += std::to_string(info->uncompressed_size); - } + body += ",\"num_lines\":"; + body += std::to_string(info->num_lines); + body += ",\"num_checkpoints\":"; + body += std::to_string(info->num_checkpoints); + body += ",\"uncompressed_size\":"; + body += std::to_string(info->uncompressed_size); body += '}'; co_return HttpResponse::ok(body); @@ -356,10 +251,6 @@ static std::vector resolve_target_files( std::vector filtered; filtered.reserve(files.size()); for (auto* fi : files) { - if (fi->is_small) { - filtered.push_back(fi); - continue; - } if (fi->min_timestamp_us == 0 && fi->max_timestamp_us == 0) { filtered.push_back(fi); continue; @@ -379,33 +270,13 @@ using StreamChunk = HttpResponse::StreamChunk; static coro::AsyncGenerator stream_events( std::vector files, ViewDefinition ev_view, - std::optional query_opt, double ts_min, double ts_max, + std::optional /*query_opt*/, double ts_min, double ts_max, BloomFilterCache* bloom_cache, int limit) { int emitted = 0; - const Query* query_ptr = query_opt ? 
&*query_opt : nullptr; for (auto* file_info : files) { if (limit > 0 && emitted >= limit) break; - if (file_info->is_small) { - std::vector events; - std::uint64_t scanned = 0; - std::uint64_t matched = 0; - co_await direct_scan_events( - file_info, query_ptr, ev_view.include_metadata, &events, - &scanned, &matched, limit > 0 ? limit - emitted : 0); - std::vector views; - for (const auto& event : events) { - if (limit > 0 && emitted >= limit) break; - views.push_back(event); - emitted++; - } - if (!views.empty()) { - co_yield StreamChunk{views}; - } - continue; - } - if (file_info->uncompressed_size == 0 && file_info->num_checkpoints == 0) continue; @@ -514,78 +385,54 @@ static coro::CoroTask handle_stats(const HttpRequest& req, } std::vector all_stats; - std::size_t skipped_small = 0; - std::vector stat_files; + // Group files by index_path + std::unordered_map>> + files_by_index; + std::size_t file_idx = 0; for (const auto& file_info : index.files()) { - if (file_info.is_small) { - skipped_small++; - continue; - } if (!file_info.has_bloom_data) continue; - stat_files.push_back(&file_info); + files_by_index[file_info.index_path].emplace_back(file_idx++, + file_info.path); } - if (stat_files.size() <= 1) { - for (auto* file_info : stat_files) { - StatisticsAggregatorInput agg_input; - agg_input.file_path = file_info->path; - agg_input.index_path = file_info->index_path; - agg_input.index_dir = index.index_dir(); - - StatisticsAggregatorUtility aggregator; - auto stats = co_await aggregator.process(agg_input); - if (stats.success) { - all_stats.push_back(std::move(stats)); - } + // Resolve each group and read statistics + for (auto& [idx_path, files] : files_by_index) { + std::vector file_paths; + file_paths.reserve(files.size()); + for (const auto& [_, path] : files) { + file_paths.push_back(path); } - } else { - std::size_t num_workers = - std::min(index.max_concurrent(), stat_files.size()); - auto* executor = Executor::current(); - - auto file_chan = 
coro::make_channel(num_workers * 2); - auto stats_mutex = std::make_shared(); - auto* all_stats_ptr = &all_stats; - auto* stat_files_ptr = &stat_files; - std::string index_dir = index.index_dir(); - const auto* index_dir_ptr = &index_dir; - - CoroScope scope(executor); - - scope.spawn([ch = file_chan->producer(), stat_files_ptr]( - CoroScope&) mutable -> coro::CoroTask { - auto guard = ch.guard(); - for (std::size_t i = 0; i < stat_files_ptr->size(); ++i) { - if (!co_await ch.send(i)) co_return; - } - co_return; - }); - for (std::size_t w = 0; w < num_workers; ++w) { - scope.spawn([file_chan, stat_files_ptr, stats_mutex, all_stats_ptr, - index_dir_ptr](CoroScope&) -> coro::CoroTask { - while (auto fi_opt = co_await file_chan->receive()) { - auto* file_info = (*stat_files_ptr)[*fi_opt]; + IndexResolverUtility resolver; + ResolverInput input; + input.files = std::move(file_paths); + input.require_checkpoints = false; - StatisticsAggregatorInput agg_input; - agg_input.file_path = file_info->path; - agg_input.index_path = file_info->index_path; - agg_input.index_dir = *index_dir_ptr; + auto result = co_await resolver.process(input); - StatisticsAggregatorUtility aggregator; - auto stats = co_await aggregator.process(agg_input); + if (result.cached.empty()) { + continue; + } - if (stats.success) { - std::lock_guard lock(*stats_mutex); - all_stats_ptr->push_back(std::move(stats)); - } + try { + SharedIndexStatisticsReader reader; + auto batch_rows = co_await reader.query( + result.index_path, std::move(result.cached), + StatisticsQueryType::SUMMARY); + auto callback = [&all_stats](std::size_t /*file_index*/, + TraceStatistics&& stats) { + if (stats.success) { + all_stats.push_back(std::move(stats)); } - co_return; - }); + }; + SharedIndexStatisticsReader::process_batch_results(batch_rows, + callback); + } catch (const std::exception& e) { + DFTRACER_UTILS_LOG_WARN("Server stats batch read failed for %s: %s", + idx_path.c_str(), e.what()); } - - co_await scope.join(); } 
std::uint64_t total_events = 0; @@ -600,8 +447,6 @@ static coro::CoroTask handle_stats(const HttpRequest& req, body += std::to_string(file_count); body += ",\"total_events\":"; body += std::to_string(total_events); - body += ",\"skipped_small_files\":"; - body += std::to_string(skipped_small); body += ",\"files\":["; for (std::size_t i = 0; i < all_stats.size(); ++i) { if (i > 0) body += ','; @@ -650,8 +495,6 @@ static coro::CoroTask handle_info(const HttpRequest& /*req*/, body += f.has_bloom_data ? "true" : "false"; body += ",\"has_checkpoint_index\":"; body += f.has_checkpoint_index ? "true" : "false"; - body += ",\"is_small\":"; - body += f.is_small ? "true" : "false"; if (f.min_timestamp_us > 0 || f.max_timestamp_us > 0) { body += ",\"min_timestamp_us\":"; body += std::to_string(f.min_timestamp_us); diff --git a/src/dftracer/utils/server/trace_index.cpp b/src/dftracer/utils/server/trace_index.cpp index 9ccde5d3..6744995e 100644 --- a/src/dftracer/utils/server/trace_index.cpp +++ b/src/dftracer/utils/server/trace_index.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -46,7 +45,6 @@ coro::CoroTask TraceIndex::initialize() { std::vector needs_build; std::vector large_files; - std::size_t small_count = 0; for (const auto& entry : entries) { FileInfo info; @@ -56,38 +54,21 @@ coro::CoroTask TraceIndex::initialize() { std::error_code ec; auto fsize = fs::file_size(info.path, ec); info.compressed_size = (!ec && fsize > 0) ? 
fsize : 0; - info.is_small = info.compressed_size > 0 && - info.compressed_size < INDEX_SIZE_THRESHOLD; std::size_t idx = files_.size(); path_to_index_[info.path] = idx; - if (info.is_small) { - info.has_bloom_data = false; - info.has_checkpoint_index = false; - info.size_mb = - static_cast(info.compressed_size) / (1024.0 * 1024.0); - small_count++; + info.has_bloom_data = fs::exists(info.index_path); + info.has_checkpoint_index = fs::exists(info.index_path); + if (!info.has_bloom_data) { + needs_build.push_back(idx); } else { - info.has_bloom_data = fs::exists(info.index_path); - info.has_checkpoint_index = fs::exists(info.index_path); - if (!info.has_bloom_data) { - needs_build.push_back(idx); - } else { - large_files.push_back(idx); - } + large_files.push_back(idx); } files_.push_back(std::move(info)); } - if (small_count > 0) { - DFTRACER_UTILS_LOG_INFO( - "TraceIndex: %zu small file(s) (< %zu bytes) will be " - "streamed directly (no .dftindex database)", - small_count, INDEX_SIZE_THRESHOLD); - } - if (!needs_build.empty() || !large_files.empty()) { auto pipeline_config = PipelineConfig() @@ -122,7 +103,7 @@ coro::CoroTask TraceIndex::initialize() { coro::make_channel(max_concurrent * 2); const auto* index_dir_ptr = &index_dir; - co_await ctx.scope([file_chan, files_ptr, needs_build_ptr, + co_await ctx.scope([&file_chan, files_ptr, needs_build_ptr, index_dir_ptr, max_concurrent](CoroScope& scope) -> coro::CoroTask { @@ -137,41 +118,38 @@ coro::CoroTask TraceIndex::initialize() { }); for (std::size_t w = 0; w < max_concurrent; ++w) { - scope.spawn( - [file_chan, files_ptr, index_dir_ptr]( - CoroScope&) -> coro::CoroTask { - while (auto fi_opt = - co_await file_chan->receive()) { - std::size_t fi = *fi_opt; - auto* info = &(*files_ptr)[fi]; - - indexer::IndexBuilderUtility builder; - auto config = - indexer::IndexBuildConfig::for_file( - info->path) - .with_index_dir(*index_dir_ptr) - .with_bloom(true) - .with_index_threshold(0); - auto result = - co_await 
builder.process(config); - - if (result.success) { - info->index_path = - internal::determine_index_path( - info->path, *index_dir_ptr); - info->has_bloom_data = true; - info->has_checkpoint_index = - fs::exists(info->index_path); - } else { - DFTRACER_UTILS_LOG_WARN( - "TraceIndex: failed to " - "index %s: %s", - info->path.c_str(), - result.error_message.c_str()); - } + scope.spawn([ch = file_chan->consumer(), files_ptr, + index_dir_ptr](CoroScope&) + -> coro::CoroTask { + while (auto fi_opt = co_await ch.receive()) { + std::size_t fi = *fi_opt; + auto* info = &(*files_ptr)[fi]; + + indexer::IndexBuilderUtility builder; + auto config = + indexer::IndexBuildConfig::for_file( + info->path) + .with_index_dir(*index_dir_ptr); + auto result = + co_await builder.process(config); + + if (result.success) { + info->index_path = + internal::determine_index_path( + info->path, *index_dir_ptr); + info->has_bloom_data = true; + info->has_checkpoint_index = + fs::exists(info->index_path); + } else { + DFTRACER_UTILS_LOG_WARN( + "TraceIndex: failed to " + "index %s: %s", + info->path.c_str(), + result.error_message.c_str()); } - co_return; - }); + } + co_return; + }); } co_return; }); @@ -187,7 +165,7 @@ coro::CoroTask TraceIndex::initialize() { auto meta_chan = coro::make_channel(max_concurrent * 2); - co_await ctx.scope([meta_chan, files_ptr, large_files_ptr, + co_await ctx.scope([&meta_chan, files_ptr, large_files_ptr, max_concurrent](CoroScope& scope) -> coro::CoroTask { scope.spawn( @@ -201,45 +179,31 @@ coro::CoroTask TraceIndex::initialize() { }); for (std::size_t w = 0; w < max_concurrent; ++w) { - scope.spawn([meta_chan, files_ptr](CoroScope&) + scope.spawn([ch = meta_chan->consumer(), + files_ptr](CoroScope&) -> coro::CoroTask { - while (auto fi_opt = - co_await meta_chan->receive()) { + while (auto fi_opt = co_await ch.receive()) { std::size_t fi = *fi_opt; auto* info = &(*files_ptr)[fi]; if (info->has_bloom_data) { try { - const std::string path = info->path; - 
const std::string index_path = - info->index_path; - const auto* path_ptr = &path; - const auto* index_path_ptr = - &index_path; - auto bounds = co_await rocksdb::run( - [path_ptr, index_path_ptr] { - indexer::IndexDatabase - idx_db(*index_path_ptr); - auto logical = - indexer::internal:: - get_logical_path( - *path_ptr); - int fid = - idx_db.get_file_info_id( - logical); - if (fid < 0) { - return indexer:: - IndexDatabase:: - TimeBounds{}; - } - return idx_db - .query_time_bounds(fid); - }); - if (bounds.valid) { - info->min_timestamp_us = - bounds.min_timestamp_us; - info->max_timestamp_us = - bounds.max_timestamp_us; + indexer::IndexDatabase idx_db( + info->index_path); + auto logical = indexer::internal:: + get_logical_path(info->path); + int fid = idx_db.get_file_info_id( + logical); + if (fid >= 0) { + auto bounds = + idx_db.query_time_bounds( + fid); + if (bounds.valid) { + info->min_timestamp_us = + bounds.min_timestamp_us; + info->max_timestamp_us = + bounds.max_timestamp_us; + } } } catch (const std::exception& e) { DFTRACER_UTILS_LOG_WARN( diff --git a/src/dftracer/utils/server/viz_api.cpp b/src/dftracer/utils/server/viz_api.cpp index 9917c765..99bf5329 100644 --- a/src/dftracer/utils/server/viz_api.cpp +++ b/src/dftracer/utils/server/viz_api.cpp @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include @@ -44,38 +44,45 @@ static const std::unordered_set HASH_METADATA_NAMES = {"FH", "HH", /// the original string on parse failure. 
static std::string normalize_event_ts(const std::string& event_json, std::uint64_t offset) { - auto* doc = yyjson_read(event_json.c_str(), event_json.size(), 0); - if (!doc) return event_json; - - auto* mdoc = yyjson_doc_mut_copy(doc, nullptr); - yyjson_doc_free(doc); - if (!mdoc) return event_json; - - auto* root = yyjson_mut_doc_get_root(mdoc); - if (root) { - auto* ts_val = yyjson_mut_obj_get(root, "ts"); - if (ts_val && yyjson_mut_is_uint(ts_val)) { - std::uint64_t old_ts = yyjson_mut_get_uint(ts_val); - std::uint64_t new_ts = old_ts >= offset ? old_ts - offset : 0; - yyjson_mut_set_uint(ts_val, new_ts); - } else if (ts_val && yyjson_mut_is_int(ts_val)) { - auto old_ts = - static_cast(yyjson_mut_get_int(ts_val)); - std::uint64_t new_ts = old_ts >= offset ? old_ts - offset : 0; - yyjson_mut_set_uint(ts_val, new_ts); - } + thread_local simdjson::dom::parser tl_parser; + auto result = tl_parser.parse(event_json); + if (result.error()) return event_json; + + auto root = result.value_unsafe(); + if (!root.is_object()) return event_json; + + auto ts_result = root["ts"]; + if (ts_result.error()) return event_json; + + std::uint64_t old_ts = 0; + if (ts_result.is_uint64()) { + old_ts = ts_result.get_uint64().value_unsafe(); + } else if (ts_result.is_int64()) { + auto val = ts_result.get_int64().value_unsafe(); + old_ts = val >= 0 ? static_cast(val) : 0; + } else { + return event_json; } - std::size_t len = 0; - char* json_str = yyjson_mut_write(mdoc, YYJSON_WRITE_NOFLAG, &len); - yyjson_mut_doc_free(mdoc); + std::uint64_t new_ts = old_ts >= offset ? 
old_ts - offset : 0; + + // simdjson DOM is read-only, so we need to rebuild the JSON with the new ts + // Find "ts": and replace the value + std::string modified = event_json; + auto pos = modified.find("\"ts\":"); + if (pos == std::string::npos) return event_json; - if (json_str) { - std::string result(json_str, len); - free(json_str); - return result; + pos += 5; // Skip past "ts": + while (pos < modified.size() && std::isspace(modified[pos])) ++pos; + + auto end_pos = pos; + while (end_pos < modified.size() && + (std::isdigit(modified[end_pos]) || modified[end_pos] == '-')) { + ++end_pos; } - return event_json; + + modified.replace(pos, end_pos - pos, std::to_string(new_ts)); + return modified; } /// Compute the minimum event duration threshold for a given summary level. @@ -88,10 +95,16 @@ static double duration_threshold(double begin, double end, unsigned level, (static_cast(viewport_width) * static_cast(level)); } -static std::string extract_json_value(yyjson_val* val) { - if (yyjson_is_str(val)) return yyjson_get_str(val); - if (yyjson_is_int(val)) return std::to_string(yyjson_get_int(val)); - if (yyjson_is_uint(val)) return std::to_string(yyjson_get_uint(val)); +static std::string extract_json_value(simdjson::dom::element val) { + if (val.is_string()) { + return std::string(val.get_string().value_unsafe()); + } + if (val.is_int64()) { + return std::to_string(val.get_int64().value_unsafe()); + } + if (val.is_uint64()) { + return std::to_string(val.get_uint64().value_unsafe()); + } return {}; } @@ -111,38 +124,41 @@ static void append_lane_clause(std::string& dsl, const char* field, static void apply_lanes(std::string& dsl, std::string_view lanes_str) { if (lanes_str.empty()) return; - std::string buf(lanes_str); - auto* doc = yyjson_read(buf.c_str(), buf.size(), 0); - if (!doc) return; - auto doc_guard = std::unique_ptr( - doc, yyjson_doc_free); - - yyjson_val* root = yyjson_doc_get_root(doc); - if (!root) return; - - if (yyjson_is_arr(root)) { - 
yyjson_val* item; - yyjson_arr_iter iter; - yyjson_arr_iter_init(root, &iter); - while ((item = yyjson_arr_iter_next(&iter)) != nullptr) { - if (!yyjson_is_obj(item)) continue; - auto* field_val = yyjson_obj_get(item, "field"); - if (!field_val) field_val = yyjson_obj_get(item, "fields"); - auto* value_val = yyjson_obj_get(item, "value"); - if (!field_val || !value_val) continue; - const char* field = yyjson_get_str(field_val); - if (!field) continue; - auto val = extract_json_value(value_val); + thread_local simdjson::dom::parser tl_parser; + auto result = tl_parser.parse(lanes_str.data(), lanes_str.size()); + if (result.error()) return; + + auto root = result.value_unsafe(); + + if (root.is_array()) { + auto arr = root.get_array().value_unsafe(); + for (auto item : arr) { + if (!item.is_object()) continue; + auto obj = item.get_object().value_unsafe(); + + auto field_result = obj["field"]; + if (field_result.error()) field_result = obj["fields"]; + auto value_result = obj["value"]; + if (field_result.error() || value_result.error()) continue; + + if (!field_result.value_unsafe().is_string()) continue; + const char* field = + field_result.value_unsafe().get_c_str().value_unsafe(); + auto val = extract_json_value(value_result.value_unsafe()); if (!val.empty()) append_lane_clause(dsl, field, val); } - } else if (yyjson_is_obj(root)) { - auto* field_val = yyjson_obj_get(root, "field"); - if (!field_val) field_val = yyjson_obj_get(root, "fields"); - auto* value_val = yyjson_obj_get(root, "value"); - if (field_val && value_val) { - const char* field = yyjson_get_str(field_val); - if (field) { - auto val = extract_json_value(value_val); + } else if (root.is_object()) { + auto obj = root.get_object().value_unsafe(); + + auto field_result = obj["field"]; + if (field_result.error()) field_result = obj["fields"]; + auto value_result = obj["value"]; + + if (!field_result.error() && !value_result.error()) { + if (field_result.value_unsafe().is_string()) { + const char* field 
= + field_result.value_unsafe().get_c_str().value_unsafe(); + auto val = extract_json_value(value_result.value_unsafe()); if (!val.empty()) append_lane_clause(dsl, field, val); } } @@ -152,31 +168,33 @@ static void apply_lanes(std::string& dsl, std::string_view lanes_str) { static void apply_filters(std::string& dsl, std::string_view filters_str) { if (filters_str.empty()) return; - std::string buf(filters_str); - auto* doc = yyjson_read(buf.c_str(), buf.size(), 0); - if (!doc) return; - auto doc_guard = std::unique_ptr( - doc, yyjson_doc_free); + thread_local simdjson::dom::parser tl_parser; + auto result = tl_parser.parse(filters_str.data(), filters_str.size()); + if (result.error()) return; - yyjson_val* root = yyjson_doc_get_root(doc); - if (!root || !yyjson_is_arr(root)) return; + auto root = result.value_unsafe(); + if (!root.is_array()) return; - yyjson_val* item; - yyjson_arr_iter iter; - yyjson_arr_iter_init(root, &iter); - while ((item = yyjson_arr_iter_next(&iter)) != nullptr) { - if (!yyjson_is_obj(item)) continue; + auto arr = root.get_array().value_unsafe(); + for (auto item : arr) { + if (!item.is_object()) continue; + auto obj = item.get_object().value_unsafe(); - auto* field_val = yyjson_obj_get(item, "field"); - auto* op_val = yyjson_obj_get(item, "op"); - auto* value_val = yyjson_obj_get(item, "value"); - if (!field_val || !op_val || !value_val) continue; + auto field_result = obj["field"]; + auto op_result = obj["op"]; + auto value_result = obj["value"]; + if (field_result.error() || op_result.error() || value_result.error()) + continue; + + if (!field_result.value_unsafe().is_string() || + !op_result.value_unsafe().is_string()) + continue; - const char* field = yyjson_get_str(field_val); - const char* op = yyjson_get_str(op_val); - if (!field || !op) continue; + const char* field = + field_result.value_unsafe().get_c_str().value_unsafe(); + const char* op = op_result.value_unsafe().get_c_str().value_unsafe(); - std::string val = 
extract_json_value(value_val); + std::string val = extract_json_value(value_result.value_unsafe()); if (val.empty()) continue; std::string op_str(op); @@ -209,105 +227,6 @@ static void apply_filters(std::string& dsl, std::string_view filters_str) { } } -/// Direct-scan a small file without any `.dftindex` store. -/// Streams via async_streaming_gz_lines(), parses JSON, applies -/// predicate filters, collects matching events as raw JSON strings. -static coro::CoroTask direct_scan_events( - const TraceIndex::FileInfo* file_info, const Query* query, - bool include_metadata, std::vector* collected_events, - std::uint64_t* total_scanned, std::uint64_t* total_matched, int limit) { - using dftracer::utils::utilities::fileio::lines::sources:: - async_streaming_gz_lines; - - try { - auto gen = async_streaming_gz_lines(file_info->path); - - std::unordered_map pending_metadata; - std::unordered_set emitted_hashes; - - while (auto line = co_await gen.next()) { - if (limit > 0 && - collected_events->size() >= static_cast(limit)) { - co_return; - } - if (line->content.empty()) continue; - - JsonDocGuard guard{yyjson_read_opts( - const_cast(line->content.data()), line->content.size(), - YYJSON_READ_NOFLAG, nullptr, nullptr)}; - if (!guard.doc) continue; - - yyjson_val* root = yyjson_doc_get_root(guard.doc); - if (root && yyjson_is_obj(root)) { - JsonValue json(root); - // line->content is a string_view valid only for this - // iteration. All storage into collected_events and - // pending_metadata must copy to owning std::string. 
- std::string_view ph = json["ph"].get(); - - if (ph == "M" && include_metadata) { - std::string name_str = json["name"].get(); - - if (HASH_METADATA_NAMES.count(name_str)) { - auto args = json["args"]; - if (args.exists()) { - auto val = args["value"]; - if (val.exists()) { - std::string hash_val = val.get(); - if (!emitted_hashes.count(hash_val)) { - pending_metadata[hash_val] = - std::string(line->content.data(), - line->content.size()); - } - } - } - } else { - collected_events->emplace_back(line->content.data(), - line->content.size()); - (*total_matched)++; - } - } else if (ph != "M") { - (*total_scanned)++; - if (!query || query->evaluate(json)) { - // Flush referenced hash metadata first - if (include_metadata) { - auto args = json["args"]; - if (args.exists()) { - static const char* hash_fields[] = { - "hhash", "fhash", "shash"}; - for (const char* field : hash_fields) { - auto val = args[field]; - if (!val.exists()) continue; - std::string hash_val = - val.get(); - if (emitted_hashes.count(hash_val)) - continue; - auto it = pending_metadata.find(hash_val); - if (it != pending_metadata.end()) { - collected_events->push_back( - std::move(it->second)); - (*total_matched)++; - emitted_hashes.insert(hash_val); - pending_metadata.erase(it); - } - } - } - } - collected_events->emplace_back(line->content.data(), - line->content.size()); - (*total_matched)++; - } - } - } - } - } catch (const std::exception& e) { - DFTRACER_UTILS_LOG_WARN("Direct scan failed for %s: %s", - file_info->path.c_str(), e.what()); - } - - co_return; -} - // --- GET /api/v1/viz/events --- static coro::CoroTask handle_viz_events( const HttpRequest& /*req*/, const QueryParams& params, TraceIndex& index) { @@ -401,10 +320,6 @@ static coro::CoroTask handle_viz_events( std::vector filtered; filtered.reserve(target_files.size()); for (auto* fi : target_files) { - if (fi->is_small) { - filtered.push_back(fi); - continue; - } if (fi->min_timestamp_us == 0 && fi->max_timestamp_us == 0) { 
filtered.push_back(fi); continue; @@ -417,8 +332,6 @@ static coro::CoroTask handle_viz_events( target_files = std::move(filtered); } - const Query* viz_query_ptr = view.query ? &*view.query : nullptr; - std::vector collected_events; bool truncated = false; @@ -430,63 +343,50 @@ static coro::CoroTask handle_viz_events( truncated = true; break; } - if (file_info->is_small) { - std::uint64_t scanned = 0; - std::uint64_t matched = 0; - co_await direct_scan_events( - file_info, viz_query_ptr, view.include_metadata, - &collected_events, &scanned, &matched, limit); + if (file_info->uncompressed_size == 0 && + file_info->num_checkpoints == 0) + continue; + + ViewBuilderInput builder_input; + builder_input.with_view(view) + .with_file_path(file_info->path) + .with_index_path( + file_info->has_bloom_data ? file_info->index_path : "") + .with_uncompressed_size(file_info->uncompressed_size) + .with_num_checkpoints(file_info->num_checkpoints) + .with_bloom_cache(&index.bloom_cache()) + .with_time_range(begin, end); + + ViewBuilderUtility builder; + auto build_output = co_await builder.process(builder_input); + if (!build_output.success || !build_output.file_may_match) continue; + + for (const auto& candidate : build_output.candidates) { if (limit > 0 && - static_cast(collected_events.size()) >= limit) + static_cast(collected_events.size()) >= limit) { truncated = true; - } else { - if (file_info->uncompressed_size == 0 && - file_info->num_checkpoints == 0) - continue; - - ViewBuilderInput builder_input; - builder_input.with_view(view) - .with_file_path(file_info->path) - .with_index_path( - file_info->has_bloom_data ? 
file_info->index_path : "") - .with_uncompressed_size(file_info->uncompressed_size) - .with_num_checkpoints(file_info->num_checkpoints) - .with_bloom_cache(&index.bloom_cache()) - .with_time_range(begin, end); - - ViewBuilderUtility builder; - auto build_output = co_await builder.process(builder_input); - if (!build_output.success || !build_output.file_may_match) - continue; - - for (const auto& candidate : build_output.candidates) { - if (limit > 0 && - static_cast(collected_events.size()) >= limit) { - truncated = true; - break; - } - ViewReaderInput reader_input; - reader_input.with_file_path(file_info->path) - .with_index_path(file_info->index_path) - .with_byte_range(candidate.start_byte, - candidate.end_byte) - .with_checkpoint_idx(candidate.checkpoint_idx) - .with_view(view); - - ViewReaderUtility reader; - auto gen = reader.process(reader_input); - while (auto batch = co_await gen.next()) { - for (auto& event : batch->events) { - if (limit > 0 && - static_cast(collected_events.size()) >= - limit) { - truncated = true; - break; - } - collected_events.emplace_back(event); + break; + } + ViewReaderInput reader_input; + reader_input.with_file_path(file_info->path) + .with_index_path(file_info->index_path) + .with_byte_range(candidate.start_byte, candidate.end_byte) + .with_checkpoint_idx(candidate.checkpoint_idx) + .with_view(view); + + ViewReaderUtility reader; + auto gen = reader.process(reader_input); + while (auto batch = co_await gen.next()) { + for (auto& event : batch->events) { + if (limit > 0 && + static_cast(collected_events.size()) >= + limit) { + truncated = true; + break; } - if (truncated) break; + collected_events.emplace_back(event); } + if (truncated) break; } } } @@ -520,81 +420,56 @@ static coro::CoroTask handle_viz_events( for (std::size_t w = 0; w < num_workers; ++w) { scope.spawn([file_chan, target_files_ptr, collected_mutex, - collected_ptr, viz_query_ptr, view_ptr, - bloom_cache_ptr, remaining, t_begin, - t_end](CoroScope&) -> 
coro::CoroTask { + collected_ptr, view_ptr, bloom_cache_ptr, remaining, + t_begin, t_end](CoroScope&) -> coro::CoroTask { while (auto fi_opt = co_await file_chan->receive()) { if (remaining->load(std::memory_order_relaxed) <= 0) co_return; auto* file_info = (*target_files_ptr)[*fi_opt]; - if (file_info->is_small) { - std::vector local_events; - std::uint64_t local_scanned = 0; - std::uint64_t local_matched = 0; - int local_limit = - remaining->load(std::memory_order_relaxed); - if (local_limit <= 0) co_return; - co_await direct_scan_events( - file_info, viz_query_ptr, - view_ptr->include_metadata, &local_events, - &local_scanned, &local_matched, local_limit); - if (!local_events.empty()) { - std::lock_guard lock(*collected_mutex); - for (auto& ev : local_events) { - collected_ptr->push_back(std::move(ev)); - } - remaining->fetch_sub( - static_cast(local_events.size())); - } - } else { - if (file_info->uncompressed_size == 0 && - file_info->num_checkpoints == 0) - continue; - - ViewBuilderInput builder_input; - builder_input.with_view(*view_ptr) - .with_file_path(file_info->path) - .with_index_path(file_info->has_bloom_data - ? 
file_info->index_path - : "") - .with_uncompressed_size( - file_info->uncompressed_size) - .with_num_checkpoints(file_info->num_checkpoints) - .with_bloom_cache(bloom_cache_ptr) - .with_time_range(t_begin, t_end); - - ViewBuilderUtility builder; - auto build_output = - co_await builder.process(builder_input); - if (!build_output.success || - !build_output.file_may_match) - continue; - - for (const auto& candidate : build_output.candidates) { - if (remaining->load(std::memory_order_relaxed) <= 0) - break; - - ViewReaderInput reader_input; - reader_input.with_file_path(file_info->path) - .with_index_path(file_info->index_path) - .with_byte_range(candidate.start_byte, - candidate.end_byte) - .with_checkpoint_idx(candidate.checkpoint_idx) - .with_view(*view_ptr); - - ViewReaderUtility reader; - auto gen = reader.process(reader_input); - while (auto batch = co_await gen.next()) { - if (!batch->events.empty()) { - std::lock_guard lock( - *collected_mutex); - for (auto& event : batch->events) { - collected_ptr->emplace_back(event); - } - remaining->fetch_sub( - static_cast(batch->events.size())); + if (file_info->uncompressed_size == 0 && + file_info->num_checkpoints == 0) + continue; + + ViewBuilderInput builder_input; + builder_input.with_view(*view_ptr) + .with_file_path(file_info->path) + .with_index_path(file_info->has_bloom_data + ? 
file_info->index_path + : "") + .with_uncompressed_size(file_info->uncompressed_size) + .with_num_checkpoints(file_info->num_checkpoints) + .with_bloom_cache(bloom_cache_ptr) + .with_time_range(t_begin, t_end); + + ViewBuilderUtility builder; + auto build_output = co_await builder.process(builder_input); + if (!build_output.success || !build_output.file_may_match) + continue; + + for (const auto& candidate : build_output.candidates) { + if (remaining->load(std::memory_order_relaxed) <= 0) + break; + + ViewReaderInput reader_input; + reader_input.with_file_path(file_info->path) + .with_index_path(file_info->index_path) + .with_byte_range(candidate.start_byte, + candidate.end_byte) + .with_checkpoint_idx(candidate.checkpoint_idx) + .with_view(*view_ptr); + + ViewReaderUtility reader; + auto gen = reader.process(reader_input); + while (auto batch = co_await gen.next()) { + if (!batch->events.empty()) { + std::lock_guard lock( + *collected_mutex); + for (auto& event : batch->events) { + collected_ptr->emplace_back(event); } + remaining->fetch_sub( + static_cast(batch->events.size())); } } } diff --git a/src/dftracer/utils/utilities/call_tree/call_tree.cpp b/src/dftracer/utils/utilities/call_tree/call_tree.cpp index 767af2fb..78b8620d 100644 --- a/src/dftracer/utils/utilities/call_tree/call_tree.cpp +++ b/src/dftracer/utils/utilities/call_tree/call_tree.cpp @@ -2,35 +2,61 @@ #include #include #include -#include #include #include -#include #include #include -#include -#include -#include -#include -#include #include -#include namespace dftracer::utils::call_tree { namespace internal { -/** - * Internal implementation class (PIMPL pattern) - * Hides complex CallTree internals from public API - */ +namespace { + +std::unordered_map args_to_string_map( + const ArgsMap& args) { + std::unordered_map out; + args.for_each_member( + [&](std::string_view k, + dftracer::utils::utilities::composites::dft::ArgsValueProxy v) { + std::string val; + if (v.is_string()) + val = v.get(); + 
else if (v.is_int()) + val = std::to_string(v.get()); + else if (v.is_uint()) + val = std::to_string(v.get()); + else if (v.is_number()) + val = std::to_string(v.get()); + else if (v.is_bool()) + val = v.get() ? "true" : "false"; + out.emplace(std::string(k), std::move(val)); + }); + return out; +} + +void fill_node_info(const CallTreeNode& node, CallTreeNodeInfo& info) { + info.id = node.get_id(); + info.name = std::string(node.get_name()); + info.category = std::string(node.get_category()); + info.start_time_us = node.get_start_time(); + info.duration_us = node.get_duration(); + info.level = node.get_level(); + info.parent_id = node.get_parent_id(); + info.num_children = node.get_children().size(); + info.children_ids = node.get_children(); + info.args = args_to_string_map(node.get_args()); +} + +} // namespace + class CallTreeImpl { public: CallTree graph; std::vector trace_files; std::string trace_directory; - std::string output_path; bool is_generated; CallTreeImpl() : is_generated(false) { graph.initialize(); } @@ -99,58 +125,12 @@ class CallTreeImpl { } const auto& node = it->second; - - // Add current node CallTreeNodeInfo info; - info.id = node->get_id(); - info.name = node->get_name(); - info.category = node->get_category(); - info.start_time_us = node->get_start_time(); - info.duration_us = node->get_duration(); - info.level = node->get_level(); - info.parent_id = node->get_parent_id(); - info.num_children = node->get_children().size(); - info.children_ids = node->get_children(); - info.args = node->get_args(); - - nodes.push_back(info); - - // Recursively traverse children - for (std::uint64_t child_id : node->get_children()) { - traverse_depth_first(process_graph, child_id, nodes); - } - } - - void print_node_recursive(const ProcessCallTree& process_graph, - std::uint64_t node_id, int indent, int max_depth, - std::ostream& out) const { - if (max_depth > 0 && indent >= max_depth) { - return; - } - - auto it = process_graph.calls.find(node_id); - if (it 
== process_graph.calls.end()) { - return; - } - - const auto& node = it->second; - - // Print indentation - for (int i = 0; i < indent; i++) { - out << " "; - } - - // Print node info - out << node->get_name() << " [" << node->get_category() << "] " - << "level=" << node->get_level() << " " - << "dur=" << (static_cast(node->get_duration()) / 1000.0) - << "ms " - << "children=" << node->get_children().size() << "\n"; + fill_node_info(*node, info); + nodes.push_back(std::move(info)); - // Print children for (std::uint64_t child_id : node->get_children()) { - print_node_recursive(process_graph, child_id, indent + 1, max_depth, - out); + traverse_depth_first(process_graph, child_id, nodes); } } @@ -174,9 +154,11 @@ class CallTreeImpl { } // Print node info - printf("%s [%s] level=%d dur=%.3fms children=%zu\n", - node->get_name().c_str(), node->get_category().c_str(), - node->get_level(), + auto nm = node->get_name(); + auto ct = node->get_category(); + printf("%.*s [%.*s] level=%d dur=%.3fms children=%zu\n", + static_cast(nm.size()), nm.data(), + static_cast(ct.size()), ct.data(), node->get_level(), static_cast(node->get_duration()) / 1000.0, node->get_children().size()); @@ -243,11 +225,6 @@ bool CallTree::load_from_directory(const std::string& trace_dir, if (found) { DFTRACER_UTILS_LOG_INFO("Found %zu trace files in %s", impl_->trace_files.size(), trace_dir.c_str()); - - // Set default output path - fs::path dir_path(trace_dir); - std::string dir_name = dir_path.filename().string(); - impl_->output_path = dir_name + ".calltree"; } return found; @@ -305,44 +282,6 @@ void CallTree::print_depth_first(int max_depth) const { } } -bool CallTree::print_depth_first_to_file(const std::string& filename, - int max_depth) const { - if (!impl_->is_generated) { - DFTRACER_UTILS_LOG_ERROR( - "%s", "Call tree not generated. 
Call generate() first."); - return false; - } - - std::ofstream file(filename); - if (!file.is_open()) { - DFTRACER_UTILS_LOG_ERROR("Cannot open file for writing: %s", - filename.c_str()); - return false; - } - - auto keys = impl_->graph.keys(); - - for (const auto& key : keys) { - auto* process_graph = impl_->graph.get(key); - if (!process_graph) continue; - - file << "\n=== Process/Thread: PID=" << key.pid << ", TID=" << key.tid - << ", Node=" << key.node_id << " ===" << std::endl; - file << "Total nodes: " << process_graph->calls.size() << std::endl; - file << "Root calls: " << process_graph->root_calls.size() << std::endl; - file << std::endl; - - for (std::uint64_t root_id : process_graph->root_calls) { - impl_->print_node_recursive(*process_graph, root_id, 0, max_depth, - file); - } - } - - file.close(); - DFTRACER_UTILS_LOG_INFO("Call tree printed to: %s", filename.c_str()); - return true; -} - std::vector CallTree::get_nodes_depth_first() const { std::vector all_nodes; @@ -366,241 +305,6 @@ std::vector CallTree::get_nodes_depth_first() const { return all_nodes; } -std::string CallTree::get_output_path() const { return impl_->output_path; } - -void CallTree::set_output_path(const std::string& path) { - impl_->output_path = path; -} - -bool CallTree::save_to_file(const std::string& filename) const { - if (!impl_->is_generated) { - DFTRACER_UTILS_LOG_ERROR( - "%s", "Call tree not generated. Call generate() first."); - return false; - } - - std::string output_file = filename.empty() ? 
impl_->output_path : filename; - - std::ofstream file(output_file, std::ios::binary); - if (!file.is_open()) { - DFTRACER_UTILS_LOG_ERROR("Cannot open file for writing: %s", - output_file.c_str()); - return false; - } - - // Write header - const char magic[8] = {'C', 'A', 'L', 'L', 'T', 'R', 'E', 'E'}; - file.write(magic, 8); - - std::uint32_t version = 1; - file.write(reinterpret_cast(&version), sizeof(version)); - - // Get nodes in depth-first order - auto nodes = get_nodes_depth_first(); - - std::uint64_t num_nodes = nodes.size(); - file.write(reinterpret_cast(&num_nodes), sizeof(num_nodes)); - - // Write each node - for (const auto& node : nodes) { - file.write(reinterpret_cast(&node.id), sizeof(node.id)); - - std::uint32_t name_len = static_cast(node.name.size()); - file.write(reinterpret_cast(&name_len), sizeof(name_len)); - file.write(node.name.data(), name_len); - - std::uint32_t cat_len = - static_cast(node.category.size()); - file.write(reinterpret_cast(&cat_len), sizeof(cat_len)); - file.write(node.category.data(), cat_len); - - file.write(reinterpret_cast(&node.start_time_us), - sizeof(node.start_time_us)); - file.write(reinterpret_cast(&node.duration_us), - sizeof(node.duration_us)); - file.write(reinterpret_cast(&node.level), - sizeof(node.level)); - file.write(reinterpret_cast(&node.parent_id), - sizeof(node.parent_id)); - - std::uint64_t num_children = node.num_children; - file.write(reinterpret_cast(&num_children), - sizeof(num_children)); - } - - file.close(); - DFTRACER_UTILS_LOG_INFO("Call tree saved to: %s", output_file.c_str()); - DFTRACER_UTILS_LOG_INFO(" Nodes written: %zu", nodes.size()); - - return true; -} - -bool CallTree::save_to_json(const std::string& filename) const { - if (!impl_->is_generated) { - DFTRACER_UTILS_LOG_ERROR( - "%s", "Call tree not generated. 
Call generate() first."); - return false; - } - - // Determine output file - use .pfw extension for compatibility with - // DFTracer tools - std::string output_file = filename; - if (output_file.empty()) { - // Replace .calltree extension with .pfw if present, otherwise append - std::string base = impl_->output_path; - if (base.size() >= 9 && base.substr(base.size() - 9) == ".calltree") { - base = base.substr(0, base.size() - 9); - } - output_file = base + ".pfw"; - } - - std::ofstream file(output_file); - if (!file.is_open()) { - DFTRACER_UTILS_LOG_ERROR("Cannot open file for writing: %s", - output_file.c_str()); - return false; - } - - DFTRACER_UTILS_LOG_INFO( - "%s", "Serializing call tree to JSON (Chrome Tracing format)..."); - - // Create JSON serializer - internal::JsonSerializer serializer; - - // Buffer for serialization (16KB should be enough for most events) - const size_t BUFFER_SIZE = 16384; - char buffer[BUFFER_SIZE]; - - // Get hostname for identification - char hostname[256]; - gethostname(hostname, sizeof(hostname)); - std::string hostname_hash = std::string(hostname); - - // Write opening bracket - size_t written = serializer.initialize(buffer, hostname_hash); - file.write(buffer, written); - - // Write metadata events for file header - std::time_t now = std::time(nullptr); - char timestamp[256]; - std::strftime(timestamp, sizeof(timestamp), "%Y-%m-%d %H:%M:%S", - std::localtime(&now)); - - written = serializer.serialize_metadata(buffer, "timestamp", timestamp, "M", - 0, 0, true); - file.write(buffer, written - 1); // Don't write the newline yet - file.write(",\n", 2); // Write comma separator - - written = serializer.serialize_metadata(buffer, "format", "call_tree", "M", - 0, 0, true); - file.write(buffer, written - 1); - file.write(",\n", 2); - - // Get all process keys - auto keys = impl_->graph.keys(); - - // Track event index (similar to DFTracer) - int event_index = 0; - size_t total_events = 0; - - // Iterate over all processes/threads - for 
(const auto& key : keys) { - auto* process_graph = impl_->graph.get(key); - if (!process_graph) continue; - - // Traverse and serialize nodes in depth-first order - for (std::uint64_t root_id : process_graph->root_calls) { - std::vector stack; - stack.push_back(root_id); - - while (!stack.empty()) { - std::uint64_t node_id = stack.back(); - stack.pop_back(); - - auto it = process_graph->calls.find(node_id); - if (it == process_graph->calls.end()) continue; - - const auto& node = it->second; - - // Serialize this node - written = serializer.serialize_node(buffer, event_index++, - *node, key.pid, key.tid); - - // Write to file with comma separator (except last event) - file.write(buffer, written - 1); // Don't write newline - - // Add children to stack in reverse order for depth-first - const auto& children = node->get_children(); - for (auto child_it = children.rbegin(); - child_it != children.rend(); ++child_it) { - stack.push_back(*child_it); - } - - // Write comma separator for next event - file.write(",\n", 2); - total_events++; - } - } - } - - // Write closing bracket (overwrites the last comma) - file.seekp(-2, std::ios::cur); // Back up over ",\n" - file.write("\n", 1); // Just write newline - - written = serializer.finalize(buffer, true); - file.write(buffer, written); - - file.close(); - - DFTRACER_UTILS_LOG_INFO("Call tree saved to JSON: %s", output_file.c_str()); - DFTRACER_UTILS_LOG_INFO(" Total events: %zu", total_events); - DFTRACER_UTILS_LOG_INFO(" Unique processes: %zu", keys.size()); - DFTRACER_UTILS_LOG_INFO( - "%s", " Format: Chrome Tracing (compatible with Perfetto)"); - - return true; -} - -bool CallTree::load_from_file(const std::string& filename) { - std::ifstream file(filename, std::ios::binary); - if (!file.is_open()) { - DFTRACER_UTILS_LOG_ERROR("Cannot open file for reading: %s", - filename.c_str()); - return false; - } - - // Read and verify header - char magic[8]; - file.read(magic, 8); - - if (std::memcmp(magic, "CALLTREE", 8) != 0) { - 
DFTRACER_UTILS_LOG_ERROR("%s", "Invalid file format"); - return false; - } - - std::uint32_t version; - file.read(reinterpret_cast(&version), sizeof(version)); - - if (version != 1) { - DFTRACER_UTILS_LOG_ERROR("Unsupported version: %u", version); - return false; - } - - std::uint64_t num_nodes; - file.read(reinterpret_cast(&num_nodes), sizeof(num_nodes)); - - DFTRACER_UTILS_LOG_INFO("Loading %lu nodes from %s", - (unsigned long)num_nodes, filename.c_str()); - - // Note: This is a simplified load that just verifies the file - // Full reconstruction would require rebuilding the CallTree structure - - file.close(); - DFTRACER_UTILS_LOG_INFO("%s", "Call tree file validated successfully"); - - return true; -} - CallTreeStats CallTree::get_statistics() const { CallTreeStats stats; @@ -687,6 +391,11 @@ void CallTree::print_statistics() const { bool CallTree::is_generated() const { return impl_->is_generated; } +internal::CallTree& CallTree::internal_tree() { return impl_->graph; } +const internal::CallTree& CallTree::internal_tree() const { + return impl_->graph; +} + size_t CallTree::get_num_trace_files() const { return impl_->trace_files.size(); } @@ -696,7 +405,6 @@ void CallTree::clear() { impl_->graph.initialize(); impl_->trace_files.clear(); impl_->trace_directory.clear(); - impl_->output_path.clear(); impl_->is_generated = false; } @@ -758,18 +466,8 @@ std::vector CallTree::get_root_nodes( const auto& node = it->second; CallTreeNodeInfo info; - info.id = node->get_id(); - info.name = node->get_name(); - info.category = node->get_category(); - info.start_time_us = node->get_start_time(); - info.duration_us = node->get_duration(); - info.level = node->get_level(); - info.parent_id = node->get_parent_id(); - info.num_children = node->get_children().size(); - info.children_ids = node->get_children(); - info.args = node->get_args(); - - root_nodes.push_back(info); + internal::fill_node_info(*node, info); + root_nodes.push_back(std::move(info)); } } @@ -798,17 +496,7 @@ 
CallTreeNodeInfo CallTree::get_node_by_id(std::uint64_t id) const { const auto& node = it->second; CallTreeNodeInfo info; - info.id = node->get_id(); - info.name = node->get_name(); - info.category = node->get_category(); - info.start_time_us = node->get_start_time(); - info.duration_us = node->get_duration(); - info.level = node->get_level(); - info.parent_id = node->get_parent_id(); - info.num_children = node->get_children().size(); - info.children_ids = node->get_children(); - info.args = node->get_args(); - + internal::fill_node_info(*node, info); return info; } } diff --git a/src/dftracer/utils/utilities/call_tree/call_tree_internal.cpp b/src/dftracer/utils/utilities/call_tree/call_tree_internal.cpp index a81d2c23..b8299b19 100644 --- a/src/dftracer/utils/utilities/call_tree/call_tree_internal.cpp +++ b/src/dftracer/utils/utilities/call_tree/call_tree_internal.cpp @@ -5,19 +5,17 @@ #include #include #include -#include #include +#include #include -#include -#include -#include -#include -#include +#include +#include +#include +#include #include #include -#include -#include +#include namespace dftracer::utils::call_tree { namespace internal { @@ -39,8 +37,8 @@ CallTreeNode::CallTreeNode() initialized_(false), cleaned_up_(false) {} -CallTreeNode::CallTreeNode(std::uint64_t id, const std::string& name, - const std::string& category) +CallTreeNode::CallTreeNode(std::uint64_t id, std::string_view name, + std::string_view category) : id_(id), name_(name), category_(category), @@ -57,10 +55,9 @@ CallTreeNode::~CallTreeNode() { if (!cleaned_up_) { cleanup(); } - // Clear all state id_ = 0; - name_.clear(); - category_.clear(); + name_ = {}; + category_ = {}; start_time_ = 0; duration_ = 0; level_ = 0; @@ -125,8 +122,8 @@ CallTreeNode& CallTreeNode::operator=(CallTreeNode&& other) noexcept { return *this; } -void CallTreeNode::initialize(std::uint64_t id, const std::string& name, - const std::string& category, +void CallTreeNode::initialize(std::uint64_t id, 
std::string_view name, + std::string_view category, std::uint64_t start_time, std::uint64_t duration, int level) { id_ = id; @@ -146,13 +143,10 @@ void CallTreeNode::cleanup() { if (cleaned_up_) { return; } - - // Clear containers to free memory args_.clear(); children_.clear(); - name_.clear(); - category_.clear(); - + name_ = {}; + category_ = {}; cleaned_up_ = true; } @@ -200,309 +194,126 @@ void CallTreeFactory::cleanup() { } std::shared_ptr CallTreeFactory::create_node( - std::uint64_t id, const std::string& name, const std::string& category, - std::uint64_t start_time, std::uint64_t duration, int level, - const std::unordered_map& args) { + std::uint64_t id, std::string_view name, std::string_view category, + std::uint64_t start_time, std::uint64_t duration, int level, ArgsMap args) { auto node = std::make_shared(id, name, category); node->initialize(id, name, category, start_time, duration, level); - node->set_args(args); - - // Track the node for cleanup + node->set_args(std::move(args)); managed_nodes_.push_back(node); node_count_++; - return node; } // ============================================================================ -// TraceLineProcessor - LineProcessor for parsing trace events +// TraceReader Implementation (delegates to utilities::reader::TraceReader) // ============================================================================ -class TraceLineProcessor - : public dftracer::utils::utilities::reader::internal::LineProcessor { - public: - TraceLineProcessor(TraceReader& reader, CallTree& graph) - : reader_(reader), - graph_(graph), - line_count_(0), - processed_(0), - report_interval_(10000) {} - - coro::CoroTask process(const char* data, - std::size_t length) override { - line_count_++; - - // Progress indicator - if (line_count_ % report_interval_ == 0) { - DFTRACER_UTILS_LOG_DEBUG(" processed %zu lines, %zu traces...", - line_count_, processed_); - } - - // Skip empty lines, brackets - if (length == 0) { - co_return true; - } +namespace 
{ - std::string line(data, length); +using dftracer::utils::utilities::common::json::JsonParser; +using dftracer::utils::utilities::composites::dft::DFTracerEvent; - // Skip brackets - if (line == "[" || line == "]") { - co_return true; - } - - // Remove trailing comma - if (!line.empty() && line.back() == ',') { - line.pop_back(); - } - - if (reader_.process_trace_line(line, graph_)) { - processed_++; - } - - co_return true; // Continue processing - } - - void end() override { - DFTRACER_UTILS_LOG_INFO( - "processed %zu trace entries from %zu total lines", processed_, - line_count_); - } - - std::size_t get_processed_count() const { return processed_; } - - private: - TraceReader& reader_; - CallTree& graph_; - std::size_t line_count_; - std::size_t processed_; - std::size_t report_interval_; +struct ParsedEvent { + bool parsed = false; + bool filtered = false; }; -// ============================================================================ -// TraceReader Implementation -// ============================================================================ - -bool TraceReader::read(const std::string& trace_file, CallTree& graph) { - DFTRACER_UTILS_LOG_INFO("reading trace file: %s", trace_file.c_str()); - - // Try to use Reader API first (for compressed files, tar.gz, etc.) - if (read_with_reader(trace_file, graph)) { - return true; - } - - // Fallback to direct reading for plain text files - return read_direct(trace_file, graph); +dftracer::utils::StringIntern& name_intern() { + static dftracer::utils::StringIntern instance; + return instance; } -bool TraceReader::read_with_reader(const std::string& trace_file, - CallTree& graph) { - try { - // Detect file format - auto format = dftracer::utils::FormatDetector::detect(trace_file); - - // For GZIP files, skip Reader API and use direct zlib decompression - // since this path expects a prebuilt `.dftindex` store. 
- if (format == dftracer::utils::ArchiveFormat::GZIP) { - return false; // Will trigger fallback to read_direct which handles - // gzip - } - - // Check if format is supported by Reader - if (!dftracer::utils::utilities::reader::internal::ReaderFactory:: - is_format_supported(format)) { - // Not supported, will use fallback - return false; - } - - std::string index_path = dftracer::utils::utilities::composites::dft:: - internal::determine_index_path(trace_file, ""); - - // Create reader (this will auto-build index if needed) - auto reader = - dftracer::utils::utilities::reader::internal::ReaderFactory::create( - trace_file, index_path); - if (!reader || !reader->is_valid()) { - DFTRACER_UTILS_LOG_ERROR("Failed to create reader for %s", - trace_file.c_str()); - return false; - } - - DFTRACER_UTILS_LOG_INFO("Using Reader API for %s (format: %s)", - trace_file.c_str(), - reader->get_format_name().c_str()); - - // Create line processor - TraceLineProcessor processor(*this, graph); +ParsedEvent ingest_event(JsonParser& parser, CallTree& graph, + const std::set* allowed_pids) { + ParsedEvent out; - // Read all lines using line processor - std::size_t num_lines = reader->get_num_lines(); - if (num_lines > 0) { - reader->read_lines_with_processor(1, num_lines, processor); - } + DFTracerEvent ev; + if (!DFTracerEvent::parse_ondemand(parser, ev)) return out; + out.parsed = true; - return true; - - } catch (const std::exception& e) { - DFTRACER_UTILS_LOG_ERROR("Reader API failed for %s: %s", - trace_file.c_str(), e.what()); - return false; + if (allowed_pids && allowed_pids->find(static_cast( + ev.pid)) == allowed_pids->end()) { + out.filtered = true; + return out; } -} - -bool TraceReader::read_direct(const std::string& trace_file, CallTree& graph) { - // Detect file format to see if we need decompression - ArchiveFormat format = FormatDetector::detect(trace_file); - // Handle gzip files with zlib - if (format == ArchiveFormat::GZIP) { - DFTRACER_UTILS_LOG_INFO("Using zlib 
decompression for %s", - trace_file.c_str()); - - gzFile gz = gzopen(trace_file.c_str(), "rb"); - if (!gz) { - DFTRACER_UTILS_LOG_ERROR("Cannot open gzip file: %s", - trace_file.c_str()); - return false; - } + if (!ev.is_complete()) return out; - char buffer[65536]; - std::string current_line; - size_t line_count = 0; - size_t processed = 0; - size_t report_interval = 10000; - - while (true) { - int bytes_read = gzread(gz, buffer, sizeof(buffer) - 1); - if (bytes_read <= 0) { - // Process any remaining line - if (!current_line.empty()) { - line_count++; - if (!current_line.empty() && current_line != "[" && - current_line != "]") { - if (current_line.back() == ',') current_line.pop_back(); - if (process_trace_line(current_line, graph)) { - processed++; - } - } - } - break; - } - - buffer[bytes_read] = '\0'; - current_line += buffer; - - // Process complete lines - size_t pos; - while ((pos = current_line.find('\n')) != std::string::npos) { - std::string line = current_line.substr(0, pos); - current_line = current_line.substr(pos + 1); - line_count++; + int level = 0; + std::uint32_t tid = 0; + std::uint32_t node_id = 0; + if (auto p = ev.args["level"]) + level = static_cast(p.get()); + if (auto p = ev.args["tid"]) + tid = static_cast(p.get()); + if (auto p = ev.args["node_id"]) + node_id = static_cast(p.get()); - if (line_count % report_interval == 0) { - DFTRACER_UTILS_LOG_DEBUG( - " processed %zu lines, %zu traces...", line_count, - processed); - } + auto name_sv = name_intern().intern(ev.name); + auto cat_sv = name_intern().intern(ev.cat); - if (line.empty() || line == "[" || line == "]") continue; - if (!line.empty() && line.back() == ',') line.pop_back(); + ProcessKey key(static_cast(ev.pid), tid, node_id); + auto call = graph.get_factory().create_node( + ev.id, name_sv, cat_sv, ev.ts, ev.dur, level, std::move(ev.args)); + graph.add_call(key, call); + return out; +} - if (process_trace_line(line, graph)) { - processed++; - } - } - } +} // namespace - 
gzclose(gz); - DFTRACER_UTILS_LOG_INFO("processed %zu trace entries from %zu lines", - processed, line_count); - return true; - } +coro::CoroTask read_trace_file_async( + std::string trace_file, CallTree* graph, + const std::set* allowed_pids) { + using dftracer::utils::utilities::reader::ReadConfig; + using dftracer::utils::utilities::reader::TraceReader; + using dftracer::utils::utilities::reader::TraceReaderConfig; - // Handle tar.gz - not supported without indexer - if (format == ArchiveFormat::TAR_GZ) { - DFTRACER_UTILS_LOG_ERROR("Cannot read tar.gz file without index: %s", - trace_file.c_str()); - DFTRACER_UTILS_LOG_ERROR("%s", - "Please create an index using dftracer_map"); - return false; - } + ReadCounts counts; - // Plain text file - DFTRACER_UTILS_LOG_INFO("Using direct file reading for %s", - trace_file.c_str()); + TraceReaderConfig cfg; + cfg.file_path = trace_file; + cfg.auto_build_index = true; + TraceReader reader(std::move(cfg)); - std::ifstream file(trace_file); - if (!file.is_open()) { - DFTRACER_UTILS_LOG_ERROR("cant open trace file: %s", - trace_file.c_str()); - return false; + auto gen = reader.read_json(ReadConfig{}); + while (auto opt = co_await gen.next()) { + auto res = ingest_event(*opt->parser, *graph, allowed_pids); + if (res.filtered) + counts.filtered++; + else if (res.parsed) + counts.processed++; } - std::string line; - size_t line_count = 0; - size_t processed = 0; - size_t report_interval = 10000; - - while (std::getline(file, line)) { - line_count++; - - // progress indicator - if (line_count % report_interval == 0) { - DFTRACER_UTILS_LOG_DEBUG(" processed %zu lines, %zu traces...", - line_count, processed); - } - - // skip brackets and empty lines - if (line.empty() || line == "[" || line == "]") { - continue; - } - - // remove trailing comma - if (!line.empty() && line.back() == ',') { - line.pop_back(); - } - - if (process_trace_line(line, graph)) { - processed++; - } else { - // Don't spam errors for metadata entries - if 
(line_count < 10) { - DFTRACER_UTILS_LOG_ERROR("failed to parse line %zu in %s", - line_count, trace_file.c_str()); - } - } - } + co_return counts; +} - DFTRACER_UTILS_LOG_INFO("processed %zu trace entries from %s", processed, - trace_file.c_str()); +ReadCounts read_trace_file(const std::string& trace_file, CallTree& graph, + const std::set* allowed_pids) { + return read_trace_file_async(trace_file, &graph, allowed_pids).get(); +} +bool TraceReader::read(const std::string& trace_file, CallTree& graph) { + DFTRACER_UTILS_LOG_INFO("reading trace file: %s", trace_file.c_str()); + auto counts = read_trace_file(trace_file, graph, nullptr); + DFTRACER_UTILS_LOG_INFO("processed %zu trace entries from %s", + counts.processed, trace_file.c_str()); return true; } bool TraceReader::read_multiple(const std::vector& trace_files, CallTree& graph) { - bool all_success = true; - DFTRACER_UTILS_LOG_INFO("reading %zu trace files...", trace_files.size()); - - size_t file_num = 0; - (void)file_num; + bool all_success = true; for (const auto& file : trace_files) { - file_num++; - DFTRACER_UTILS_LOG_DEBUG("[%zu/%zu] ", file_num, trace_files.size()); if (!read(file, graph)) { DFTRACER_UTILS_LOG_ERROR("failed to read: %s", file.c_str()); all_success = false; } } - - // build parent child relationships after all traces loaded DFTRACER_UTILS_LOG_INFO( "building call hierarchy for %zu process/thread/node combinations...", graph.size()); graph.build_hierarchy(); - return all_success; } @@ -515,17 +326,12 @@ bool TraceReader::read_directory(const std::string& directory, } std::vector trace_files; - - // collect all matching files for (const auto& entry : fs::directory_iterator(directory)) { - if (entry.is_regular_file()) { - std::string filename = entry.path().filename().string(); - - // simple pattern matching (for now, just check file extension) - if (pattern == "*" || - filename.find(pattern.substr(1)) != std::string::npos) { - trace_files.push_back(entry.path().string()); - } + if 
(!entry.is_regular_file()) continue; + std::string filename = entry.path().filename().string(); + if (pattern == "*" || + filename.find(pattern.substr(1)) != std::string::npos) { + trace_files.push_back(entry.path().string()); } } @@ -535,108 +341,21 @@ bool TraceReader::read_directory(const std::string& directory, return false; } - // sort files for consistent processing order std::sort(trace_files.begin(), trace_files.end()); - DFTRACER_UTILS_LOG_INFO("found %zu trace files in %s", trace_files.size(), directory.c_str()); - return read_multiple(trace_files, graph); } -bool TraceReader::process_trace_line(const std::string& line, CallTree& graph) { - yyjson_doc* doc = yyjson_read(line.c_str(), line.length(), 0); - if (!doc) { - return false; - } - - yyjson_val* root = yyjson_doc_get_root(doc); - if (!root) { - yyjson_doc_free(doc); - return false; - } - - // get basic fields - yyjson_val* id_val = yyjson_obj_get(root, "id"); - yyjson_val* name_val = yyjson_obj_get(root, "name"); - yyjson_val* cat_val = yyjson_obj_get(root, "cat"); - yyjson_val* pid_val = yyjson_obj_get(root, "pid"); - yyjson_val* ph_val = yyjson_obj_get(root, "ph"); - yyjson_val* ts_val = yyjson_obj_get(root, "ts"); - yyjson_val* dur_val = yyjson_obj_get(root, "dur"); - yyjson_val* args_val = yyjson_obj_get(root, "args"); - - // skip metadata entries - if (!ph_val || !yyjson_is_str(ph_val) || - strcmp(yyjson_get_str(ph_val), "X") != 0) { - yyjson_doc_free(doc); - return true; // not an error just skip - } - - if (!id_val || !name_val || !pid_val || !ts_val) { - yyjson_doc_free(doc); - return false; - } - - std::uint64_t call_id = yyjson_get_uint(id_val); - std::uint64_t pid = yyjson_get_uint(pid_val); - std::string name = yyjson_get_str(name_val); - std::string category = cat_val ? yyjson_get_str(cat_val) : ""; - std::uint64_t start_time = yyjson_get_uint(ts_val); - std::uint64_t duration = dur_val ? 
yyjson_get_uint(dur_val) : 0; - - // get level, tid, and node_id from args - int level = 0; - std::uint32_t tid = 0; - std::uint32_t node_id = 0; - - // Collect all args - std::unordered_map args; - - if (args_val && yyjson_is_obj(args_val)) { - yyjson_val* level_val = yyjson_obj_get(args_val, "level"); - if (level_val) { - level = yyjson_get_int(level_val); - } - - yyjson_val* tid_val = yyjson_obj_get(args_val, "tid"); - if (tid_val) { - tid = static_cast(yyjson_get_uint(tid_val)); - } - - yyjson_val* node_val = yyjson_obj_get(args_val, "node_id"); - if (node_val) { - node_id = static_cast(yyjson_get_uint(node_val)); - } - - // Store all args - yyjson_obj_iter iter; - yyjson_obj_iter_init(args_val, &iter); - yyjson_val *arg_key, *arg_val; - while ((arg_key = yyjson_obj_iter_next(&iter))) { - arg_val = yyjson_obj_iter_get_val(arg_key); - if (yyjson_is_str(arg_val)) { - args[yyjson_get_str(arg_key)] = yyjson_get_str(arg_val); - } else if (yyjson_is_int(arg_val)) { - args[yyjson_get_str(arg_key)] = - std::to_string(yyjson_get_int(arg_val)); - } else if (yyjson_is_uint(arg_val)) { - args[yyjson_get_str(arg_key)] = - std::to_string(yyjson_get_uint(arg_val)); - } - } - } - - // Create function call using factory - ProcessKey key(static_cast(pid), tid, node_id); - auto call = graph.get_factory().create_node( - call_id, name, category, start_time, duration, level, args); - - // Add call to graph - graph.add_call(key, call); +bool TraceReader::process_trace_line(JsonParser& parser, CallTree& graph) { + auto res = ingest_event(parser, graph, nullptr); + return res.parsed; +} - yyjson_doc_free(doc); - return true; +bool TraceReader::process_trace_line(const std::string& line, CallTree& graph) { + JsonParser parser; + if (!parser.parse(line)) return false; + return process_trace_line(parser, graph); } // ============================================================================ @@ -705,6 +424,25 @@ bool CallTree::load(const std::string& trace_file) { return 
reader.read(trace_file, *this); } +void CallTree::merge_from(CallTree&& other) { + for (auto& [key, src_graph] : other.process_graphs_) { + if (!src_graph) continue; + auto it = process_graphs_.find(key); + if (it == process_graphs_.end()) { + process_graphs_.emplace(key, std::move(src_graph)); + } else { + auto& dst = *it->second; + for (auto& [id, node] : src_graph->calls) { + dst.calls[id] = std::move(node); + } + dst.call_sequence.insert(dst.call_sequence.end(), + src_graph->call_sequence.begin(), + src_graph->call_sequence.end()); + } + } + other.process_graphs_.clear(); +} + void CallTree::add_call(const ProcessKey& key, std::shared_ptr call) { // make sure process graph exists @@ -755,43 +493,57 @@ void CallTree::build_hierarchy_internal(ProcessCallTree* graph) { sorted_calls.push_back(call); } - // sort by start time to build hierarchy std::sort(sorted_calls.begin(), sorted_calls.end(), [](const auto& a, const auto& b) { - return a->get_start_time() < b->get_start_time(); + std::uint64_t a_start = a->get_start_time(); + std::uint64_t b_start = b->get_start_time(); + if (a_start != b_start) return a_start < b_start; + std::uint64_t a_end = a_start + a->get_duration(); + std::uint64_t b_end = b_start + b->get_duration(); + if (a_end != b_end) return a_end > b_end; + return a->get_level() < b->get_level(); }); - // find parents for each call + struct OpenEntry { + std::uint64_t end_time; + std::uint64_t id; + }; + std::vector> open_by_level; + for (auto& call : sorted_calls) { - bool found_parent = false; - - // look for parent that contains this call - for (auto& potential_parent : sorted_calls) { - if (potential_parent->get_id() == call->get_id()) continue; - - std::uint64_t parent_end = potential_parent->get_start_time() + - potential_parent->get_duration(); - - // check if call is inside parent timespan and level is correct - if (call->get_start_time() >= potential_parent->get_start_time() && - (call->get_start_time() + call->get_duration()) <= parent_end 
&& - call->get_level() > potential_parent->get_level()) { - // find closest parent by level - if (!found_parent || - potential_parent->get_level() > - graph->calls[call->get_parent_id()]->get_level()) { - call->set_parent_id(potential_parent->get_id()); - found_parent = true; + const std::uint64_t call_start = call->get_start_time(); + const std::uint64_t call_end = call_start + call->get_duration(); + const int call_level = call->get_level(); + + std::uint64_t parent_id = 0; + int probe_max = + std::min(call_level, static_cast(open_by_level.size())) - + 1; + for (int lvl = probe_max; lvl >= 0; --lvl) { + auto& stack = open_by_level[lvl]; + while (!stack.empty() && stack.back().end_time < call_start) { + stack.pop_back(); + } + for (auto sit = stack.rbegin(); sit != stack.rend(); ++sit) { + if (sit->end_time >= call_end) { + parent_id = sit->id; + break; } } + if (parent_id != 0) break; } - // add to parent children or root - if (found_parent) { - graph->calls[call->get_parent_id()]->add_child(call->get_id()); + if (parent_id != 0) { + call->set_parent_id(parent_id); + graph->calls[parent_id]->add_child(call->get_id()); } else { graph->root_calls.push_back(call->get_id()); } + + if (call_level >= static_cast(open_by_level.size())) { + open_by_level.resize(call_level + 1); + } + open_by_level[call_level].push_back({call_end, call->get_id()}); } } @@ -870,9 +622,11 @@ void CallTree::print_calls_recursive(const ProcessCallTree& graph, } // print call info - printf("%s [%s] level=%d dur=%luus ts=%lu\n", call->get_name().c_str(), - call->get_category().c_str(), call->get_level(), - (unsigned long)call->get_duration(), + auto nm = call->get_name(); + auto ct = call->get_category(); + printf("%.*s [%.*s] level=%d dur=%luus ts=%lu\n", + static_cast(nm.size()), nm.data(), static_cast(ct.size()), + ct.data(), call->get_level(), (unsigned long)call->get_duration(), (unsigned long)call->get_start_time()); // print children diff --git 
a/src/dftracer/utils/utilities/call_tree/call_tree_mpi.cpp b/src/dftracer/utils/utilities/call_tree/call_tree_mpi.cpp index 8a6fe6de..dd9fe8cc 100644 --- a/src/dftracer/utils/utilities/call_tree/call_tree_mpi.cpp +++ b/src/dftracer/utils/utilities/call_tree/call_tree_mpi.cpp @@ -2,1188 +2,493 @@ #include #include #include -#include +#include #include -#include -#include -#include -#include -#include -#include +#include #include -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include -#include +#include +#include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include namespace dftracer::utils::call_tree { -// ============================================================================ -// Serialization Utilities -// ============================================================================ - -namespace serialization { - -void write_uint32(std::vector& buffer, std::uint32_t value) { - buffer.insert(buffer.end(), reinterpret_cast(&value), - reinterpret_cast(&value) + sizeof(value)); -} - -void write_uint64(std::vector& buffer, std::uint64_t value) { - buffer.insert(buffer.end(), reinterpret_cast(&value), - reinterpret_cast(&value) + sizeof(value)); -} - -void write_int(std::vector& buffer, int value) { - buffer.insert(buffer.end(), reinterpret_cast(&value), - reinterpret_cast(&value) + sizeof(value)); -} - -void write_string(std::vector& buffer, const std::string& str) { - std::uint32_t len = static_cast(str.size()); - write_uint32(buffer, len); - buffer.insert(buffer.end(), str.begin(), str.end()); -} - -std::uint32_t read_uint32(const char* data, size_t& offset) { - std::uint32_t value; - std::memcpy(&value, data + offset, sizeof(value)); - offset += sizeof(value); - return value; -} - -std::uint64_t read_uint64(const char* data, size_t& offset) { - std::uint64_t value; - std::memcpy(&value, data + 
offset, sizeof(value)); - offset += sizeof(value); - return value; -} - -int read_int(const char* data, size_t& offset) { - int value; - std::memcpy(&value, data + offset, sizeof(value)); - offset += sizeof(value); - return value; -} - -std::string read_string(const char* data, size_t& offset) { - std::uint32_t len = read_uint32(data, offset); - std::string str(data + offset, len); - offset += len; - return str; -} - -} // namespace serialization - -// ============================================================================ -// SerializableCallNode Implementation -// ============================================================================ - -std::vector SerializableCallNode::serialize() const { - std::vector buffer; - - serialization::write_uint64(buffer, id); - serialization::write_string(buffer, name); - serialization::write_string(buffer, category); - serialization::write_uint64(buffer, start_time); - serialization::write_uint64(buffer, duration); - serialization::write_int(buffer, level); - serialization::write_uint64(buffer, parent_id); - - // Children - serialization::write_uint32(buffer, - static_cast(children.size())); - for (auto child_id : children) { - serialization::write_uint64(buffer, child_id); - } - - // Args - serialization::write_uint32(buffer, - static_cast(args.size())); - for (const auto& [key, value] : args) { - serialization::write_string(buffer, key); - serialization::write_string(buffer, value); - } - - return buffer; -} - -SerializableCallNode SerializableCallNode::deserialize(const char* data, - size_t& offset) { - SerializableCallNode node; - - node.id = serialization::read_uint64(data, offset); - node.name = serialization::read_string(data, offset); - node.category = serialization::read_string(data, offset); - node.start_time = serialization::read_uint64(data, offset); - node.duration = serialization::read_uint64(data, offset); - node.level = serialization::read_int(data, offset); - node.parent_id = 
serialization::read_uint64(data, offset); - - // Children - std::uint32_t num_children = serialization::read_uint32(data, offset); - node.children.reserve(num_children); - for (std::uint32_t i = 0; i < num_children; i++) { - node.children.push_back(serialization::read_uint64(data, offset)); - } - - // Args - std::uint32_t num_args = serialization::read_uint32(data, offset); - for (std::uint32_t i = 0; i < num_args; i++) { - std::string key = serialization::read_string(data, offset); - std::string value = serialization::read_string(data, offset); - node.args[key] = value; - } - - return node; -} - -// ============================================================================ -// SerializableProcessGraph Implementation -// ============================================================================ - -std::vector SerializableProcessGraph::serialize() const { - std::vector buffer; - - // Key - serialization::write_uint32(buffer, key.pid); - serialization::write_uint32(buffer, key.tid); - serialization::write_uint32(buffer, key.node_id); - - // Nodes - serialization::write_uint32(buffer, - static_cast(nodes.size())); - for (const auto& node : nodes) { - auto node_data = node.serialize(); - serialization::write_uint32( - buffer, static_cast(node_data.size())); - buffer.insert(buffer.end(), node_data.begin(), node_data.end()); - } - - // Root calls - serialization::write_uint32(buffer, - static_cast(root_calls.size())); - for (auto id : root_calls) { - serialization::write_uint64(buffer, id); - } - - // Call sequence - serialization::write_uint32( - buffer, static_cast(call_sequence.size())); - for (auto id : call_sequence) { - serialization::write_uint64(buffer, id); - } - - return buffer; -} - -SerializableProcessGraph SerializableProcessGraph::deserialize(const char* data, - size_t& offset) { - SerializableProcessGraph graph; - - // Key - graph.key.pid = serialization::read_uint32(data, offset); - graph.key.tid = serialization::read_uint32(data, offset); - 
graph.key.node_id = serialization::read_uint32(data, offset); - - // Nodes - std::uint32_t num_nodes = serialization::read_uint32(data, offset); - graph.nodes.reserve(num_nodes); - for (std::uint32_t i = 0; i < num_nodes; i++) { - std::uint32_t node_size = serialization::read_uint32(data, offset); - (void)node_size; // Not needed for deserialization - graph.nodes.push_back(SerializableCallNode::deserialize(data, offset)); - } - - // Root calls - std::uint32_t num_roots = serialization::read_uint32(data, offset); - graph.root_calls.reserve(num_roots); - for (std::uint32_t i = 0; i < num_roots; i++) { - graph.root_calls.push_back(serialization::read_uint64(data, offset)); - } - - // Call sequence - std::uint32_t num_seq = serialization::read_uint32(data, offset); - graph.call_sequence.reserve(num_seq); - for (std::uint32_t i = 0; i < num_seq; i++) { - graph.call_sequence.push_back(serialization::read_uint64(data, offset)); - } - - return graph; -} - -// ============================================================================ -// MPIFilteredTraceReader Implementation -// ============================================================================ - -MPIFilteredTraceReader::MPIFilteredTraceReader( - const std::set& allowed_pids) - : allowed_pids_(allowed_pids), processed_count_(0), filtered_count_(0) {} - -bool MPIFilteredTraceReader::read(const std::string& trace_file, - internal::CallTree& graph) { - // Check if it's a gzip file - ArchiveFormat format = FormatDetector::detect(trace_file); - - if (format == ArchiveFormat::GZIP) { - std::string index_path = - utilities::composites::dft::internal::determine_index_path( - trace_file, ""); - if (fs::exists(index_path)) { - return read_with_indexer(trace_file, index_path, graph); - } - } - - // Fall back to direct reading for plain text files - std::ifstream file(trace_file); - if (!file.is_open()) { - DFTRACER_UTILS_LOG_ERROR("Cannot open trace file: %s", - trace_file.c_str()); - return false; - } - - std::string 
line; - size_t line_count = 0; - - while (std::getline(file, line)) { - line_count++; - - // Skip brackets and empty lines - if (line.empty() || line == "[" || line == "]") { - continue; - } - - // Remove trailing comma - if (!line.empty() && line.back() == ',') { - line.pop_back(); - } - - yyjson_doc* doc = yyjson_read(line.c_str(), line.length(), 0); - if (!doc) { - continue; - } - - yyjson_val* root = yyjson_doc_get_root(doc); - if (!root) { - yyjson_doc_free(doc); - continue; - } - - // Check PID filter - yyjson_val* pid_val = yyjson_obj_get(root, "pid"); - if (pid_val) { - std::uint32_t pid = - static_cast(yyjson_get_uint(pid_val)); - - // Only process if PID is in our allowed set - if (allowed_pids_.find(pid) != allowed_pids_.end()) { - // Use the standard internal::TraceReader processing - internal::TraceReader reader; - if (reader.process_trace_line(line, graph)) { - processed_count_++; - } - } else { - filtered_count_++; - } - } - - yyjson_doc_free(doc); - } - - return true; -} - -/** - * Line processor for filtered reading with indexer - */ -class FilteredLineProcessor - : public utilities::reader::internal::LineProcessor { - public: - FilteredLineProcessor(const std::set& allowed_pids, - internal::CallTree& graph, - std::size_t& processed_count, - std::size_t& filtered_count) - : allowed_pids_(allowed_pids), - graph_(graph), - processed_count_(processed_count), - filtered_count_(filtered_count), - reader_() {} - - coro::CoroTask process(const char* data, - std::size_t length) override { - if (length == 0) { - co_return true; - } - - std::string line(data, length); - - // Skip brackets - if (line == "[" || line == "]") { - co_return true; - } - - // Remove trailing comma - if (!line.empty() && line.back() == ',') { - line.pop_back(); - } - - // Quick PID check - yyjson_doc* doc = yyjson_read(line.c_str(), line.length(), 0); - if (!doc) { - co_return true; - } - - yyjson_val* root = yyjson_doc_get_root(doc); - if (!root) { - yyjson_doc_free(doc); - 
co_return true; - } - - yyjson_val* pid_val = yyjson_obj_get(root, "pid"); - if (pid_val) { - std::uint32_t pid = - static_cast(yyjson_get_uint(pid_val)); - - if (allowed_pids_.find(pid) != allowed_pids_.end()) { - if (reader_.process_trace_line(line, graph_)) { - processed_count_++; - } - } else { - filtered_count_++; +namespace { + +bool is_trace_file(const std::string& path) { + return (path.size() >= 4 && + path.compare(path.size() - 4, 4, ".pfw") == 0) || + (path.size() >= 7 && + path.compare(path.size() - 7, 7, ".pfw.gz") == 0); +} + +coro::CoroTask scan_file_pids(std::string path, + std::set* out); +coro::CoroTask ingest_file(std::string path, internal::CallTree* tree, + const std::set* pids, + std::atomic* total); +coro::CoroTask build_hierarchy_one(internal::CallTree* tree, + internal::ProcessKey key); +coro::CoroTask serialize_one(const internal::CallTree* tree, + internal::ProcessKey key, + const std::string* hostname_hash, + std::vector* slice_buffers, + std::size_t index, std::uint64_t start_idx); + +coro::CoroTask scan_files_into( + CoroScope* child, const std::vector* paths, + std::vector>* per_file); +coro::CoroTask ingest_files_into( + CoroScope* child, const std::vector* paths, + const std::vector>* per_file, + const std::set* pids, std::atomic* total); +coro::CoroTask hierarchy_all( + CoroScope* child, internal::CallTree* tree, + const std::vector* keys); +coro::CoroTask serialize_all( + CoroScope* child, const internal::CallTree* tree, + const std::vector* keys, + const std::string* hostname_hash, std::vector* slice_buffers, + std::uint64_t rank_base, std::uint64_t stride); + +coro::CoroTask scan_file_pids(std::string path, + std::set* out) { + using utilities::reader::ReadConfig; + using utilities::reader::TraceReader; + using utilities::reader::TraceReaderConfig; + TraceReaderConfig cfg; + cfg.file_path = std::move(path); + cfg.auto_build_index = true; + TraceReader reader(std::move(cfg)); + auto gen = reader.read_json(ReadConfig{}); + while 
(auto opt = co_await gen.next()) { + auto pid = opt->parser->get_uint64("pid"); + if (pid) out->insert(static_cast(*pid)); + } +} + +coro::CoroTask scan_files_into( + CoroScope* child, const std::vector* paths, + std::vector>* per_file) { + for (std::size_t k = 0; k < paths->size(); ++k) { + std::string path = (*paths)[k]; + std::set* out = &(*per_file)[k]; + child->spawn([path = std::move(path), + out](CoroScope&) mutable -> coro::CoroTask { + co_await scan_file_pids(std::move(path), out); + }); + } + co_return; +} + +coro::CoroTask ingest_files_into( + CoroScope* child, const std::vector* paths, + const std::vector>* per_file, + const std::set* pids, std::atomic* total) { + for (std::size_t i = 0; i < paths->size(); ++i) { + std::string path = (*paths)[i]; + internal::CallTree* tree = (*per_file)[i].get(); + child->spawn([path = std::move(path), tree, pids, + total](CoroScope&) mutable -> coro::CoroTask { + co_await ingest_file(std::move(path), tree, pids, total); + }); + } + co_return; +} + +coro::CoroTask hierarchy_all( + CoroScope* child, internal::CallTree* tree, + const std::vector* keys) { + for (auto k : *keys) { + child->spawn([tree, k](CoroScope&) mutable -> coro::CoroTask { + co_await build_hierarchy_one(tree, k); + }); + } + co_return; +} + +coro::CoroTask serialize_all( + CoroScope* child, const internal::CallTree* tree, + const std::vector* keys, + const std::string* hostname_hash, std::vector* slice_buffers, + std::uint64_t rank_base, std::uint64_t stride) { + for (std::size_t i = 0; i < keys->size(); ++i) { + internal::ProcessKey k = (*keys)[i]; + std::uint64_t start_idx = rank_base + i * stride; + child->spawn([tree, k, start_idx, i, hostname_hash, slice_buffers]( + CoroScope&) mutable -> coro::CoroTask { + co_await serialize_one(tree, k, hostname_hash, slice_buffers, i, + start_idx); + }); + } + co_return; +} + +coro::CoroTask ingest_file(std::string path, internal::CallTree* tree, + const std::set* pids, + std::atomic* total) { + auto counts = + 
co_await internal::read_trace_file_async(std::move(path), tree, pids); + total->fetch_add(counts.processed, std::memory_order_relaxed); +} + +coro::CoroTask build_hierarchy_one(internal::CallTree* tree, + internal::ProcessKey key) { + tree->build_hierarchy_for_process(key); + co_return; +} + +void serialize_process_dfs(const internal::ProcessCallTree& pgraph, + const internal::ProcessKey& key, + internal::JsonSerializer& serializer, + std::uint64_t start_idx, std::string& out) { + char buffer[16384]; + std::uint64_t idx = start_idx; + for (std::uint64_t root_id : pgraph.root_calls) { + std::vector stack; + stack.push_back(root_id); + while (!stack.empty()) { + std::uint64_t nid = stack.back(); + stack.pop_back(); + auto it = pgraph.calls.find(nid); + if (it == pgraph.calls.end()) continue; + const auto& node = it->second; + std::size_t w = serializer.serialize_node( + buffer, static_cast(idx++), *node, key.pid, key.tid); + if (w > 0) { + out.append(buffer, w - 1); + out.append(",\n", 2); } + const auto& children = node->get_children(); + for (auto cit = children.rbegin(); cit != children.rend(); ++cit) + stack.push_back(*cit); } - - yyjson_doc_free(doc); - co_return true; - } - - private: - const std::set& allowed_pids_; - internal::CallTree& graph_; - std::size_t& processed_count_; - std::size_t& filtered_count_; - internal::TraceReader reader_; -}; - -bool MPIFilteredTraceReader::read_with_indexer(const std::string& trace_file, - const std::string& index_file, - internal::CallTree& graph) { - try { - auto reader = utilities::reader::internal::ReaderFactory::create( - trace_file, index_file); - if (!reader || !reader->is_valid()) { - DFTRACER_UTILS_LOG_ERROR("Failed to create reader for %s", - trace_file.c_str()); - return read(trace_file, graph); // Fallback - } - - FilteredLineProcessor processor(allowed_pids_, graph, processed_count_, - filtered_count_); - - std::size_t num_lines = reader->get_num_lines(); - if (num_lines > 0) { - 
reader->read_lines_with_processor(1, num_lines, processor); - } - - return true; - } catch (const std::exception& e) { - DFTRACER_UTILS_LOG_ERROR("Error reading with indexer: %s", e.what()); - return false; } } -bool MPIFilteredTraceReader::read_multiple( - const std::vector& trace_files, internal::CallTree& graph) { - for (const auto& file : trace_files) { - if (!read(file, graph)) { - return false; - } +coro::CoroTask serialize_one(const internal::CallTree* tree, + internal::ProcessKey key, + const std::string* hostname_hash, + std::vector* slice_buffers, + std::size_t index, std::uint64_t start_idx) { + auto* pgraph = const_cast(tree)->get(key); + if (pgraph) { + internal::JsonSerializer serializer; + char init[8]; + serializer.initialize(init, *hostname_hash); + (void)init; + serialize_process_dfs(*pgraph, key, serializer, start_idx, + (*slice_buffers)[index]); } - return true; + co_return; } -// ============================================================================ -// MPICallTreeBuilder Implementation -// ============================================================================ +} // namespace MPICallTreeBuilder::MPICallTreeBuilder(const MPICallTreeConfig& config) - : config_(config), - call_tree_(std::make_unique()), - trace_files_(), - indexers_(), - pid_index_map_(), - assigned_pids_(), - all_pids_(), - initialized_(false), - pids_discovered_(false), - graphs_built_(false), - graphs_gathered_(false) {} - -MPICallTreeBuilder::~MPICallTreeBuilder() { - if (initialized_) { - cleanup(); - } -} - -MPICallTreeBuilder::MPICallTreeBuilder(MPICallTreeBuilder&& other) noexcept - : config_(std::move(other.config_)), - call_tree_(std::move(other.call_tree_)), - trace_files_(std::move(other.trace_files_)), - indexers_(std::move(other.indexers_)), - pid_index_map_(std::move(other.pid_index_map_)), - assigned_pids_(std::move(other.assigned_pids_)), - all_pids_(std::move(other.all_pids_)), - initialized_(other.initialized_), - 
pids_discovered_(other.pids_discovered_), - graphs_built_(other.graphs_built_), - graphs_gathered_(other.graphs_gathered_) { - other.initialized_ = false; -} - -MPICallTreeBuilder& MPICallTreeBuilder::operator=( - MPICallTreeBuilder&& other) noexcept { - if (this != &other) { - if (initialized_) { - cleanup(); - } - config_ = std::move(other.config_); - call_tree_ = std::move(other.call_tree_); - trace_files_ = std::move(other.trace_files_); - indexers_ = std::move(other.indexers_); - pid_index_map_ = std::move(other.pid_index_map_); - assigned_pids_ = std::move(other.assigned_pids_); - all_pids_ = std::move(other.all_pids_); - initialized_ = other.initialized_; - pids_discovered_ = other.pids_discovered_; - graphs_built_ = other.graphs_built_; - graphs_gathered_ = other.graphs_gathered_; - other.initialized_ = false; - } - return *this; -} - -void MPICallTreeBuilder::initialize() { - if (initialized_) { - return; - } - - // Initialize MPI utilities singleton - mpi::MPIUtils::instance().initialize(); - + : config_(config), call_tree_(std::make_unique()) { + MPI_Comm_rank(MPI_COMM_WORLD, &rank_); + MPI_Comm_size(MPI_COMM_WORLD, &world_size_); call_tree_->initialize(); - initialized_ = true; - - if (mpi::MPIUtils::instance().is_root() && config_.verbose) { - DFTRACER_UTILS_LOG_INFO( - "MPICallTreeBuilder initialized with %d MPI ranks", - mpi::MPIUtils::instance().get_world_size()); - } } -void MPICallTreeBuilder::cleanup() { - if (!initialized_) { - return; - } - - call_tree_->cleanup(); - indexers_.clear(); - trace_files_.clear(); - pid_index_map_.clear(); - assigned_pids_.clear(); - all_pids_.clear(); +MPICallTreeBuilder::~MPICallTreeBuilder() = default; - initialized_ = false; - pids_discovered_ = false; - graphs_built_ = false; - graphs_gathered_ = false; -} +MPICallTreeBuilder::MPICallTreeBuilder(MPICallTreeBuilder&&) noexcept = default; +MPICallTreeBuilder& MPICallTreeBuilder::operator=( + MPICallTreeBuilder&&) noexcept = default; void 
MPICallTreeBuilder::add_trace_files( const std::vector& files) { - for (const auto& file : files) { - if (fs::exists(file) && fs::is_regular_file(file)) { - trace_files_.push_back(file); - } else if (mpi::MPIUtils::instance().is_root()) { - DFTRACER_UTILS_LOG_WARN("File not found: %s", file.c_str()); - } - } -} - -void MPICallTreeBuilder::add_trace_directory(const std::string& directory, - const std::string& pattern) { - if (!fs::exists(directory) || !fs::is_directory(directory)) { - if (mpi::MPIUtils::instance().is_root()) { - DFTRACER_UTILS_LOG_ERROR("Directory not found: %s", - directory.c_str()); - } - return; - } - - // Recursively find all matching files - for (const auto& entry : fs::recursive_directory_iterator(directory)) { - if (entry.is_regular_file()) { - std::string filename = entry.path().filename().string(); - - // Simple pattern matching for *.ext or *.part1.part2 patterns - bool matches = false; - if (pattern == "*") { - matches = true; - } else if (pattern.front() == '*') { - // *.ext or *.pfw.gz pattern - std::string suffix = pattern.substr(1); // Remove the leading * - matches = (filename.size() >= suffix.size() && - filename.substr(filename.size() - suffix.size()) == - suffix); - } else { - matches = (filename.find(pattern) != std::string::npos); - } - - if (matches) { - trace_files_.push_back(entry.path().string()); - } - } - } - + trace_files_.insert(trace_files_.end(), files.begin(), files.end()); std::sort(trace_files_.begin(), trace_files_.end()); - - if (mpi::MPIUtils::instance().is_root() && config_.verbose) { - DFTRACER_UTILS_LOG_INFO("Found %zu trace files in %s", - trace_files_.size(), directory.c_str()); - } } -void MPICallTreeBuilder::create_indexer(const std::string& trace_file) { - if (indexers_.find(trace_file) != indexers_.end()) { - return; - } - - ArchiveFormat format = FormatDetector::detect(trace_file); - if (format != ArchiveFormat::GZIP) { - return; // Only create indexers for gzip files - } - - std::string idx_file = 
trace_file + ".zindex"; - std::uint64_t ckpt_size = - config_.checkpoint_size > 0 - ? config_.checkpoint_size - : utilities::indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE; - - try { - auto indexer = utilities::indexer::internal::IndexerFactory::create( - trace_file, idx_file, ckpt_size, false); - if (indexer) { - // Build index if needed - if (indexer->need_rebuild()) { - if (mpi::MPIUtils::instance().is_root() && config_.verbose) { - DFTRACER_UTILS_LOG_INFO("Building index for %s", - trace_file.c_str()); - } - indexer->build(); - } - indexers_[trace_file] = std::move(indexer); - } - } catch (const std::exception& e) { - if (config_.verbose) { - DFTRACER_UTILS_LOG_WARN("Could not create indexer for %s: %s", - trace_file.c_str(), e.what()); - } +void MPICallTreeBuilder::add_trace_directory(const std::string& directory, + const std::string& /*pattern*/) { + std::vector files; + std::error_code ec; + for (const auto& entry : fs::directory_iterator(directory, ec)) { + if (ec) break; + if (!entry.is_regular_file(ec)) continue; + if (is_trace_file(entry.path().string())) + files.push_back(entry.path().string()); } + add_trace_files(files); } -std::set MPICallTreeBuilder::scan_file_for_pids( - const std::string& trace_file) { - std::set pids; - - // Check if it's a gzip file with an index - ArchiveFormat format = FormatDetector::detect(trace_file); - std::string index_path = - utilities::composites::dft::internal::determine_index_path(trace_file, - ""); - - if (format == ArchiveFormat::GZIP && fs::exists(index_path)) { - try { - auto reader = utilities::reader::internal::ReaderFactory::create( - trace_file, index_path); - if (reader && reader->is_valid()) { - // Read first N lines to discover PIDs - std::size_t num_lines = reader->get_num_lines(); - std::string content = reader->read_lines( - 1, std::min(num_lines, (std::size_t)100000)); - - std::istringstream iss(content); - std::string line; - while (std::getline(iss, line)) { - if (line.empty() || line == "[" || 
line == "]") continue; - if (!line.empty() && line.back() == ',') line.pop_back(); - - yyjson_doc* doc = - yyjson_read(line.c_str(), line.length(), 0); - if (doc) { - yyjson_val* root = yyjson_doc_get_root(doc); - if (root) { - yyjson_val* pid_val = yyjson_obj_get(root, "pid"); - if (pid_val) { - pids.insert(static_cast( - yyjson_get_uint(pid_val))); - } - } - yyjson_doc_free(doc); - } - } - - return pids; - } - } catch (const std::exception& e) { - // Fall through to direct reading - } - } - - // For gzip files without index, use gzopen - if (format == ArchiveFormat::GZIP) { - gzFile gz = gzopen(trace_file.c_str(), "rb"); - if (!gz) { - return pids; - } - - char buffer[65536]; - std::string current_line; - int line_count = 0; - - while (line_count < 100000) { - int bytes_read = gzread(gz, buffer, sizeof(buffer) - 1); - if (bytes_read <= 0) break; - buffer[bytes_read] = '\0'; - - current_line += buffer; +namespace { - // Process complete lines - size_t pos; - while ((pos = current_line.find('\n')) != std::string::npos) { - std::string line = current_line.substr(0, pos); - current_line = current_line.substr(pos + 1); - line_count++; - - if (line.empty() || line == "[" || line == "]") continue; - if (!line.empty() && line.back() == ',') line.pop_back(); - - yyjson_doc* doc = yyjson_read(line.c_str(), line.length(), 0); - if (doc) { - yyjson_val* root = yyjson_doc_get_root(doc); - if (root) { - yyjson_val* pid_val = yyjson_obj_get(root, "pid"); - if (pid_val) { - pids.insert(static_cast( - yyjson_get_uint(pid_val))); - } - } - yyjson_doc_free(doc); - } - - if (line_count >= 100000) break; - } - } - - gzclose(gz); - return pids; - } - - // Fall back to direct file reading for plain text - std::ifstream file(trace_file); - if (!file.is_open()) { - return pids; - } - - std::string line; - int line_count = 0; - while (std::getline(file, line) && line_count < 100000) { - line_count++; - if (line.empty() || line == "[" || line == "]") continue; - if (!line.empty() && 
line.back() == ',') line.pop_back(); - - yyjson_doc* doc = yyjson_read(line.c_str(), line.length(), 0); - if (doc) { - yyjson_val* root = yyjson_doc_get_root(doc); - if (root) { - yyjson_val* pid_val = yyjson_obj_get(root, "pid"); - if (pid_val) { - pids.insert( - static_cast(yyjson_get_uint(pid_val))); - } - } - yyjson_doc_free(doc); - } - } - - return pids; -} - -void MPICallTreeBuilder::distribute_pids() { - // Round-robin distribution using MPIUtils singleton - auto& mpi = mpi::MPIUtils::instance(); - assigned_pids_.clear(); - for (size_t i = static_cast(mpi.get_rank()); i < all_pids_.size(); - i += static_cast(mpi.get_world_size())) { - assigned_pids_.insert(all_pids_[i]); - } +struct DiscoverState { + std::vector my_paths; + std::vector> per_file; +}; - if (config_.verbose) { - DFTRACER_UTILS_LOG_DEBUG("[Rank %d] Assigned %zu PIDs", mpi.get_rank(), - assigned_pids_.size()); - } +coro::CoroTask discover_scan_phase(CoroScope* scope, DiscoverState* st) { + const std::vector* paths_ptr = &st->my_paths; + std::vector>* per_file_ptr = &st->per_file; + co_await scope->scope( + [paths_ptr, + per_file_ptr](CoroScope& child) mutable -> coro::CoroTask { + co_await scan_files_into(&child, paths_ptr, per_file_ptr); + }); } -std::map MPICallTreeBuilder::discover_pids() { - if (!initialized_) { - initialize(); - } - - auto& mpi = mpi::MPIUtils::instance(); - - if (mpi.is_root() && config_.verbose) { - DFTRACER_UTILS_LOG_INFO( - "Phase 1: Discovering PIDs from %zu trace files...", - trace_files_.size()); - } - - // Broadcast file list from rank 0 using MPIUtils - int num_files = static_cast(trace_files_.size()); - mpi.broadcast_int(num_files, 0); +} // namespace - if (!mpi.is_root()) { - trace_files_.resize(num_files); +coro::CoroTask MPICallTreeBuilder::discover_pids(CoroScope* scope) { + auto state = std::make_unique(); + for (std::size_t i = 0; i < trace_files_.size(); ++i) { + if (static_cast(i % static_cast(world_size_)) == + rank_) + 
state->my_paths.push_back(trace_files_[i]); } + state->per_file.resize(state->my_paths.size()); - for (int i = 0; i < num_files; i++) { - mpi.broadcast_string(trace_files_[i], 0); - } + co_await discover_scan_phase(scope, state.get()); - // Each rank scans files to discover PIDs std::set local_pids; - - for (const auto& trace_file : trace_files_) { - // Create indexer if needed - create_indexer(trace_file); - - // Scan for PIDs - auto file_pids = scan_file_for_pids(trace_file); - local_pids.insert(file_pids.begin(), file_pids.end()); - - // Store PID index info - for (auto pid : file_pids) { - if (pid_index_map_.find(pid) == pid_index_map_.end()) { - pid_index_map_[pid] = PIDIndexInfo(pid, 0, 0, 0, trace_file); - } - } - } - - // Gather all PIDs to rank 0 using MPIUtils - std::vector local_pid_vec(local_pids.begin(), - local_pids.end()); - std::vector all_pids_gathered; - std::vector recv_counts; - std::vector displacements; - - mpi.gatherv_uint32(local_pid_vec, all_pids_gathered, recv_counts, - displacements, 0); - - // Remove duplicates and sort on rank 0 - if (mpi.is_root()) { - std::set unique_pids(all_pids_gathered.begin(), - all_pids_gathered.end()); - all_pids_.assign(unique_pids.begin(), unique_pids.end()); - std::sort(all_pids_.begin(), all_pids_.end()); - - if (config_.verbose) { - DFTRACER_UTILS_LOG_INFO("Discovered %zu unique PIDs", - all_pids_.size()); - } - } - - // Broadcast unique PIDs to all ranks - mpi.broadcast_uint32_vector(all_pids_, 0); - - // Distribute PIDs across ranks - distribute_pids(); - - mpi.barrier(); - - all_pids_.assign(local_pids.begin(), local_pids.end()); - assigned_pids_ = local_pids; - - pids_discovered_ = true; - return pid_index_map_; -} - -bool MPICallTreeBuilder::read_traces_for_pids( - const std::vector& files, - const std::set& pids) { - MPIFilteredTraceReader reader(pids); - return reader.read_multiple(files, *call_tree_); -} - -MPICallTreeResult MPICallTreeBuilder::build() { - MPICallTreeResult result; - auto& mpi = 
mpi::MPIUtils::instance(); - - if (!pids_discovered_) { - discover_pids(); - } - - if (assigned_pids_.empty()) { - if (config_.verbose) { - DFTRACER_UTILS_LOG_DEBUG( - "[Rank %d] No PIDs assigned, skipping build", mpi.get_rank()); - } - result.success = true; - graphs_built_ = true; - return result; - } - - if (mpi.is_root() && config_.verbose) { - DFTRACER_UTILS_LOG_INFO("%s", "Phase 2: Building call graphs..."); - } - - mpi.barrier(); - - auto start_time = std::chrono::high_resolution_clock::now(); - - // Use pipeline for parallel trace reading - if (config_.num_threads > 0) { - // For now, use simple sequential processing - // Pipeline can be expanded for more complex workflows - read_traces_for_pids(trace_files_, assigned_pids_); - } else { - // Sequential processing - read_traces_for_pids(trace_files_, assigned_pids_); - } - - // Build hierarchy - call_tree_->build_hierarchy(); - - auto end_time = std::chrono::high_resolution_clock::now(); - std::chrono::duration elapsed = end_time - start_time; - - result.elapsed_time_s = elapsed.count(); - result.local_pids = assigned_pids_.size(); - result.local_events = 0; - - // Count events - for (const auto& key : call_tree_->keys()) { - auto* graph = call_tree_->get(key); - if (graph) { - result.local_events += graph->calls.size(); - } - } - - // Gather statistics using MPIUtils - mpi.reduce_sum_size_t(result.local_pids, result.total_pids, 0); - mpi.reduce_sum_size_t(result.local_events, result.total_events, 0); - - double max_time = 0; - mpi.reduce_max_double(result.elapsed_time_s, max_time, 0); - result.elapsed_time_s = max_time; - - result.success = true; - graphs_built_ = true; - - if (mpi.is_root() && config_.verbose) { - DFTRACER_UTILS_LOG_INFO("Build completed in %.2f seconds", - result.elapsed_time_s); - DFTRACER_UTILS_LOG_INFO("Total PIDs: %zu", result.total_pids); - DFTRACER_UTILS_LOG_INFO("Total events: %zu", result.total_events); + for (auto& s : state->per_file) local_pids.insert(s.begin(), s.end()); + 
state.reset(); + + std::vector local_vec(local_pids.begin(), local_pids.end()); + int my_bytes = static_cast(local_vec.size() * sizeof(std::uint32_t)); + std::vector rank_bytes(world_size_, 0); + MPI_Allgather(&my_bytes, 1, MPI_INT, rank_bytes.data(), 1, MPI_INT, + MPI_COMM_WORLD); + std::vector displs(world_size_, 0); + int total = 0; + for (int r = 0; r < world_size_; ++r) { + displs[r] = total; + total += rank_bytes[r]; + } + std::vector gathered(total); + MPI_Allgatherv(local_vec.data(), my_bytes, MPI_CHAR, gathered.data(), + rank_bytes.data(), displs.data(), MPI_CHAR, MPI_COMM_WORLD); + for (int r = 0; r < world_size_; ++r) { + const auto* p = + reinterpret_cast(gathered.data() + displs[r]); + const std::size_t n = rank_bytes[r] / sizeof(std::uint32_t); + for (std::size_t i = 0; i < n; ++i) all_pids_.insert(p[i]); + } + + std::vector sorted_pids(all_pids_.begin(), all_pids_.end()); + for (std::size_t i = static_cast(rank_); + i < sorted_pids.size(); i += static_cast(world_size_)) { + assigned_pids_.insert(sorted_pids[i]); + } + + if (config_.verbose && rank_ == 0) { + DFTRACER_UTILS_LOG_INFO( + "[rank 0] discovered %zu unique pids across %zu " + "files", + all_pids_.size(), trace_files_.size()); } - - return result; + co_return true; } -SerializableProcessGraph MPICallTreeBuilder::convert_to_serializable( - const internal::ProcessCallTree& graph) const { - SerializableProcessGraph result; - result.key = graph.key; - result.root_calls = graph.root_calls; - result.call_sequence = graph.call_sequence; +coro::CoroTask MPICallTreeBuilder::build(CoroScope* scope) { + if (assigned_pids_.empty()) co_return true; - for (const auto& [id, node] : graph.calls) { - SerializableCallNode snode; - snode.id = node->get_id(); - snode.name = node->get_name(); - snode.category = node->get_category(); - snode.start_time = node->get_start_time(); - snode.duration = node->get_duration(); - snode.level = node->get_level(); - snode.parent_id = node->get_parent_id(); - snode.children = 
node->get_children(); - snode.args = node->get_args(); - result.nodes.push_back(std::move(snode)); + const std::size_t n = trace_files_.size(); + std::vector> per_file; + per_file.reserve(n); + for (std::size_t i = 0; i < n; ++i) { + per_file.push_back(std::make_unique()); + per_file.back()->initialize(); } - return result; -} + const std::set* pids_ptr = &assigned_pids_; + std::atomic total_events{0}; + std::atomic* total_ptr = &total_events; -void MPICallTreeBuilder::merge_from_serializable( - const SerializableProcessGraph& serializable) { - internal::ProcessCallTree& graph = (*call_tree_)[serializable.key]; - graph.key = serializable.key; - graph.root_calls = serializable.root_calls; - graph.call_sequence = serializable.call_sequence; + const std::vector* paths_ptr = &trace_files_; + const std::vector>* per_file_ptr = + &per_file; - for (const auto& snode : serializable.nodes) { - auto node = call_tree_->get_factory().create_node( - snode.id, snode.name, snode.category, snode.start_time, - snode.duration, snode.level, snode.args); - node->set_parent_id(snode.parent_id); - for (auto child_id : snode.children) { - node->add_child(child_id); - } - graph.calls[snode.id] = node; - } -} + co_await scope->scope( + [paths_ptr, per_file_ptr, pids_ptr, + total_ptr](CoroScope& child) mutable -> coro::CoroTask { + co_await ingest_files_into(&child, paths_ptr, per_file_ptr, + pids_ptr, total_ptr); + }); -bool MPICallTreeBuilder::alltoall_graphs() { - auto& mpi = mpi::MPIUtils::instance(); + for (auto& t : per_file) + if (t) call_tree_->merge_from(std::move(*t)); + my_process_keys_ = call_tree_->keys(); - // Serialize local graphs - std::vector local_graphs; - for (const auto& key : call_tree_->keys()) { - auto* graph = call_tree_->get(key); - if (graph) { - local_graphs.push_back(convert_to_serializable(*graph)); - } + if (config_.verbose) { + std::printf("[rank %d/%d] build done: %zu events, %zu processes\n", + rank_, world_size_, total_events.load(), + 
my_process_keys_.size()); + std::fflush(stdout); + } + co_return true; +} + +coro::CoroTask MPICallTreeBuilder::hierarchy(CoroScope* scope) { + internal::CallTree* tree = call_tree_.get(); + const std::vector* keys_ptr = &my_process_keys_; + co_await scope->scope( + [tree, keys_ptr](CoroScope& child) mutable -> coro::CoroTask { + co_await hierarchy_all(&child, tree, keys_ptr); + }); + co_return true; +} + +coro::CoroTask MPICallTreeBuilder::write(CoroScope* scope, + std::string /*output_path*/, + std::string staging_dir, + bool gzip) { + char suffix[64]; + std::snprintf(suffix, sizeof(suffix), "/rank_%05d.pfw%s", rank_, + gzip ? ".gz" : ""); + my_shard_path_ = staging_dir + suffix; + if (rank_ == 0) { + std::error_code ec; + fs::create_directories(staging_dir, ec); + } + MPI_Barrier(MPI_COMM_WORLD); + + const std::size_t n = my_process_keys_.size(); + std::vector slice_buffers(n); + static constexpr std::uint64_t IDX_STRIDE = 1ull << 20; + + char hostname[256] = {}; + gethostname(hostname, sizeof(hostname) - 1); + std::string hostname_hash(hostname); + + std::vector* slice_buffers_ptr = &slice_buffers; + const std::string* hostname_hash_ptr = &hostname_hash; + const internal::CallTree* tree = call_tree_.get(); + const std::uint64_t rank_base = static_cast(rank_) << 40; + const std::vector* keys_ptr = &my_process_keys_; + + co_await scope->scope( + [tree, keys_ptr, hostname_hash_ptr, slice_buffers_ptr, + rank_base](CoroScope& child) mutable -> coro::CoroTask { + co_await serialize_all(&child, tree, keys_ptr, hostname_hash_ptr, + slice_buffers_ptr, rank_base, IDX_STRIDE); + }); + + std::string header; + if (rank_ == 0) { + header.append("[\n", 2); + internal::JsonSerializer serializer; + char init[8]; + serializer.initialize(init, hostname_hash); + (void)init; + char buf[8192]; + std::time_t now = std::time(nullptr); + char ts[64]; + std::strftime(ts, sizeof(ts), "%Y-%m-%d %H:%M:%S", + std::localtime(&now)); + std::size_t w = serializer.serialize_metadata(buf, 
"timestamp", ts, "M", + 0, 0, true); + if (w > 0) header.append(buf, w - 1); + header.append(",\n", 2); + w = serializer.serialize_metadata(buf, "format", "call_tree", "M", 0, 0, + true); + if (w > 0) header.append(buf, w - 1); + header.append(",\n", 2); + } + + utilities::fileio::parallel::WriterConfig wc; + wc.layout = utilities::fileio::parallel::FileLayout::SHARDED; + wc.gzip = gzip; + auto writer = utilities::fileio::parallel::make_writer(wc); + + const std::size_t total_workers = (rank_ == 0 ? 1 : 0) + n; + if (total_workers == 0) { + FILE* f = std::fopen(my_shard_path_.c_str(), "wb"); + if (f) std::fclose(f); + co_return true; } - // Serialize to bytes - std::vector send_buffer; - serialization::write_uint32( - send_buffer, static_cast(local_graphs.size())); - for (const auto& graph : local_graphs) { - auto data = graph.serialize(); - serialization::write_uint32(send_buffer, - static_cast(data.size())); - send_buffer.insert(send_buffer.end(), data.begin(), data.end()); + if (co_await writer->open(my_shard_path_, total_workers, gzip, scope) != + 0) { + DFTRACER_UTILS_LOG_ERROR("[rank %d] failed to open writer: %s", rank_, + my_shard_path_.c_str()); + co_return false; } - // Use MPIUtils for allgatherv - std::vector recv_buffer; - std::vector recv_sizes; - std::vector displacements; - - mpi.allgatherv_char(send_buffer, recv_buffer, recv_sizes, displacements); - - // Deserialize graphs from other ranks - int world_size = mpi.get_world_size(); - int rank = mpi.get_rank(); - for (int r = 0; r < world_size; r++) { - if (r == rank) continue; // Skip our own data - - size_t offset = static_cast(displacements[r]); - std::uint32_t num_graphs = - serialization::read_uint32(recv_buffer.data(), offset); - - for (std::uint32_t i = 0; i < num_graphs; i++) { - std::uint32_t graph_size = - serialization::read_uint32(recv_buffer.data(), offset); - (void)graph_size; - auto graph = SerializableProcessGraph::deserialize( - recv_buffer.data(), offset); - 
merge_from_serializable(graph); + std::size_t widx = 0; + if (rank_ == 0) { + if (co_await writer->write_chunk( + widx++, ByteView(header.data(), header.size())) != 0) { + co_return false; } } - return true; -} - -bool MPICallTreeBuilder::gather() { - if (!graphs_built_) { - return false; - } - - auto& mpi = mpi::MPIUtils::instance(); - - if (mpi.is_root() && config_.verbose) { - DFTRACER_UTILS_LOG_INFO( - "%s", "Phase 3: Gathering call graphs (all-to-all)..."); - } - - mpi.barrier(); - - bool success = alltoall_graphs(); - - mpi.barrier(); - - graphs_gathered_ = success; - - if (mpi.is_root() && config_.verbose) { - DFTRACER_UTILS_LOG_INFO("Gather completed. Total graphs: %zu", - call_tree_->size()); - } - - return success; -} - -bool MPICallTreeBuilder::save(const std::string& filename) const { - // Only rank 0 saves (all ranks have same data after gather) - if (!mpi::MPIUtils::instance().is_root()) { - return true; - } - - std::ofstream file(filename, std::ios::binary); - if (!file.is_open()) { - DFTRACER_UTILS_LOG_ERROR("Cannot open output file: %s", - filename.c_str()); - return false; - } - - // Write header - CallGraphFileHeader header; - header.num_process_graphs = static_cast(call_tree_->size()); - - // Count total events - std::uint64_t total_events = 0; - for (const auto& key : call_tree_->keys()) { - auto* graph = call_tree_->get(key); - if (graph) { - total_events += graph->calls.size(); + for (std::size_t i = 0; i < n; ++i) { + std::string& b = slice_buffers[i]; + const bool last_overall = (i + 1 == n) && (rank_ == world_size_ - 1); + if (last_overall) { + if (b.size() >= 2 && b[b.size() - 2] == ',' && + b[b.size() - 1] == '\n') { + b.resize(b.size() - 2); + b.append("\n]\n", 3); + } else { + b.append("]\n", 2); + } } - } - header.total_events = total_events; - header.data_offset = sizeof(CallGraphFileHeader); - - file.write(reinterpret_cast(&header), sizeof(header)); - - // Write each process graph - for (const auto& key : call_tree_->keys()) { - 
auto* graph = call_tree_->get(key); - if (graph) { - auto serializable = - const_cast(this)->convert_to_serializable( - *graph); - auto data = serializable.serialize(); - std::uint32_t size = static_cast(data.size()); - file.write(reinterpret_cast(&size), sizeof(size)); - file.write(data.data(), data.size()); + if (co_await writer->write_chunk(widx++, + ByteView(b.data(), b.size())) != 0) { + co_return false; } } - if (config_.verbose) { - DFTRACER_UTILS_LOG_INFO("Saved call graph to %s", filename.c_str()); - } - - return true; -} - -std::unique_ptr MPICallTreeBuilder::load( - const std::string& filename) { - std::ifstream file(filename, std::ios::binary); - if (!file.is_open()) { - DFTRACER_UTILS_LOG_ERROR("Cannot open file: %s", filename.c_str()); - return nullptr; - } - - // Read header - CallGraphFileHeader header; - file.read(reinterpret_cast(&header), sizeof(header)); - - if (!header.is_valid()) { - DFTRACER_UTILS_LOG_ERROR("%s", "Invalid call graph file format"); - return nullptr; - } - - auto call_graph = std::make_unique(); - call_graph->initialize(); + if (co_await writer->close() != 0) co_return false; - // Read each process graph - for (std::uint32_t i = 0; i < header.num_process_graphs; i++) { - std::uint32_t size; - file.read(reinterpret_cast(&size), sizeof(size)); - - std::vector data(size); - file.read(data.data(), size); - - size_t offset = 0; - auto serializable = - SerializableProcessGraph::deserialize(data.data(), offset); - - // Merge into call graph - internal::ProcessCallTree& graph = (*call_graph)[serializable.key]; - graph.key = serializable.key; - graph.root_calls = serializable.root_calls; - graph.call_sequence = serializable.call_sequence; - - for (const auto& snode : serializable.nodes) { - auto node = call_graph->get_factory().create_node( - snode.id, snode.name, snode.category, snode.start_time, - snode.duration, snode.level, snode.args); - node->set_parent_id(snode.parent_id); - for (auto child_id : snode.children) { - 
node->add_child(child_id); - } - graph.calls[snode.id] = node; + auto shards = writer->output_paths(); + if (shards.size() > 1) { + if (co_await utilities::fileio::parallel::merge_shards(my_shard_path_, + shards) != 0) { + DFTRACER_UTILS_LOG_ERROR("[rank %d] local merge failed", rank_); + co_return false; } } - - return call_graph; + co_return true; } -void MPICallTreeBuilder::print_summary() const { - auto& mpi = mpi::MPIUtils::instance(); - std::size_t local_graphs = call_tree_->size(); - std::size_t local_events = 0; +coro::CoroTask MPICallTreeBuilder::merge(std::string output_path, + std::string staging_dir, + bool gzip, bool keep_staging) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank_ != 0) co_return true; - for (const auto& key : call_tree_->keys()) { - auto* graph = call_tree_->get(key); - if (graph) { - local_events += graph->calls.size(); - } + std::vector shards; + shards.reserve(world_size_); + for (int r = 0; r < world_size_; ++r) { + char rs[64]; + std::snprintf(rs, sizeof(rs), "/rank_%05d.pfw%s", r, gzip ? 
".gz" : ""); + shards.emplace_back(staging_dir + rs); } - - std::size_t total_graphs = 0; - std::size_t total_events = 0; - - // Use MPIUtils for reduce operations - const_cast(mpi).reduce_sum_size_t(local_graphs, - total_graphs, 0); - const_cast(mpi).reduce_sum_size_t(local_events, - total_events, 0); - - if (mpi.is_root()) { - DFTRACER_UTILS_LOG_INFO( - "%s", "\n============ MPI Call Graph Summary ============"); - DFTRACER_UTILS_LOG_INFO("MPI Ranks: %d", mpi.get_world_size()); - DFTRACER_UTILS_LOG_INFO("Total PIDs: %zu", all_pids_.size()); - DFTRACER_UTILS_LOG_INFO("Total process graphs: %zu", total_graphs); - DFTRACER_UTILS_LOG_INFO("Total events: %zu", total_events); - DFTRACER_UTILS_LOG_INFO( - "%s", "================================================\n"); + if (co_await utilities::fileio::parallel::merge_shards(output_path, + shards) != 0) { + DFTRACER_UTILS_LOG_ERROR("merge_shards failed for %s", + output_path.c_str()); + co_return false; } - - // Each rank prints its summary - int world_size = mpi.get_world_size(); - int rank = mpi.get_rank(); - for (int r = 0; r < world_size; r++) { - if (r == rank) { - DFTRACER_UTILS_LOG_INFO("[Rank %d] Local Summary:", rank); - DFTRACER_UTILS_LOG_INFO(" Assigned PIDs: %zu", - assigned_pids_.size()); - DFTRACER_UTILS_LOG_INFO(" Process graphs: %zu", local_graphs); - DFTRACER_UTILS_LOG_INFO(" Events: %zu", local_events); - } - const_cast(mpi).barrier(); + if (!keep_staging) { + std::error_code ec; + fs::remove_all(staging_dir, ec); } + co_return true; } } // namespace dftracer::utils::call_tree diff --git a/src/dftracer/utils/utilities/call_tree/call_tree_save_arrow.cpp b/src/dftracer/utils/utilities/call_tree/call_tree_save_arrow.cpp new file mode 100644 index 00000000..f3a4be00 --- /dev/null +++ b/src/dftracer/utils/utilities/call_tree/call_tree_save_arrow.cpp @@ -0,0 +1,391 @@ +// Arrow IPC save/load for in-memory CallTree. 
Produces a .arrow file with a +// single record batch (zstd buffer-level compression by default) consumable +// by pyarrow / polars / nanoarrow / dfanalyzer. +// +// Schema (one row per CallTreeNode, rows grouped by ProcessKey and ordered +// by call_sequence within each group): +// +// pid uint64 +// tid uint64 +// node_pkid uint64 // ProcessKey.node_id +// id uint64 // node id +// name utf8 // ZSTD compresses repeated values well +// category utf8 +// start_time uint64 +// duration uint64 +// level int64 +// parent_id uint64 +// is_root bool // node is in ProcessCallTree::root_calls +// seq_idx int64 // position in ProcessCallTree::call_sequence +// children utf8 // ',' joined child ids +// arg_keys utf8 // '\x1f' (US sep) joined keys +// arg_values utf8 // '\x1f' joined stringified values + +#include +#include +#include +#include +#include +#include +#include + +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#endif + +namespace dftracer::utils::call_tree { + +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + +namespace { + +using utilities::common::arrow::ArrowExportResult; +using utilities::common::arrow::ColumnSpec; +using utilities::common::arrow::ColumnType; +using utilities::common::arrow::IpcCompression; +using utilities::common::arrow::IpcReader; +using utilities::common::arrow::IpcWriter; +using utilities::common::arrow::RecordBatchBuilder; +using utilities::composites::dft::ArgsValueProxy; + +constexpr char ARG_SEP = '\x1f'; + +void join_uint64(std::string& out, const std::vector& v) { + out.clear(); + bool first = true; + for (auto x : v) { + if (!first) out.push_back(','); + out.append(std::to_string(x)); + first = false; + } +} + +std::string args_value_to_string(ArgsValueProxy v) { + if (v.is_string()) return v.get(); + if (v.is_uint()) return std::to_string(v.get()); + if (v.is_int()) return std::to_string(v.get()); + if (v.is_number()) return 
std::to_string(v.get()); + if (v.is_bool()) return v.get() ? "true" : "false"; + return {}; +} + +dftracer::utils::StringIntern& arrow_load_intern() { + static dftracer::utils::StringIntern instance; + return instance; +} + +// Split `s` on `delim`; preserves empty tokens. +std::vector split_view(std::string_view s, char delim) { + std::vector out; + if (s.empty()) return out; + std::size_t start = 0; + for (std::size_t i = 0; i < s.size(); ++i) { + if (s[i] == delim) { + out.emplace_back(s.data() + start, i - start); + start = i + 1; + } + } + out.emplace_back(s.data() + start, s.size() - start); + return out; +} + +std::uint64_t parse_u64(std::string_view s) { + std::uint64_t v = 0; + for (char c : s) { + if (c < '0' || c > '9') break; + v = v * 10 + static_cast(c - '0'); + } + return v; +} + +// Locate the index of column `name` in a flat record-batch schema. +int find_column(ArrowSchema* schema, const char* name) { + if (!schema || !schema->children) return -1; + for (int i = 0; i < schema->n_children; ++i) { + if (schema->children[i] && schema->children[i]->name && + std::strcmp(schema->children[i]->name, name) == 0) { + return i; + } + } + return -1; +} + +// Pull a string out of an Arrow string-view: plain utf8 returns the +// underlying buffer; dictionary resolves the dictionary entry. 
+std::string_view get_string(const ArrowArrayView* view, std::int64_t row) { + if (view->dictionary != nullptr) { + const std::int64_t idx = ArrowArrayViewGetIntUnsafe(view, row); + auto s = ArrowArrayViewGetStringUnsafe(view->dictionary, idx); + return std::string_view(s.data, static_cast(s.size_bytes)); + } + auto s = ArrowArrayViewGetStringUnsafe(view, row); + return std::string_view(s.data, static_cast(s.size_bytes)); +} + +} // namespace + +coro::CoroTask save_arrow(CoroScope* /*scope*/, + const internal::CallTree& tree, + std::string output_path) { + RecordBatchBuilder builder; + builder.declare_schema({ + {"pid", ColumnType::UINT64}, + {"tid", ColumnType::UINT64}, + {"node_pkid", ColumnType::UINT64}, + {"id", ColumnType::UINT64}, + {"name", ColumnType::STRING}, + {"category", ColumnType::STRING}, + {"start_time", ColumnType::UINT64}, + {"duration", ColumnType::UINT64}, + {"level", ColumnType::INT64}, + {"parent_id", ColumnType::UINT64}, + {"is_root", ColumnType::BOOL}, + {"seq_idx", ColumnType::INT64}, + {"children", ColumnType::STRING}, + {"arg_keys", ColumnType::STRING}, + {"arg_values", ColumnType::STRING}, + }); + + auto keys = const_cast(tree).keys(); + std::string children_join; + std::string arg_keys_join; + std::string arg_values_join; + + for (const auto& key : keys) { + auto* graph = const_cast(tree).get(key); + if (!graph) continue; + + std::unordered_set root_set(graph->root_calls.begin(), + graph->root_calls.end()); + + std::int64_t seq_idx = 0; + for (std::uint64_t nid : graph->call_sequence) { + auto it = graph->calls.find(nid); + if (it == graph->calls.end()) continue; + const auto& node = it->second; + if (!node) continue; + + builder.append_uint64(0, key.pid); + builder.append_uint64(1, key.tid); + builder.append_uint64(2, key.node_id); + builder.append_uint64(3, node->get_id()); + builder.append_string(4, node->get_name()); + builder.append_string(5, node->get_category()); + builder.append_uint64(6, node->get_start_time()); + 
builder.append_uint64(7, node->get_duration()); + builder.append_int64(8, + static_cast(node->get_level())); + builder.append_uint64(9, node->get_parent_id()); + builder.append_bool(10, root_set.count(nid) > 0); + builder.append_int64(11, seq_idx++); + + join_uint64(children_join, node->get_children()); + builder.append_string(12, children_join); + + arg_keys_join.clear(); + arg_values_join.clear(); + bool first = true; + node->get_args().for_each_member( + [&](std::string_view k, ArgsValueProxy v) { + if (!first) { + arg_keys_join.push_back(ARG_SEP); + arg_values_join.push_back(ARG_SEP); + } + arg_keys_join.append(k); + arg_values_join.append(args_value_to_string(v)); + first = false; + }); + builder.append_string(13, arg_keys_join); + builder.append_string(14, arg_values_join); + + builder.end_row(); + } + } + + if (builder.num_rows() == 0) { + DFTRACER_UTILS_LOG_WARN("save_arrow: tree is empty, writing empty %s", + output_path.c_str()); + } + + auto batch = builder.finish(); + IpcWriter writer; + if (co_await writer.open(output_path, IpcCompression::ZSTD) != 0) { + DFTRACER_UTILS_LOG_ERROR("save_arrow: open failed: %s", + output_path.c_str()); + co_return false; + } + if (co_await writer.write_batch(batch) != 0) { + DFTRACER_UTILS_LOG_ERROR("%s", "save_arrow: write_batch failed"); + co_return false; + } + if (co_await writer.close() != 0) { + DFTRACER_UTILS_LOG_ERROR("%s", "save_arrow: close failed"); + co_return false; + } + co_return true; +} + +coro::CoroTask> load_arrow( + CoroScope* /*scope*/, std::string input_path) { + IpcReader reader; + if (reader.open(input_path) != 0) { + DFTRACER_UTILS_LOG_ERROR("load_arrow: open failed: %s", + input_path.c_str()); + co_return nullptr; + } + + auto tree = std::make_unique(); + tree->initialize(); + + using utilities::composites::dft::ArgsMap; + + auto process_batch = [&](ArrowExportResult& batch) -> int { + ArrowSchema* schema = batch.get_schema(); + ArrowArray* array = batch.get_array(); + if (!schema || !array) 
return -1; + + const int c_pid = find_column(schema, "pid"); + const int c_tid = find_column(schema, "tid"); + const int c_node_pkid = find_column(schema, "node_pkid"); + const int c_id = find_column(schema, "id"); + const int c_name = find_column(schema, "name"); + const int c_cat = find_column(schema, "category"); + const int c_start = find_column(schema, "start_time"); + const int c_dur = find_column(schema, "duration"); + const int c_level = find_column(schema, "level"); + const int c_parent = find_column(schema, "parent_id"); + const int c_isroot = find_column(schema, "is_root"); + const int c_seq = find_column(schema, "seq_idx"); + const int c_children = find_column(schema, "children"); + const int c_argk = find_column(schema, "arg_keys"); + const int c_argv = find_column(schema, "arg_values"); + if (c_pid < 0 || c_tid < 0 || c_node_pkid < 0 || c_id < 0 || + c_name < 0 || c_cat < 0 || c_start < 0 || c_dur < 0 || + c_level < 0 || c_parent < 0 || c_isroot < 0 || c_seq < 0 || + c_children < 0 || c_argk < 0 || c_argv < 0) { + DFTRACER_UTILS_LOG_ERROR("%s", + "load_arrow: schema missing required " + "columns"); + return -1; + } + + ArrowArrayView view; + ArrowError err; + if (ArrowArrayViewInitFromSchema(&view, schema, &err) != NANOARROW_OK) { + DFTRACER_UTILS_LOG_ERROR("load_arrow: InitFromSchema: %s", + err.message); + return -1; + } + struct ViewGuard { + ArrowArrayView* v; + ~ViewGuard() { ArrowArrayViewReset(v); } + } guard{&view}; + if (ArrowArrayViewSetArray(&view, array, &err) != NANOARROW_OK) { + DFTRACER_UTILS_LOG_ERROR("load_arrow: SetArray: %s", err.message); + return -1; + } + + const std::int64_t n = array->length; + for (std::int64_t i = 0; i < n; ++i) { + const std::uint64_t pid = + ArrowArrayViewGetUIntUnsafe(view.children[c_pid], i); + const std::uint64_t tid = + ArrowArrayViewGetUIntUnsafe(view.children[c_tid], i); + const std::uint64_t node_pkid = + ArrowArrayViewGetUIntUnsafe(view.children[c_node_pkid], i); + const std::uint64_t id = + 
ArrowArrayViewGetUIntUnsafe(view.children[c_id], i); + const std::uint64_t start = + ArrowArrayViewGetUIntUnsafe(view.children[c_start], i); + const std::uint64_t dur = + ArrowArrayViewGetUIntUnsafe(view.children[c_dur], i); + const std::int64_t level = + ArrowArrayViewGetIntUnsafe(view.children[c_level], i); + const std::uint64_t parent = + ArrowArrayViewGetUIntUnsafe(view.children[c_parent], i); + const bool is_root = + ArrowArrayViewGetIntUnsafe(view.children[c_isroot], i) != 0; + + auto name_sv = get_string(view.children[c_name], i); + auto cat_sv = get_string(view.children[c_cat], i); + auto children_sv = get_string(view.children[c_children], i); + auto argk_sv = get_string(view.children[c_argk], i); + auto argv_sv = get_string(view.children[c_argv], i); + + // Args round-trip as strings; type info is lost vs the typed + // custom-binary format. + ArgsMap args; + auto keys_tok = split_view(argk_sv, ARG_SEP); + auto vals_tok = split_view(argv_sv, ARG_SEP); + const std::size_t n_args = + std::min(keys_tok.size(), vals_tok.size()); + if (n_args > 0) args.set_valid(true); + for (std::size_t k = 0; k < n_args; ++k) { + args.insert(keys_tok[k], std::string(vals_tok[k])); + } + + auto name_interned = arrow_load_intern().intern(name_sv); + auto cat_interned = arrow_load_intern().intern(cat_sv); + + auto node = tree->get_factory().create_node( + id, name_interned, cat_interned, start, dur, + static_cast(level), std::move(args)); + node->set_parent_id(parent); + for (auto child_sv : split_view(children_sv, ',')) { + if (child_sv.empty()) continue; + node->add_child(parse_u64(child_sv)); + } + + internal::ProcessKey key(static_cast(pid), + static_cast(tid), + static_cast(node_pkid)); + tree->add_call(key, node); + + // add_call already appended id to call_sequence (in row order, + // which is the saved call_sequence order). Only push the root + // flag here. 
+ auto* pgraph = tree->get(key); + if (pgraph && is_root) pgraph->root_calls.push_back(id); + } + return 0; + }; + + if (reader.for_each_batch(process_batch) != 0) { + DFTRACER_UTILS_LOG_ERROR("%s", "load_arrow: batch iteration failed"); + co_return nullptr; + } + co_return tree; +} + +#else // !DFTRACER_UTILS_ENABLE_ARROW_IPC + +coro::CoroTask save_arrow(CoroScope* /*scope*/, + const internal::CallTree& /*tree*/, + std::string /*output_path*/) { + DFTRACER_UTILS_LOG_ERROR("%s", + "save_arrow: build without DFTRACER_UTILS_ENABLE_" + "ARROW_IPC, cannot write Arrow IPC"); + co_return false; +} + +coro::CoroTask> load_arrow( + CoroScope* /*scope*/, std::string /*input_path*/) { + DFTRACER_UTILS_LOG_ERROR("%s", "load_arrow: arrow IPC disabled"); + co_return nullptr; +} + +#endif // DFTRACER_UTILS_ENABLE_ARROW_IPC + +} // namespace dftracer::utils::call_tree diff --git a/src/dftracer/utils/utilities/call_tree/call_tree_save_binary.cpp b/src/dftracer/utils/utilities/call_tree/call_tree_save_binary.cpp new file mode 100644 index 00000000..32eb3b2c --- /dev/null +++ b/src/dftracer/utils/utilities/call_tree/call_tree_save_binary.cpp @@ -0,0 +1,429 @@ +// Compact custom-binary save/load for in-memory CallTree. 
+// +// On-disk layout (little-endian, all multi-byte fields native u32/u64/...): +// +// magic[8] = "DFTCGRP2" +// version u32 = 2 +// flags u32 (reserved, currently 0) +// string_table: +// count u32 +// for each: u32 length + raw bytes (utf-8; embedded NULs OK) +// process_count u32 +// for each ProcessCallTree: +// pid u32, tid u32, node_id u32 +// call_count u32 +// for each CallTreeNode: +// id u64 +// name_str_id u32, cat_str_id u32 +// start_time u64, duration u64 +// level i32, parent_id u64 +// child_count u32, then u64 ids +// arg_count u32, then per arg: +// key_str_id u32 +// type u8 { 0:string-id-u32, 1:u64, 2:i64, 3:double, 4:bool-u8 } +// payload (type-dependent) +// root_count u32, then u64 ids +// seq_count u32, then u64 ids + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::call_tree { + +namespace { + +using utilities::composites::dft::ArgsMap; +using utilities::composites::dft::ArgsValueProxy; + +enum ArgTypeTag : std::uint8_t { + ARG_STRING = 0, + ARG_U64 = 1, + ARG_I64 = 2, + ARG_DOUBLE = 3, + ARG_BOOL = 4, +}; + +dftracer::utils::StringIntern& binary_load_intern() { + static dftracer::utils::StringIntern instance; + return instance; +} + +void append_bytes(std::vector& out, const void* p, std::size_t n) { + out.insert(out.end(), static_cast(p), + static_cast(p) + n); +} + +template +void put_pod(std::vector& out, T v) { + append_bytes(out, &v, sizeof(v)); +} + +// Builder maintains the global string table and assigns dense ids; identical +// strings get the same id. `strings_` is a deque (not vector) so push_back +// keeps existing element addresses stable -- the index_ map stores +// string_views into those elements, and a vector realloc would dangle them +// (especially nasty for SSO strings whose storage lives inside the string +// object). 
+class StringTable { + public: + std::uint32_t intern(std::string_view s) { + auto it = index_.find(s); + if (it != index_.end()) return it->second; + const std::uint32_t id = static_cast(strings_.size()); + strings_.emplace_back(s); + index_.emplace(std::string_view(strings_.back()), id); + return id; + } + + void write(std::vector& out) const { + put_pod(out, + static_cast(strings_.size())); + for (const auto& s : strings_) { + put_pod(out, static_cast(s.size())); + append_bytes(out, s.data(), s.size()); + } + } + + private: + std::deque strings_; + ankerl::unordered_dense::map index_; +}; + +// Cursor over an in-memory buffer; tracks bounds and an `ok` flag so callers +// can early-exit on truncation without per-call error plumbing. +struct Cursor { + const char* p; + const char* end; + bool ok = true; + + template + bool get_pod(T& out) { + if (end - p < static_cast(sizeof(T))) { + ok = false; + return false; + } + std::memcpy(&out, p, sizeof(T)); + p += sizeof(T); + return true; + } + bool get_string_view(std::string_view& out, std::uint32_t len) { + if (end - p < static_cast(len)) { + ok = false; + return false; + } + out = std::string_view(p, len); + p += len; + return true; + } +}; + +void serialize_node(std::vector& out, StringTable& strings, + const internal::CallTreeNode& n) { + put_pod(out, n.get_id()); + put_pod(out, strings.intern(n.get_name())); + put_pod(out, strings.intern(n.get_category())); + put_pod(out, n.get_start_time()); + put_pod(out, n.get_duration()); + put_pod(out, static_cast(n.get_level())); + put_pod(out, n.get_parent_id()); + + const auto& children = n.get_children(); + put_pod(out, static_cast(children.size())); + for (auto id : children) put_pod(out, id); + + // Pull args out as (key, ArgsValueProxy) so we can preserve typing. 
+ std::vector> args; + n.get_args().for_each_member( + [&](std::string_view k, ArgsValueProxy v) { args.emplace_back(k, v); }); + + put_pod(out, static_cast(args.size())); + for (auto& [k, v] : args) { + put_pod(out, strings.intern(k)); + if (v.is_string()) { + put_pod(out, ARG_STRING); + const auto s = v.get(); + put_pod(out, strings.intern(s)); + } else if (v.is_uint()) { + put_pod(out, ARG_U64); + put_pod(out, v.get()); + } else if (v.is_int()) { + put_pod(out, ARG_I64); + put_pod(out, v.get()); + } else if (v.is_number()) { + put_pod(out, ARG_DOUBLE); + put_pod(out, v.get()); + } else if (v.is_bool()) { + put_pod(out, ARG_BOOL); + put_pod(out, v.get() ? 1 : 0); + } else { + put_pod(out, ARG_STRING); + put_pod(out, strings.intern("")); + } + } +} + +} // namespace + +coro::CoroTask save_binary(CoroScope* scope, + const internal::CallTree& tree, + std::string output_path) { + // First pass: emit body to a scratch buffer while populating the + // string table. Then write header + table + body. 
+ std::vector body; + body.reserve(1 << 20); + StringTable strings; + + auto keys = const_cast(tree).keys(); + put_pod(body, static_cast(keys.size())); + + for (const auto& key : keys) { + auto* graph = const_cast(tree).get(key); + if (!graph) continue; + + put_pod(body, key.pid); + put_pod(body, key.tid); + put_pod(body, key.node_id); + put_pod(body, + static_cast(graph->calls.size())); + for (const auto& [id, node] : graph->calls) { + if (node) serialize_node(body, strings, *node); + } + put_pod( + body, static_cast(graph->root_calls.size())); + for (auto id : graph->root_calls) put_pod(body, id); + put_pod( + body, static_cast(graph->call_sequence.size())); + for (auto id : graph->call_sequence) put_pod(body, id); + } + + std::vector out; + out.reserve(8 + 4 + 4 + body.size() + 16); + append_bytes(out, CALLTREE_BINARY_MAGIC, sizeof(CALLTREE_BINARY_MAGIC)); + put_pod(out, CALLTREE_BINARY_VERSION); + put_pod(out, 0u); // flags + strings.write(out); + append_bytes(out, body.data(), body.size()); + + utilities::fileio::parallel::WriterConfig wc; + wc.layout = utilities::fileio::parallel::FileLayout::STRIPED; + wc.gzip = false; + auto writer = utilities::fileio::parallel::make_writer(wc); + if (co_await writer->open(output_path, 1, false, scope) != 0) { + DFTRACER_UTILS_LOG_ERROR("save_binary: open failed: %s", + output_path.c_str()); + co_return false; + } + if (co_await writer->write_chunk(0, ByteView(out.data(), out.size())) != + 0) { + co_return false; + } + if (co_await writer->close() != 0) co_return false; + co_return true; +} + +coro::CoroTask> load_binary( + CoroScope* /*scope*/, std::string input_path) { + int fd = ::open(input_path.c_str(), O_RDONLY); + if (fd < 0) { + DFTRACER_UTILS_LOG_ERROR("load_binary: cannot open %s", + input_path.c_str()); + co_return nullptr; + } + struct stat st; + if (::fstat(fd, &st) != 0 || st.st_size <= 0) { + ::close(fd); + co_return nullptr; + } + std::vector buf(static_cast(st.st_size)); + std::size_t got = 0; + while (got 
< buf.size()) { + ssize_t n = ::read(fd, buf.data() + got, buf.size() - got); + if (n <= 0) break; + got += static_cast(n); + } + ::close(fd); + if (got != buf.size()) co_return nullptr; + + Cursor c{buf.data(), buf.data() + buf.size()}; + if (c.end - c.p < 8 || std::memcmp(c.p, CALLTREE_BINARY_MAGIC, + sizeof(CALLTREE_BINARY_MAGIC)) != 0) { + DFTRACER_UTILS_LOG_ERROR("load_binary: bad magic in %s", + input_path.c_str()); + co_return nullptr; + } + c.p += 8; + std::uint32_t version = 0, flags = 0; + if (!c.get_pod(version) || !c.get_pod(flags)) co_return nullptr; + if (version != CALLTREE_BINARY_VERSION) { + DFTRACER_UTILS_LOG_ERROR("load_binary: unsupported version %u", + version); + co_return nullptr; + } + + std::uint32_t nstr = 0; + if (!c.get_pod(nstr)) co_return nullptr; + std::vector table; + table.reserve(nstr); + for (std::uint32_t i = 0; i < nstr && c.ok; ++i) { + std::uint32_t len = 0; + std::string_view s; + if (!c.get_pod(len) || !c.get_string_view(s, len)) co_return nullptr; + table.push_back(s); + } + auto lookup_str = [&](std::uint32_t id) -> std::string_view { + return id < table.size() ? 
table[id] : std::string_view{}; + }; + + auto tree = std::make_unique(); + tree->initialize(); + + std::uint32_t nprocs = 0; + if (!c.get_pod(nprocs)) co_return nullptr; + + for (std::uint32_t pi = 0; pi < nprocs && c.ok; ++pi) { + std::uint32_t pid = 0, tid = 0, node_id = 0, ncalls = 0; + if (!c.get_pod(pid) || !c.get_pod(tid) || !c.get_pod(node_id) || + !c.get_pod(ncalls)) + break; + internal::ProcessKey key(pid, tid, node_id); + + for (std::uint32_t ci = 0; ci < ncalls && c.ok; ++ci) { + std::uint64_t id = 0, start = 0, dur = 0, parent = 0; + std::uint32_t name_id = 0, cat_id = 0; + std::int32_t level = 0; + if (!c.get_pod(id) || !c.get_pod(name_id) || !c.get_pod(cat_id) || + !c.get_pod(start) || !c.get_pod(dur) || !c.get_pod(level) || + !c.get_pod(parent)) + break; + + std::uint32_t nchildren = 0; + if (!c.get_pod(nchildren)) break; + std::vector children; + children.reserve(nchildren); + for (std::uint32_t k = 0; k < nchildren && c.ok; ++k) { + std::uint64_t cid = 0; + if (!c.get_pod(cid)) break; + children.push_back(cid); + } + + std::uint32_t nargs = 0; + if (!c.get_pod(nargs)) break; + ArgsMap args; + if (nargs > 0) args.set_valid(true); + for (std::uint32_t k = 0; k < nargs && c.ok; ++k) { + std::uint32_t key_id = 0; + std::uint8_t type = 0; + if (!c.get_pod(key_id) || !c.get_pod(type)) break; + auto key_sv = lookup_str(key_id); + switch (type) { + case ARG_STRING: { + std::uint32_t val_id = 0; + if (!c.get_pod(val_id)) { + c.ok = false; + break; + } + args.insert(key_sv, std::string(lookup_str(val_id))); + break; + } + case ARG_U64: { + std::uint64_t v = 0; + if (!c.get_pod(v)) { + c.ok = false; + break; + } + args.insert(key_sv, v); + break; + } + case ARG_I64: { + std::int64_t v = 0; + if (!c.get_pod(v)) { + c.ok = false; + break; + } + args.insert(key_sv, v); + break; + } + case ARG_DOUBLE: { + double v = 0; + if (!c.get_pod(v)) { + c.ok = false; + break; + } + args.insert(key_sv, v); + break; + } + case ARG_BOOL: { + std::uint8_t v = 0; + if 
(!c.get_pod(v)) { + c.ok = false; + break; + } + args.insert(key_sv, v != 0); + break; + } + default: + c.ok = false; + break; + } + } + + auto name = binary_load_intern().intern(lookup_str(name_id)); + auto cat = binary_load_intern().intern(lookup_str(cat_id)); + auto node = tree->get_factory().create_node( + id, name, cat, start, dur, static_cast(level), + std::move(args)); + node->set_parent_id(parent); + for (auto cid : children) node->add_child(cid); + tree->add_call(key, node); + } + + auto* pgraph = tree->get(key); + if (!pgraph) continue; + // add_call already appended each new node id into call_sequence in + // insertion order; the saved roots/sequence are authoritative, so + // clear before replacing. + pgraph->root_calls.clear(); + pgraph->call_sequence.clear(); + std::uint32_t nroots = 0; + if (!c.get_pod(nroots)) break; + for (std::uint32_t k = 0; k < nroots && c.ok; ++k) { + std::uint64_t id = 0; + if (!c.get_pod(id)) break; + pgraph->root_calls.push_back(id); + } + std::uint32_t nseq = 0; + if (!c.get_pod(nseq)) break; + for (std::uint32_t k = 0; k < nseq && c.ok; ++k) { + std::uint64_t id = 0; + if (!c.get_pod(id)) break; + pgraph->call_sequence.push_back(id); + } + } + + if (!c.ok) { + DFTRACER_UTILS_LOG_ERROR("load_binary: truncated/malformed file %s", + input_path.c_str()); + co_return nullptr; + } + co_return tree; +} + +} // namespace dftracer::utils::call_tree diff --git a/src/dftracer/utils/utilities/call_tree/json_serializer.cpp b/src/dftracer/utils/utilities/call_tree/json_serializer.cpp index 6b667e65..ee1bb394 100644 --- a/src/dftracer/utils/utilities/call_tree/json_serializer.cpp +++ b/src/dftracer/utils/utilities/call_tree/json_serializer.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -7,167 +8,147 @@ namespace dftracer::utils::call_tree { namespace internal { +using dftracer::utils::utilities::composites::dft::ArgsValueProxy; + JsonSerializer::JsonSerializer() : hostname_hash_("") {} size_t JsonSerializer::initialize(char* 
buffer, const std::string& hostname_hash) { hostname_hash_ = hostname_hash; - // Write opening bracket for JSON array (Chrome Tracing format requirement) buffer[0] = '['; buffer[1] = '\n'; return 2; } -bool JsonSerializer::convert_args_to_json( - const std::unordered_map& args, - std::stringstream& stream) { - if (args.empty()) { - return false; - } +bool JsonSerializer::convert_args_to_json(const ArgsMap& args, + std::stringstream& stream) { + if (!args) return false; - // Known fields that should always be strings (hash values, etc.) - const std::set string_fields = {"hhash", "fhash", "exec_hash", - "cmd_hash", "hostname_hash"}; + static const std::set> string_fields = { + "hhash", "fhash", "exec_hash", "cmd_hash", "hostname_hash"}; size_t count = 0; - for (const auto& [key, value] : args) { - // Add comma separator if not first element - if (count > 0) { - stream << ","; - } + bool any = false; + args.for_each_member([&](std::string_view key, ArgsValueProxy v) { + if (count > 0) stream << ","; + count++; + any = true; - // Check if this field should always be a string - bool force_string = (string_fields.find(key) != string_fields.end()); - - // Check if value looks like a pure number - // To be safe, only treat it as a number if: - // 1. Not a known string field - // 2. It doesn't contain any letters (handles hex strings like - // "df57e0a251b84b54") - // 3. It successfully parses as a number - // 4. 
The entire string was consumed during parsing - bool is_number = false; - if (!force_string && !value.empty()) { - // First check: no alphabetic characters - bool has_alpha = false; - for (char c : value) { - if (std::isalpha(c)) { - has_alpha = true; - break; - } - } + stream << "\"" << key << "\":"; - // Only try to parse as number if no alphabetic chars - if (!has_alpha && (std::isdigit(value[0]) || value[0] == '-' || - value[0] == '+')) { - char* end; - // Try integer parse - std::strtoll(value.c_str(), &end, 10); - if (end && *end == '\0') { - is_number = true; - } else { - // Try float parse - std::strtod(value.c_str(), &end); - if (end && *end == '\0') { + if (v.is_string()) { + std::string sv = v.get(); + bool force_string = string_fields.find(key) != string_fields.end(); + bool is_number = false; + if (!force_string && !sv.empty()) { + bool has_alpha = false; + for (char c : sv) { + if (std::isalpha(static_cast(c))) { + has_alpha = true; + break; + } + } + if (!has_alpha && + (std::isdigit(static_cast(sv[0])) || + sv[0] == '-' || sv[0] == '+')) { + char* end; + std::strtoll(sv.c_str(), &end, 10); + if (end && *end == '\0') is_number = true; + else { + std::strtod(sv.c_str(), &end); + if (end && *end == '\0') is_number = true; } } } - } - - // Format as JSON key-value pair - stream << "\"" << key << "\":"; - if (is_number) { - stream << value; - } else { - // Escape special characters in string values - stream << "\""; - for (char c : value) { - switch (c) { - case '"': - stream << "\\\""; - break; - case '\\': - stream << "\\\\"; - break; - case '\n': - stream << "\\n"; - break; - case '\r': - stream << "\\r"; - break; - case '\t': - stream << "\\t"; - break; - default: - stream << c; - break; + if (is_number) { + stream << sv; + } else { + stream << "\""; + for (char c : sv) { + switch (c) { + case '"': + stream << "\\\""; + break; + case '\\': + stream << "\\\\"; + break; + case '\n': + stream << "\\n"; + break; + case '\r': + stream << "\\r"; + break; + 
case '\t': + stream << "\\t"; + break; + default: + stream << c; + break; + } } + stream << "\""; } - stream << "\""; + } else if (v.is_uint()) { + stream << v.get(); + } else if (v.is_int()) { + stream << v.get(); + } else if (v.is_number()) { + stream << v.get(); + } else if (v.is_bool()) { + stream << (v.get() ? "true" : "false"); + } else { + stream << "null"; } + }); - count++; - } - - return true; + return any; } size_t JsonSerializer::serialize_node(char* buffer, int index, const CallTreeNode& node, std::uint32_t process_id, std::uint32_t thread_id) { - size_t written_size = 0; - - // Get node data const auto& args = node.get_args(); - // Build args JSON string if present std::stringstream args_stream; bool has_args = convert_args_to_json(args, args_stream); - // Build complete args object including hostname hash and metadata std::stringstream all_args; - // Check if args already has hhash, if not add it - bool has_hhash = args.find("hhash") != args.end(); + bool has_hhash = args["hhash"].exists(); if (!has_hhash && !hostname_hash_.empty()) { all_args << "\"hhash\":\"" << hostname_hash_ << "\""; } - // Check if args already has level, if not add it - bool has_level = args.find("level") != args.end(); + bool has_level = args["level"].exists(); if (!has_level) { if (all_args.str().size() > 0) all_args << ","; all_args << "\"level\":" << node.get_level(); } - // Add parent_id if not root and not already in args - bool has_parent = args.find("parent_id") != args.end(); + bool has_parent = args["parent_id"].exists(); if (node.get_parent_id() != 0 && !has_parent) { if (all_args.str().size() > 0) all_args << ","; all_args << "\"parent_id\":" << node.get_parent_id(); } - // Add custom args if present if (has_args) { if (all_args.str().size() > 0) all_args << ","; all_args << args_stream.str(); } - // Format as Chrome Tracing complete event (phase "X") - // Following DFTracer's format exactly: - // 
{"id":%d,"name":"%s","cat":"%s","pid":%d,"tid":%lu,"ts":%llu,"dur":%llu,"ph":"X","args":{...}} - written_size = std::snprintf( - buffer, - 16384, // Large buffer size to handle long strings - R"({"id":%d,"name":"%s","cat":"%s","pid":%u,"tid":%u,"ts":%llu,"dur":%llu,"ph":"X","args":{%s}})", - index, node.get_name().c_str(), node.get_category().c_str(), process_id, - thread_id, static_cast(node.get_start_time()), + auto nm = node.get_name(); + auto ct = node.get_category(); + size_t written_size = std::snprintf( + buffer, 16384, + R"({"id":%d,"name":"%.*s","cat":"%.*s","pid":%u,"tid":%u,"ts":%llu,"dur":%llu,"ph":"X","args":{%s}})", + index, static_cast(nm.size()), nm.data(), + static_cast(ct.size()), ct.data(), process_id, thread_id, + static_cast(node.get_start_time()), static_cast(node.get_duration()), all_args.str().c_str()); - // Add newline terminator if (written_size > 0) { buffer[written_size++] = '\n'; buffer[written_size] = '\0'; @@ -184,10 +165,6 @@ size_t JsonSerializer::serialize_metadata(char* buffer, const std::string& name, bool is_string) { size_t written_size = 0; - // Format metadata event (phase "M") - // Following DFTracer's format: - // {"name":"%s","cat":"dftracer","pid":%d,"tid":%lu,"ph":"M","args":{"hhash":"%s","name":"%s","value":"%s"}} - if (is_string) { written_size = std::snprintf( buffer, 8192, @@ -202,7 +179,6 @@ size_t JsonSerializer::serialize_metadata(char* buffer, const std::string& name, value.c_str()); } - // Add newline terminator if (written_size > 0) { buffer[written_size++] = '\n'; buffer[written_size] = '\0'; @@ -213,7 +189,6 @@ size_t JsonSerializer::serialize_metadata(char* buffer, const std::string& name, size_t JsonSerializer::finalize(char* buffer, bool write_bracket) { if (write_bracket) { - // Write closing bracket for JSON array buffer[0] = ']'; buffer[1] = '\n'; return 2; diff --git a/src/dftracer/utils/utilities/common/arrow/column_builder.cpp b/src/dftracer/utils/utilities/common/arrow/column_builder.cpp index 
1f326820..5c1bd444 100644 --- a/src/dftracer/utils/utilities/common/arrow/column_builder.cpp +++ b/src/dftracer/utils/utilities/common/arrow/column_builder.cpp @@ -1,3 +1,4 @@ +#include #ifdef DFTRACER_UTILS_ENABLE_ARROW #include @@ -23,6 +24,10 @@ ArrowType to_nanoarrow_type(ColumnType t) noexcept { return NANOARROW_TYPE_STRING; case ColumnType::BOOL: return NANOARROW_TYPE_BOOL; + case ColumnType::DICT_STRING: + // Dictionary uses INT32 indices; dictionary values handled + // separately + return NANOARROW_TYPE_INT32; } return NANOARROW_TYPE_UNINITIALIZED; } @@ -37,11 +42,15 @@ void RecordBatchBuilder::init_column(ColumnData& col, ColumnType type, col.has_nulls = false; } -void RecordBatchBuilder::backfill_nulls(ColumnData& col, size_t target_count) { - size_t n = target_count - col.count; +void RecordBatchBuilder::backfill_nulls(ColumnData& col, + std::size_t target_count) { + std::size_t n = target_count - col.count; if (n == 0) return; - col.has_nulls = true; + if (!col.has_nulls) { + col.has_nulls = true; + col.validity.assign(col.count, 1); + } col.validity.resize(col.count + n, 0); switch (col.type) { @@ -55,11 +64,16 @@ void RecordBatchBuilder::backfill_nulls(ColumnData& col, size_t target_count) { col.double_values.resize(col.count + n, 0.0); break; case ColumnType::STRING: - col.string_values.resize(col.count + n, std::string_view{}); + col.string_offsets.resize( + col.count + n, + static_cast(col.string_data.size())); break; case ColumnType::BOOL: col.bool_values.resize(col.count + n, 0); break; + case ColumnType::DICT_STRING: + col.dict_indices.resize(col.count + n, -1); // -1 = null + break; } col.count += n; } @@ -76,7 +90,7 @@ void RecordBatchBuilder::declare_schema(const std::vector& specs) { touched_.assign(specs.size(), false); for (const auto& spec : specs) { - size_t idx = columns_.size(); + std::size_t idx = columns_.size(); columns_.emplace_back(); init_column(columns_.back(), spec.type, spec.name); name_to_index_[spec.name] = idx; @@ -84,80 
+98,125 @@ void RecordBatchBuilder::declare_schema(const std::vector& specs) { schema_declared_ = true; } -size_t RecordBatchBuilder::add_or_get_column(std::string_view name, - ColumnType type) { - auto it = name_to_index_.find(std::string(name)); +std::size_t RecordBatchBuilder::add_or_get_column(std::string_view name, + ColumnType type) { + auto it = name_to_index_.find(name); if (it != name_to_index_.end()) { // Existing column: type is ignored. Callers that need type-safe // appends should use find_column() + column_type() first. return it->second; } - size_t idx = columns_.size(); + std::size_t idx = columns_.size(); columns_.emplace_back(); init_column(columns_.back(), type, name); if (num_rows_ > 0) { backfill_nulls(columns_.back(), num_rows_); } - name_to_index_[std::string(name)] = idx; + name_to_index_.emplace(std::string(name), idx); touched_.push_back(false); return idx; } -std::optional RecordBatchBuilder::find_column( +std::optional RecordBatchBuilder::find_column( std::string_view name) const { - auto it = name_to_index_.find(std::string(name)); + auto it = name_to_index_.find(name); if (it != name_to_index_.end()) return it->second; return std::nullopt; } -ColumnType RecordBatchBuilder::column_type(size_t col_idx) const noexcept { +ColumnType RecordBatchBuilder::column_type(std::size_t col_idx) const noexcept { return columns_[col_idx].type; } -void RecordBatchBuilder::append_int64(size_t col_idx, int64_t value) { +void RecordBatchBuilder::append_int64(std::size_t col_idx, std::int64_t value) { auto& col = columns_[col_idx]; col.int64_values.push_back(value); - col.validity.push_back(1); + if (col.has_nulls) col.validity.push_back(1); ++col.count; - if (!schema_declared_) touched_[col_idx] = true; + if (!schema_declared_ && !schema_locked_ && !touched_[col_idx]) { + touched_[col_idx] = true; + ++row_touched_count_; + } } -void RecordBatchBuilder::append_uint64(size_t col_idx, uint64_t value) { +void RecordBatchBuilder::append_uint64(std::size_t 
col_idx, + std::uint64_t value) { auto& col = columns_[col_idx]; col.uint64_values.push_back(value); - col.validity.push_back(1); + if (col.has_nulls) col.validity.push_back(1); ++col.count; - if (!schema_declared_) touched_[col_idx] = true; + if (!schema_declared_ && !schema_locked_ && !touched_[col_idx]) { + touched_[col_idx] = true; + ++row_touched_count_; + } } -void RecordBatchBuilder::append_double(size_t col_idx, double value) { +void RecordBatchBuilder::append_double(std::size_t col_idx, double value) { auto& col = columns_[col_idx]; col.double_values.push_back(value); - col.validity.push_back(1); + if (col.has_nulls) col.validity.push_back(1); ++col.count; - if (!schema_declared_) touched_[col_idx] = true; + if (!schema_declared_ && !schema_locked_ && !touched_[col_idx]) { + touched_[col_idx] = true; + ++row_touched_count_; + } } -void RecordBatchBuilder::append_string(size_t col_idx, std::string_view value) { +void RecordBatchBuilder::append_string(std::size_t col_idx, + std::string_view value) { auto& col = columns_[col_idx]; - col.string_values.push_back(value); + col.string_data.insert(col.string_data.end(), value.begin(), value.end()); + col.string_offsets.push_back( + static_cast(col.string_data.size())); + if (col.has_nulls) col.validity.push_back(1); + ++col.count; + if (!schema_declared_ && !schema_locked_ && !touched_[col_idx]) { + touched_[col_idx] = true; + ++row_touched_count_; + } +} + +void RecordBatchBuilder::append_dict_string(std::size_t col_idx, + std::string_view value) { + auto& col = columns_[col_idx]; + // Look up or insert into dictionary + auto it = col.dict_map.find(value); + std::int32_t idx; + if (it != col.dict_map.end()) { + idx = it->second; + } else { + idx = static_cast(col.dict_values.size()); + col.dict_values.emplace_back(value); + // Map key must point to stable storage (dict_values) + col.dict_map[col.dict_values.back()] = idx; + } + col.dict_indices.push_back(idx); col.validity.push_back(1); ++col.count; - if 
(!schema_declared_) touched_[col_idx] = true; + if (!schema_declared_ && !schema_locked_ && !touched_[col_idx]) { + touched_[col_idx] = true; + ++row_touched_count_; + } } -void RecordBatchBuilder::append_bool(size_t col_idx, bool value) { +void RecordBatchBuilder::append_bool(std::size_t col_idx, bool value) { auto& col = columns_[col_idx]; col.bool_values.push_back(value ? 1 : 0); - col.validity.push_back(1); + if (col.has_nulls) col.validity.push_back(1); ++col.count; - if (!schema_declared_) touched_[col_idx] = true; + if (!schema_declared_ && !schema_locked_ && !touched_[col_idx]) { + touched_[col_idx] = true; + ++row_touched_count_; + } } -void RecordBatchBuilder::append_null(size_t col_idx) { +void RecordBatchBuilder::append_null(std::size_t col_idx) { auto& col = columns_[col_idx]; - col.has_nulls = true; + if (!col.has_nulls) { + col.has_nulls = true; + col.validity.assign(col.count, 1); + } col.validity.push_back(0); switch (col.type) { @@ -171,29 +230,44 @@ void RecordBatchBuilder::append_null(size_t col_idx) { col.double_values.push_back(0.0); break; case ColumnType::STRING: - col.string_values.push_back(std::string_view{}); + col.string_offsets.push_back( + static_cast(col.string_data.size())); break; case ColumnType::BOOL: col.bool_values.push_back(0); break; + case ColumnType::DICT_STRING: + col.dict_indices.push_back(-1); // -1 = null + break; } ++col.count; - if (!schema_declared_) touched_[col_idx] = true; + if (!schema_declared_ && !touched_[col_idx]) { + touched_[col_idx] = true; + ++row_touched_count_; + } } void RecordBatchBuilder::end_row() { - // Backfill nulls for any column not appended to this row. - // In dynamic mode, use touched_ flags; in static mode, compare counts. 
- for (size_t i = 0; i < columns_.size(); ++i) { - if (columns_[i].count <= num_rows_) { - backfill_nulls(columns_[i], num_rows_ + 1); + if (!schema_declared_ && !schema_locked_ && + row_touched_count_ == columns_.size()) { + std::fill(touched_.begin(), touched_.end(), false); + row_touched_count_ = 0; + ++num_rows_; + return; + } + const std::size_t target = num_rows_ + 1; + const bool reset_touched = !schema_declared_ && !schema_locked_; + for (std::size_t i = 0; i < columns_.size(); ++i) { + if (columns_[i].count < target) { + backfill_nulls(columns_[i], target); } - if (!schema_declared_) touched_[i] = false; + if (reset_touched) touched_[i] = false; } + row_touched_count_ = 0; ++num_rows_; } -void RecordBatchBuilder::reserve(size_t num_rows) { +void RecordBatchBuilder::reserve(std::size_t num_rows) { for (auto& col : columns_) { switch (col.type) { case ColumnType::INT64: @@ -206,19 +280,27 @@ void RecordBatchBuilder::reserve(size_t num_rows) { col.double_values.reserve(num_rows); break; case ColumnType::STRING: - col.string_values.reserve(num_rows); + col.string_offsets.reserve(num_rows + 1); + // dftracer hash strings are 16 bytes; common strings + // (event names, categories) range 4-32. Bumping the + // initial reservation cuts geometric-growth memmove churn + // visible in perf for moderate batch sizes. + col.string_data.reserve(num_rows * 32); break; case ColumnType::BOOL: col.bool_values.reserve(num_rows); break; + case ColumnType::DICT_STRING: + col.dict_indices.reserve(num_rows); + break; } col.validity.reserve(num_rows); } } ArrowExportResult RecordBatchBuilder::finish() { - const int64_t ncols = static_cast(columns_.size()); - const int64_t nrows = static_cast(num_rows_); + const std::int64_t ncols = static_cast(columns_.size()); + const std::int64_t nrows = static_cast(num_rows_); // Build schema: struct with one child per column. 
nanoarrow::UniqueSchema schema; @@ -229,12 +311,34 @@ ArrowExportResult RecordBatchBuilder::finish() { if (ArrowSchemaAllocateChildren(schema.get(), ncols) != NANOARROW_OK) { throw std::runtime_error("ArrowSchemaAllocateChildren failed"); } - for (int64_t i = 0; i < ncols; ++i) { - const auto& col = columns_[static_cast(i)]; + for (std::int64_t i = 0; i < ncols; ++i) { + const auto& col = columns_[static_cast(i)]; ArrowSchema* child_schema = schema->children[i]; - if (ArrowSchemaInitFromType( - child_schema, to_nanoarrow_type(col.type)) != NANOARROW_OK) { - throw std::runtime_error("ArrowSchemaInitFromType(child) failed"); + + if (col.type == ColumnType::DICT_STRING) { + // Dictionary-encoded string: indices are INT32, values are STRING + if (ArrowSchemaInitFromType(child_schema, NANOARROW_TYPE_INT32) != + NANOARROW_OK) { + throw std::runtime_error( + "ArrowSchemaInitFromType(dict indices) failed"); + } + if (ArrowSchemaAllocateDictionary(child_schema) != NANOARROW_OK) { + throw std::runtime_error( + "ArrowSchemaAllocateDictionary failed"); + } + if (ArrowSchemaInitFromType(child_schema->dictionary, + NANOARROW_TYPE_STRING) != + NANOARROW_OK) { + throw std::runtime_error( + "ArrowSchemaInitFromType(dict values) failed"); + } + } else { + if (ArrowSchemaInitFromType(child_schema, + to_nanoarrow_type(col.type)) != + NANOARROW_OK) { + throw std::runtime_error( + "ArrowSchemaInitFromType(child) failed"); + } } if (ArrowSchemaSetName(child_schema, col.name.c_str()) != NANOARROW_OK) { @@ -253,100 +357,158 @@ ArrowExportResult RecordBatchBuilder::finish() { throw std::runtime_error("ArrowArrayStartAppending failed"); } - for (int64_t i = 0; i < ncols; ++i) { - const auto& col = columns_[static_cast(i)]; + for (std::int64_t i = 0; i < ncols; ++i) { + const auto& col = columns_[static_cast(i)]; ArrowArray* child = array->children[i]; if (ArrowArrayReserve(child, nrows) != NANOARROW_OK) { throw std::runtime_error("ArrowArrayReserve failed"); } - // AppendNull handles 
validity bits internally. + const std::size_t row_count = + std::min(col.count, static_cast(nrows)); + std::int64_t null_count = 0; + + auto fill_validity = [&]() { + if (!col.has_nulls) return; + ArrowBitmap* bm = ArrowArrayValidityBitmap(child); + ArrowBitmapReserve(bm, static_cast(row_count)); + for (std::size_t r = 0; r < row_count; ++r) { + std::uint8_t v = col.validity[r]; + if (!v) ++null_count; + ArrowBitmapAppendUnsafe(bm, v, 1); + } + }; + switch (col.type) { - case ColumnType::INT64: - for (size_t r = 0; r < col.count; ++r) { - if (col.has_nulls && col.validity[r] == 0) { - if (ArrowArrayAppendNull(child, 1) != NANOARROW_OK) { - throw std::runtime_error( - "ArrowArrayAppendNull failed"); - } - } else { - if (ArrowArrayAppendInt(child, col.int64_values[r]) != - NANOARROW_OK) { - throw std::runtime_error( - "ArrowArrayAppendInt failed"); - } - } + case ColumnType::INT64: { + fill_validity(); + ArrowBuffer* data_buf = ArrowArrayBuffer(child, 1); + if (ArrowBufferAppend(data_buf, col.int64_values.data(), + static_cast( + row_count * sizeof(std::int64_t))) != + NANOARROW_OK) { + throw std::runtime_error("INT64 buffer append failed"); } break; - case ColumnType::UINT64: - for (size_t r = 0; r < col.count; ++r) { - if (col.has_nulls && col.validity[r] == 0) { - if (ArrowArrayAppendNull(child, 1) != NANOARROW_OK) { - throw std::runtime_error( - "ArrowArrayAppendNull failed"); - } - } else { - if (ArrowArrayAppendUInt(child, col.uint64_values[r]) != - NANOARROW_OK) { - throw std::runtime_error( - "ArrowArrayAppendUInt failed"); - } - } + } + case ColumnType::UINT64: { + fill_validity(); + ArrowBuffer* data_buf = ArrowArrayBuffer(child, 1); + if (ArrowBufferAppend(data_buf, col.uint64_values.data(), + static_cast( + row_count * sizeof(std::uint64_t))) != + NANOARROW_OK) { + throw std::runtime_error("UINT64 buffer append failed"); } break; - case ColumnType::DOUBLE: - for (size_t r = 0; r < col.count; ++r) { - if (col.has_nulls && col.validity[r] == 0) { - if 
(ArrowArrayAppendNull(child, 1) != NANOARROW_OK) { - throw std::runtime_error( - "ArrowArrayAppendNull failed"); - } - } else { - if (ArrowArrayAppendDouble( - child, col.double_values[r]) != NANOARROW_OK) { - throw std::runtime_error( - "ArrowArrayAppendDouble failed"); - } - } + } + case ColumnType::DOUBLE: { + fill_validity(); + ArrowBuffer* data_buf = ArrowArrayBuffer(child, 1); + if (ArrowBufferAppend(data_buf, col.double_values.data(), + static_cast( + row_count * sizeof(double))) != + NANOARROW_OK) { + throw std::runtime_error("DOUBLE buffer append failed"); } break; + } case ColumnType::STRING: { - for (size_t r = 0; r < col.count; ++r) { + fill_validity(); + ArrowBuffer* offsets_buf = ArrowArrayBuffer(child, 1); + ArrowBuffer* data_buf = ArrowArrayBuffer(child, 2); + if (row_count > 0) { + ArrowBufferReserve(offsets_buf, + row_count * sizeof(std::int32_t)); + ArrowBufferAppend(offsets_buf, col.string_offsets.data(), + static_cast( + row_count * sizeof(std::int32_t))); + } + if (!col.string_data.empty()) { + ArrowBufferAppend( + data_buf, col.string_data.data(), + static_cast(col.string_data.size())); + } + break; + } + case ColumnType::BOOL: { + for (std::size_t r = 0; r < row_count; ++r) { if (col.has_nulls && col.validity[r] == 0) { if (ArrowArrayAppendNull(child, 1) != NANOARROW_OK) { throw std::runtime_error( - "ArrowArrayAppendNull failed"); + "ArrowArrayAppendNull(bool) failed"); } + ++null_count; } else { - std::string_view sv = col.string_values[r]; - ArrowStringView asv{sv.data(), - static_cast(sv.size())}; - if (ArrowArrayAppendString(child, asv) != + if (ArrowArrayAppendInt(child, col.bool_values[r]) != NANOARROW_OK) { throw std::runtime_error( - "ArrowArrayAppendString failed"); + "ArrowArrayAppendInt(bool) failed"); } } } break; } - case ColumnType::BOOL: - for (size_t r = 0; r < col.count; ++r) { + case ColumnType::DICT_STRING: { + // Build indices array (INT32) + for (std::size_t r = 0; r < col.count; ++r) { if (col.has_nulls && 
col.validity[r] == 0) { if (ArrowArrayAppendNull(child, 1) != NANOARROW_OK) { throw std::runtime_error( - "ArrowArrayAppendNull failed"); + "ArrowArrayAppendNull(dict) failed"); } } else { - if (ArrowArrayAppendInt(child, col.bool_values[r]) != + if (ArrowArrayAppendInt(child, col.dict_indices[r]) != NANOARROW_OK) { throw std::runtime_error( - "ArrowArrayAppendInt(bool) failed"); + "ArrowArrayAppendInt(dict index) failed"); } } } + + // Build dictionary array (STRING) + // Allocate dictionary array + child->dictionary = + static_cast(ArrowMalloc(sizeof(ArrowArray))); + if (!child->dictionary) { + throw std::runtime_error("Failed to allocate dictionary"); + } + ArrowArrayInitFromType(child->dictionary, + NANOARROW_TYPE_STRING); + if (ArrowArrayStartAppending(child->dictionary) != + NANOARROW_OK) { + throw std::runtime_error( + "ArrowArrayStartAppending(dict) failed"); + } + if (ArrowArrayReserve( + child->dictionary, + static_cast(col.dict_values.size())) != + NANOARROW_OK) { + throw std::runtime_error("ArrowArrayReserve(dict) failed"); + } + for (const auto& s : col.dict_values) { + ArrowStringView asv{s.data(), + static_cast(s.size())}; + if (ArrowArrayAppendString(child->dictionary, asv) != + NANOARROW_OK) { + throw std::runtime_error( + "ArrowArrayAppendString(dict) failed"); + } + } + if (ArrowArrayFinishBuildingDefault(child->dictionary, + nullptr) != NANOARROW_OK) { + throw std::runtime_error( + "ArrowArrayFinishBuildingDefault(dict) failed"); + } break; + } + } + + if (col.type == ColumnType::INT64 || col.type == ColumnType::UINT64 || + col.type == ColumnType::DOUBLE || col.type == ColumnType::STRING) { + child->length = static_cast(row_count); + child->null_count = col.has_nulls ? 
null_count : 0; } if (ArrowArrayFinishBuildingDefault(child, nullptr) != NANOARROW_OK) { @@ -367,22 +529,30 @@ ArrowExportResult RecordBatchBuilder::finish() { } void RecordBatchBuilder::reset(bool keep_schema) { - if (keep_schema && schema_declared_) { + // Keep schema if explicitly declared OR if dynamically locked + if (keep_schema && (schema_declared_ || schema_locked_)) { for (auto& col : columns_) { col.int64_values.clear(); col.uint64_values.clear(); col.double_values.clear(); - col.string_values.clear(); + col.string_offsets.clear(); + col.string_data.clear(); col.bool_values.clear(); + col.dict_indices.clear(); + col.dict_values.clear(); + col.dict_map.clear(); col.validity.clear(); col.count = 0; col.has_nulls = false; } + // Reset touched flags but keep the vector size + std::fill(touched_.begin(), touched_.end(), false); } else { columns_.clear(); name_to_index_.clear(); touched_.clear(); schema_declared_ = false; + schema_locked_ = false; } num_rows_ = 0; } diff --git a/src/dftracer/utils/utilities/common/arrow/ipc_reader.cpp b/src/dftracer/utils/utilities/common/arrow/ipc_reader.cpp new file mode 100644 index 00000000..30b39673 --- /dev/null +++ b/src/dftracer/utils/utilities/common/arrow/ipc_reader.cpp @@ -0,0 +1,355 @@ +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + +#include +#include +#include + +#include +#include + +// Platform-specific includes for mmap +#ifdef _WIN32 +#include +#else +#include +#include +#include +#include +#endif + +namespace dftracer::utils::utilities::common::arrow { + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +static ArrowIpcDecoder* as_decoder(void* p) noexcept { + return static_cast(p); +} + +// --------------------------------------------------------------------------- +// Lifecycle +// --------------------------------------------------------------------------- + 
+IpcReader::~IpcReader() { close(); } + +IpcReader::IpcReader(IpcReader&& other) noexcept + : mapped_data_(other.mapped_data_), + mapped_size_(other.mapped_size_), + fd_(other.fd_), + decoder_(other.decoder_), + shared_schema_(std::move(other.shared_schema_)), + blocks_(std::move(other.blocks_)), + num_batches_(other.num_batches_), + total_rows_(other.total_rows_) { + other.reset_state(); +} + +IpcReader& IpcReader::operator=(IpcReader&& other) noexcept { + if (this != &other) { + close(); + mapped_data_ = other.mapped_data_; + mapped_size_ = other.mapped_size_; + fd_ = other.fd_; + decoder_ = other.decoder_; + shared_schema_ = std::move(other.shared_schema_); + blocks_ = std::move(other.blocks_); + num_batches_ = other.num_batches_; + total_rows_ = other.total_rows_; + other.reset_state(); + } + return *this; +} + +void IpcReader::reset_state() noexcept { + mapped_data_ = nullptr; + mapped_size_ = 0; + fd_ = -1; + decoder_ = nullptr; + shared_schema_.reset(); + blocks_.clear(); + num_batches_ = 0; + total_rows_ = 0; +} + +void IpcReader::close() { + if (decoder_) { + ArrowIpcDecoderReset(as_decoder(decoder_)); + delete as_decoder(decoder_); + decoder_ = nullptr; + } + + shared_schema_.reset(); + +#ifdef _WIN32 + if (mapped_data_) { + UnmapViewOfFile(mapped_data_); + } + if (fd_ != -1) { + CloseHandle(reinterpret_cast(fd_)); + } +#else + if (mapped_data_ && mapped_data_ != MAP_FAILED) { + munmap(mapped_data_, mapped_size_); + } + if (fd_ != -1) { + ::close(fd_); + } +#endif + + reset_state(); +} + +// --------------------------------------------------------------------------- +// open / read_footer +// --------------------------------------------------------------------------- + +int IpcReader::open(const std::string& path) { + if (is_open()) return -1; + +#ifdef _WIN32 + // Windows memory mapping + HANDLE file = + CreateFileA(path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, + OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); + if (file == INVALID_HANDLE_VALUE) 
return -1; + + LARGE_INTEGER size; + if (!GetFileSizeEx(file, &size)) { + CloseHandle(file); + return -1; + } + mapped_size_ = static_cast(size.QuadPart); + + HANDLE mapping = + CreateFileMappingA(file, nullptr, PAGE_READONLY, 0, 0, nullptr); + if (!mapping) { + CloseHandle(file); + return -1; + } + + mapped_data_ = MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0); + CloseHandle(mapping); + if (!mapped_data_) { + CloseHandle(file); + return -1; + } + fd_ = reinterpret_cast(file); +#else + // POSIX memory mapping + fd_ = ::open(path.c_str(), O_RDONLY); + if (fd_ < 0) return -1; + + struct stat st; + if (fstat(fd_, &st) < 0) { + ::close(fd_); + fd_ = -1; + return -1; + } + mapped_size_ = static_cast(st.st_size); + + // Minimum Arrow IPC file: magic(8) + footer_size(4) + magic(6) = 18 bytes + if (mapped_size_ < 18) { + ::close(fd_); + fd_ = -1; + return -1; + } + + mapped_data_ = mmap(nullptr, mapped_size_, PROT_READ, MAP_PRIVATE, fd_, 0); + if (mapped_data_ == MAP_FAILED) { + ::close(fd_); + fd_ = -1; + mapped_data_ = nullptr; + return -1; + } + + // Advise kernel we'll read sequentially + madvise(mapped_data_, mapped_size_, MADV_SEQUENTIAL); +#endif + + int rc = read_footer(); + if (rc != NANOARROW_OK) { + close(); + return rc; + } + + return NANOARROW_OK; +} + +int IpcReader::read_footer() { + auto* data = static_cast(mapped_data_); + + // Arrow IPC file format footer: + // ... 
| footer | footer_size (4 bytes) | "ARROW1" (6 bytes) + + // Validate magic at end + if (std::memcmp(data + mapped_size_ - 6, "ARROW1", 6) != 0) { + return -1; + } + + // Read footer size (4 bytes before magic) + std::int32_t footer_size; + std::memcpy(&footer_size, data + mapped_size_ - 10, sizeof(footer_size)); + + // Calculate footer bounds + std::int64_t footer_total_size = + footer_size + 10; // footer + size(4) + magic(6) + std::int64_t footer_offset = mapped_size_ - footer_total_size; + if (footer_offset < 8) { // Must be after file magic + return -1; + } + + // Initialize decoder + auto* decoder = new (std::nothrow) ArrowIpcDecoder; + if (!decoder) return -1; + std::memset(decoder, 0, sizeof(ArrowIpcDecoder)); + + int rc = ArrowIpcDecoderInit(decoder); + if (rc != NANOARROW_OK) { + delete decoder; + return rc; + } + decoder_ = decoder; + + // Decode footer directly from mmap'd memory (zero-copy) + ArrowBufferView footer_view; + footer_view.data.as_uint8 = data + footer_offset; + footer_view.size_bytes = footer_total_size; + + ArrowError error; + rc = ArrowIpcDecoderVerifyFooter(decoder, footer_view, &error); + if (rc != NANOARROW_OK) { + return rc; + } + + rc = ArrowIpcDecoderDecodeFooter(decoder, footer_view, &error); + if (rc != NANOARROW_OK) { + return rc; + } + + // Footer is now available at decoder->footer + ArrowIpcFooter* footer = decoder->footer; + + // Copy block info - decoder state may be modified by subsequent operations + num_batches_ = + footer->record_batch_blocks.size_bytes / sizeof(ArrowIpcFileBlock); + blocks_.resize(num_batches_); + auto* src_blocks = reinterpret_cast( + footer->record_batch_blocks.data); + for (std::size_t i = 0; i < num_batches_; ++i) { + blocks_[i].offset = src_blocks[i].offset; + blocks_[i].metadata_length = src_blocks[i].metadata_length; + blocks_[i].body_length = src_blocks[i].body_length; + } + + // Create shared schema - deep copy once, share for all batches + auto* schema = new (std::nothrow) ArrowSchema; + if 
(!schema) return -1; + std::memset(schema, 0, sizeof(ArrowSchema)); + rc = ArrowSchemaDeepCopy(&footer->schema, schema); + if (rc != NANOARROW_OK) { + delete schema; + return rc; + } + + // Wrap in shared_ptr with custom deleter + shared_schema_ = std::shared_ptr(schema, [](void* p) { + auto* s = static_cast(p); + if (s->release) s->release(s); + delete s; + }); + + // Set decoder's expected schema + rc = ArrowIpcDecoderSetSchema(decoder, &footer->schema, &error); + if (rc != NANOARROW_OK) { + return rc; + } + + return NANOARROW_OK; +} + +// --------------------------------------------------------------------------- +// read_batch +// --------------------------------------------------------------------------- + +ArrowExportResult IpcReader::read_batch(std::size_t index) { + if (!is_open() || index >= num_batches_) { + return ArrowExportResult(); + } + + auto* decoder = as_decoder(decoder_); + auto* data = static_cast(mapped_data_); + const auto& block = blocks_[index]; + + // Zero-copy: point directly into mmap'd memory for header + ArrowBufferView header_view; + header_view.data.as_uint8 = data + block.offset; + header_view.size_bytes = block.metadata_length; + + ArrowError error; + int rc = ArrowIpcDecoderDecodeHeader(decoder, header_view, &error); + if (rc != NANOARROW_OK) { + return ArrowExportResult(); + } + + // Zero-copy: point directly into mmap'd memory for body + ArrowBufferView body_view; + body_view.data.as_uint8 = data + block.offset + block.metadata_length; + body_view.size_bytes = block.body_length; + + // Decode array + nanoarrow::UniqueArray array; + rc = ArrowIpcDecoderDecodeArray(decoder, body_view, -1, array.get(), + NANOARROW_VALIDATION_LEVEL_FULL, &error); + if (rc != NANOARROW_OK) { + return ArrowExportResult(); + } + + // Share schema instead of deep copying + // We need to create a new ArrowSchema that references our shared one + auto* schema_ptr = static_cast(shared_schema_.get()); + nanoarrow::UniqueSchema schema; + rc = 
ArrowSchemaDeepCopy(schema_ptr, schema.get()); + if (rc != NANOARROW_OK) { + return ArrowExportResult(); + } + + return ArrowExportResult(std::move(schema), std::move(array)); +} + +// --------------------------------------------------------------------------- +// read_all / for_each_batch +// --------------------------------------------------------------------------- + +std::vector IpcReader::read_all() { + std::vector results; + results.reserve(num_batches_); + + for (std::size_t i = 0; i < num_batches_; ++i) { + auto batch = read_batch(i); + if (batch.valid()) { + results.push_back(std::move(batch)); + } + } + + return results; +} + +int IpcReader::for_each_batch(std::function callback) { + for (std::size_t i = 0; i < num_batches_; ++i) { + auto batch = read_batch(i); + if (!batch.valid()) { + return -1; + } + int rc = callback(batch); + if (rc != 0) { + return rc; + } + } + return 0; +} + +} // namespace dftracer::utils::utilities::common::arrow + +#endif // DFTRACER_UTILS_ENABLE_ARROW_IPC diff --git a/src/dftracer/utils/utilities/common/arrow/ipc_writer.cpp b/src/dftracer/utils/utilities/common/arrow/ipc_writer.cpp index 871d85f0..8ee4cb4e 100644 --- a/src/dftracer/utils/utilities/common/arrow/ipc_writer.cpp +++ b/src/dftracer/utils/utilities/common/arrow/ipc_writer.cpp @@ -1,199 +1,719 @@ +#include #ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC +#include +#include #include +#include +#include +#include #include #include +#include +#include + +#ifdef DFTRACER_UTILS_ENABLE_ZSTD +#include +#endif -#include #include #include +#include + +#define ns(x) FLATBUFFERS_WRAP_NAMESPACE(org_apache_arrow_flatbuf, x) namespace dftracer::utils::utilities::common::arrow { +using FileBlock = ArrowIpcFileBlock; + +// --------------------------------------------------------------------------- +// BufferPool +// --------------------------------------------------------------------------- + +BufferPool::BufferPool(std::size_t num_slots, std::size_t initial_capacity) { + 
slots_.reserve(num_slots); + for (std::size_t i = 0; i < num_slots; ++i) { + auto slot = std::make_unique(); + slot->data.reserve(initial_capacity); + slots_.push_back(std::move(slot)); + } +} + +BufferPool::Slot* BufferPool::acquire(std::size_t min_capacity) { + for (auto& slot : slots_) { + bool expected = false; + if (slot->in_use.compare_exchange_strong(expected, true, + std::memory_order_acquire)) { + if (slot->data.capacity() < min_capacity) { + slot->data.reserve(min_capacity); + } + slot->data.clear(); + return slot.get(); + } + } + return nullptr; // All slots in use +} + +void BufferPool::release(Slot* slot) { + if (slot) { + slot->in_use.store(false, std::memory_order_release); + } +} + // --------------------------------------------------------------------------- -// Helpers +// Compression helpers // --------------------------------------------------------------------------- -static ArrowIpcWriter* as_writer(void* p) noexcept { - return static_cast(p); +#ifdef DFTRACER_UTILS_ENABLE_ZSTD + +struct BufferInfo { + std::int64_t offset; + std::int64_t length; +}; + +static void collect_flat_buffers(ArrowArrayView* view, + std::vector& out, + bool is_root = false) { + if (!is_root) { + std::int64_t num_buffers = ArrowArrayViewGetNumBuffers(view); + for (std::int64_t i = 0; i < num_buffers; i++) { + out.push_back(ArrowArrayViewGetBufferView(view, i)); + } + } + for (std::int64_t i = 0; i < view->n_children; i++) { + collect_flat_buffers(view->children[i], out, false); + } +} + +static void collect_nodes(const ArrowArrayView* view, + std::vector& nodes, + bool is_root = false) { + if (!is_root) { + ns(FieldNode_t) node; + node.length = view->length; + node.null_count = ArrowArrayViewComputeNullCount(view); + nodes.push_back(node); + } + for (std::int64_t i = 0; i < view->n_children; i++) { + collect_nodes(view->children[i], nodes, false); + } +} + +static int build_compressed_body(ArrowArrayView* view, + std::vector& out_body, + std::vector& out_info) { + 
std::vector flat_buffers; + collect_flat_buffers(view, flat_buffers, true); + + out_body.clear(); + out_info.clear(); + std::int64_t compressed_offset = 0; + + for (const auto& buf : flat_buffers) { + std::int64_t uncompressed_size = buf.size_bytes; + + if (uncompressed_size == 0 || buf.data.data == nullptr) { + out_info.push_back({compressed_offset, 0}); + continue; + } + + std::size_t max_compressed = ZSTD_compressBound(uncompressed_size); + std::size_t old_size = out_body.size(); + + // Reserve space: [int64 uncompressed_size][zstd data] + out_body.resize(old_size + 8 + max_compressed); + + std::size_t compressed_size = + ZSTD_compress(out_body.data() + old_size + 8, max_compressed, + buf.data.data, uncompressed_size, 3); + + if (ZSTD_isError(compressed_size)) { + return -1; + } + + std::int64_t total_size = + 8 + static_cast(compressed_size); + out_body.resize(old_size + total_size); + std::memcpy(out_body.data() + old_size, &uncompressed_size, 8); + + out_info.push_back({compressed_offset, total_size}); + + compressed_offset += total_size; + std::int64_t padded = (compressed_offset + 7) & ~7; + out_body.resize(padded, 0); + compressed_offset = padded; + } + + return 0; } -static ArrowIpcOutputStream* as_stream(void* p) noexcept { - return static_cast(p); +static int build_message_header(ArrowArrayView* view, + const std::vector& buffer_info, + std::int64_t body_length, + std::vector& out_header) { + std::vector nodes; + collect_nodes(view, nodes, true); + + flatcc_builder_t builder; + if (flatcc_builder_init(&builder) == -1) { + return -1; + } + flatcc_builder_set_vtable_clustering(&builder, 0); + + std::vector buffer_structs; + buffer_structs.reserve(buffer_info.size()); + for (const auto& buf : buffer_info) { + ns(Buffer_t) b; + b.offset = buf.offset; + b.length = buf.length; + buffer_structs.push_back(b); + } + + ns(BodyCompression_ref_t) compression_ref = ns(BodyCompression_create( + &builder, ns(CompressionType_ZSTD), ns(BodyCompressionMethod_BUFFER))); + 
+ ns(Message_start_as_root(&builder)); + ns(Message_version_add(&builder, ns(MetadataVersion_V5))); + ns(Message_header_RecordBatch_start(&builder)); + ns(RecordBatch_length_add(&builder, view->length)); + ns(RecordBatch_nodes_create( + &builder, reinterpret_cast(nodes.data()), + nodes.size())); + ns(RecordBatch_buffers_create(&builder, buffer_structs.data(), + buffer_structs.size())); + ns(RecordBatch_compression_add(&builder, compression_ref)); + ns(Message_header_RecordBatch_end(&builder)); + ns(Message_bodyLength_add(&builder, body_length)); + ns(Message_end_as_root(&builder)); + + std::size_t msg_size = 0; + void* msg_buf = flatcc_builder_get_direct_buffer(&builder, &msg_size); + void* allocated_buf = nullptr; + + if (!msg_buf) { + msg_buf = flatcc_builder_finalize_buffer(&builder, &msg_size); + allocated_buf = msg_buf; + } + + if (!msg_buf || msg_size == 0) { + if (allocated_buf) flatcc_builder_free(allocated_buf); + flatcc_builder_clear(&builder); + return -1; + } + + // Build IPC encapsulated message: continuation(-1) + size + metadata + + // padding + std::int32_t continuation = -1; + std::int32_t msg_size_i32 = static_cast(msg_size); + std::size_t msg_padding = (8 - (msg_size % 8)) % 8; + + out_header.clear(); + out_header.resize(8 + msg_size + msg_padding); + std::memcpy(out_header.data(), &continuation, 4); + std::memcpy(out_header.data() + 4, &msg_size_i32, 4); + std::memcpy(out_header.data() + 8, msg_buf, msg_size); + // Padding bytes are already zero from resize + + if (allocated_buf) flatcc_builder_free(allocated_buf); + flatcc_builder_clear(&builder); + + return 0; } +#endif // DFTRACER_UTILS_ENABLE_ZSTD + // --------------------------------------------------------------------------- -// Lifecycle +// IpcWriter lifecycle // --------------------------------------------------------------------------- IpcWriter::~IpcWriter() { if (is_open()) { - close(); + // Sync close in destructor - not ideal but safe + if (fd_ >= 0) { + ::close(fd_); + } + 
reset_state(); } } -IpcWriter::IpcWriter(IpcWriter&& other) noexcept - : file_(other.file_), - schema_written_(other.schema_written_), - writer_(other.writer_), - stream_(other.stream_) { +IpcWriter::IpcWriter(IpcWriter&& other) noexcept { + fd_ = other.fd_; + write_offset_ = other.write_offset_; + buffer_pool_ = std::move(other.buffer_pool_); + schema_written_ = other.schema_written_; + compression_ = other.compression_; + batch_blocks_ = other.batch_blocks_; + schema_copy_ = other.schema_copy_; other.reset_state(); } IpcWriter& IpcWriter::operator=(IpcWriter&& other) noexcept { if (this != &other) { - if (is_open()) close(); - file_ = other.file_; + if (fd_ >= 0) ::close(fd_); + fd_ = other.fd_; + write_offset_ = other.write_offset_; + buffer_pool_ = std::move(other.buffer_pool_); schema_written_ = other.schema_written_; - writer_ = other.writer_; - stream_ = other.stream_; + compression_ = other.compression_; + batch_blocks_ = other.batch_blocks_; + schema_copy_ = other.schema_copy_; other.reset_state(); } return *this; } void IpcWriter::reset_state() noexcept { - file_ = nullptr; - writer_ = nullptr; - stream_ = nullptr; + fd_ = -1; + write_offset_ = 0; schema_written_ = false; + batch_blocks_ = nullptr; + schema_copy_ = nullptr; } // --------------------------------------------------------------------------- // open // --------------------------------------------------------------------------- -int IpcWriter::open(const std::string& path) { - if (is_open()) return -1; +coro::CoroTask IpcWriter::open(const std::string& path, + IpcCompression compression, + std::size_t pool_slots) { + if (is_open()) co_return -1; - file_ = std::fopen(path.c_str(), "wb"); - if (!file_) return -1; + compression_ = compression; + buffer_pool_ = BufferPool(pool_slots); - // Allocate stream. ArrowIpcWriterInit takes ownership on success. 
- auto* os = new (std::nothrow) ArrowIpcOutputStream; - if (!os) { - std::fclose(file_); - file_ = nullptr; - return -1; + // Async open via io::open (auto-detects executor context) + auto result = + co_await io::open(path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644); + + if (result < 0) { + reset_state(); + co_return static_cast(result); + } + + fd_ = static_cast(result); + write_offset_ = 0; + + // Write Arrow IPC file magic "ARROW1" + padding + constexpr char MAGIC[] = "ARROW1\0\0"; + auto write_result = co_await io::pwrite(fd_, MAGIC, 8, 0); + if (write_result < 0) { + co_await io::close(fd_); + reset_state(); + co_return static_cast(write_result); + } + write_offset_ = 8; + + // Initialize block tracking + batch_blocks_ = new std::vector(); + + co_return 0; +} + +// --------------------------------------------------------------------------- +// write_schema +// --------------------------------------------------------------------------- + +coro::CoroTask IpcWriter::write_schema(ArrowExportResult& batch) { + ArrowSchema* schema = batch.get_schema(); + + // Deep copy schema for footer + auto* schema_cp = new ArrowSchema; + ArrowSchemaDeepCopy(schema, schema_cp); + schema_copy_ = schema_cp; + + // Encode schema message + ArrowIpcEncoder encoder; + ArrowIpcEncoderInit(&encoder); + + ArrowError error; + int rc = ArrowIpcEncoderEncodeSchema(&encoder, schema, &error); + if (rc != NANOARROW_OK) { + ArrowIpcEncoderReset(&encoder); + co_return rc; + } + + ArrowBuffer msg_buf; + ArrowBufferInit(&msg_buf); + rc = ArrowIpcEncoderFinalizeBuffer(&encoder, 1, &msg_buf); + ArrowIpcEncoderReset(&encoder); + if (rc != NANOARROW_OK) { + ArrowBufferReset(&msg_buf); + co_return rc; + } + + // Write schema message + auto result = co_await io::pwrite(fd_, msg_buf.data, msg_buf.size_bytes, + write_offset_); + + if (result < 0) { + ArrowBufferReset(&msg_buf); + co_return static_cast(result); } - std::memset(os, 0, sizeof(ArrowIpcOutputStream)); - // close_on_release=0: we manage the 
FILE* ourselves. - int rc = ArrowIpcOutputStreamInitFile(os, file_, /*close_on_release=*/0); + write_offset_ += msg_buf.size_bytes; + ArrowBufferReset(&msg_buf); + schema_written_ = true; + + co_return 0; +} + +// --------------------------------------------------------------------------- +// encode_batch_uncompressed +// --------------------------------------------------------------------------- + +static int encode_batch_uncompressed(ArrowExportResult& batch, + std::vector& out_header, + std::vector& out_body) { + ArrowSchema* schema = batch.get_schema(); + ArrowArray* array = batch.get_array(); + + ArrowArrayView view; + ArrowError error; + int rc = ArrowArrayViewInitFromSchema(&view, schema, &error); if (rc != NANOARROW_OK) { - delete os; - std::fclose(file_); - file_ = nullptr; return rc; } - auto* w = new (std::nothrow) ArrowIpcWriter; - if (!w) { - // os not yet consumed — release it manually - if (os->release) os->release(os); - delete os; - std::fclose(file_); - file_ = nullptr; - return -1; + rc = ArrowArrayViewSetArray(&view, array, &error); + if (rc != NANOARROW_OK) { + ArrowArrayViewReset(&view); + return rc; } - std::memset(w, 0, sizeof(ArrowIpcWriter)); - // ArrowIpcWriterInit takes ownership of *os (moves it internally). - rc = ArrowIpcWriterInit(w, os); + ArrowIpcEncoder encoder; + ArrowIpcEncoderInit(&encoder); + + ArrowBuffer body_buf; + ArrowBufferInit(&body_buf); + + rc = ArrowIpcEncoderEncodeSimpleRecordBatch(&encoder, &view, &body_buf, + &error); if (rc != NANOARROW_OK) { - // Init failed: writer did not take ownership, release stream ourselves. - if (os->release) os->release(os); - delete os; - delete w; - std::fclose(file_); - file_ = nullptr; + ArrowBufferReset(&body_buf); + ArrowIpcEncoderReset(&encoder); + ArrowArrayViewReset(&view); return rc; } - // Keep os pointer so we can delete the allocation in close(). - // The writer owns the stream contents; we only own the heap allocation. 
- stream_ = os; - writer_ = w; + ArrowBuffer header_buf; + ArrowBufferInit(&header_buf); - // Write IPC file magic. - rc = ArrowIpcWriterStartFile(w, nullptr); + rc = ArrowIpcEncoderFinalizeBuffer(&encoder, 1, &header_buf); if (rc != NANOARROW_OK) { - ArrowIpcWriterReset(w); - delete w; - delete os; - std::fclose(file_); - reset_state(); + ArrowBufferReset(&body_buf); + ArrowBufferReset(&header_buf); + ArrowIpcEncoderReset(&encoder); + ArrowArrayViewReset(&view); return rc; } - return NANOARROW_OK; + out_header.resize(header_buf.size_bytes); + std::memcpy(out_header.data(), header_buf.data, header_buf.size_bytes); + + out_body.resize(body_buf.size_bytes); + std::memcpy(out_body.data(), body_buf.data, body_buf.size_bytes); + + ArrowBufferReset(&body_buf); + ArrowBufferReset(&header_buf); + ArrowIpcEncoderReset(&encoder); + ArrowArrayViewReset(&view); + + return 0; } // --------------------------------------------------------------------------- -// write_batch +// compress_batch // --------------------------------------------------------------------------- -int IpcWriter::write_batch(ArrowExportResult& batch) { - if (!is_open() || !batch.valid()) return -1; +coro::CoroTask IpcWriter::compress_batch( + ArrowExportResult& batch) { + CompressedBatch result{}; + + if (compression_ == IpcCompression::NONE) { + result.body_slot = buffer_pool_.acquire(64 * 1024); + if (!result.body_slot) { + result.body_slot = new BufferPool::Slot(); + } + + int rc = encode_batch_uncompressed(batch, result.header, + result.body_slot->data); + if (rc != 0) { + co_return result; + } + + result.body_size = result.body_slot->data.size(); + result.body_length = static_cast(result.body_size); + result.metadata_length = + static_cast(result.header.size()); + co_return result; + } + +#ifdef DFTRACER_UTILS_ENABLE_ZSTD + if (compression_ != IpcCompression::ZSTD) { + co_return result; + } - ArrowIpcWriter* w = as_writer(writer_); ArrowSchema* schema = batch.get_schema(); ArrowArray* array = 
batch.get_array(); - // Write schema once, before the first record batch. - if (!schema_written_) { - int rc = ArrowIpcWriterWriteSchema(w, schema, nullptr); - if (rc != NANOARROW_OK) return rc; - schema_written_ = true; - } - - // Build an ArrowArrayView from the schema + array. ArrowArrayView view; ArrowError error; int rc = ArrowArrayViewInitFromSchema(&view, schema, &error); if (rc != NANOARROW_OK) { ArrowArrayViewReset(&view); - return rc; + co_return result; } rc = ArrowArrayViewSetArray(&view, array, &error); if (rc != NANOARROW_OK) { ArrowArrayViewReset(&view); - return rc; + co_return result; + } + + // Acquire pooled buffer for compressed body + std::size_t estimated_size = 0; + std::vector flat_buffers; + collect_flat_buffers(&view, flat_buffers, true); + for (const auto& buf : flat_buffers) { + estimated_size += ZSTD_compressBound(buf.size_bytes) + 8; + } + + result.body_slot = buffer_pool_.acquire(estimated_size); + if (!result.body_slot) { + result.body_slot = new BufferPool::Slot(); + result.body_slot->data.reserve(estimated_size); + } + + // Compress into pooled buffer + std::vector buffer_info; + rc = build_compressed_body(&view, result.body_slot->data, buffer_info); + if (rc != 0) { + ArrowArrayViewReset(&view); + co_return result; + } + + result.body_size = result.body_slot->data.size(); + result.body_length = static_cast(result.body_size); + + // Build message header + rc = build_message_header(&view, buffer_info, result.body_length, + result.header); + if (rc != 0) { + ArrowArrayViewReset(&view); + co_return result; } - rc = ArrowIpcWriterWriteArrayView(w, &view, nullptr); + result.metadata_length = static_cast(result.header.size()); + ArrowArrayViewReset(&view); - return rc; +#endif + + co_return result; } // --------------------------------------------------------------------------- -// close +// write_compressed // --------------------------------------------------------------------------- -int IpcWriter::close() { - if (!is_open()) return 
0; +coro::CoroTask IpcWriter::write_compressed(CompressedBatch& cb) { + auto* blocks = static_cast*>(batch_blocks_); - int rc = NANOARROW_OK; - ArrowIpcWriter* w = as_writer(writer_); + // Record block info + FileBlock block; + block.offset = write_offset_; + block.metadata_length = cb.metadata_length; + block.body_length = cb.body_length; - if (w && schema_written_) { - rc = ArrowIpcWriterFinalizeFile(w, nullptr); + // Vectored write: header + body + struct iovec iov[2]; + iov[0].iov_base = cb.header.data(); + iov[0].iov_len = cb.header.size(); + iov[1].iov_base = cb.body_slot->data.data(); + iov[1].iov_len = cb.body_size; + + auto result = co_await io::pwritev(fd_, iov, 2, write_offset_); + + // Release pooled buffer + buffer_pool_.release(cb.body_slot); + cb.body_slot = nullptr; + + if (result < 0) { + co_return static_cast(result); } - if (w) { - ArrowIpcWriterReset(w); - delete w; + write_offset_ += result; + blocks->push_back(block); + + co_return 0; +} + +// --------------------------------------------------------------------------- +// write_batch +// --------------------------------------------------------------------------- + +coro::CoroTask IpcWriter::write_batch(ArrowExportResult& batch) { + if (!is_open() || !batch.valid()) co_return -1; + + // Write schema on first batch + if (!schema_written_) { + int rc = co_await write_schema(batch); + if (rc != 0) co_return rc; } - // The stream allocation is ours; its contents were released by Reset. 
- if (stream_) { - delete as_stream(stream_); + // Compress and write + auto cb = co_await compress_batch(batch); + if (cb.header.empty()) co_return -1; + + co_return co_await write_compressed(cb); +} + +// --------------------------------------------------------------------------- +// write_batches (parallel compression) +// --------------------------------------------------------------------------- + +coro::CoroTask IpcWriter::write_batches( + std::vector& batches) { + if (!is_open()) co_return -1; + if (batches.empty()) co_return 0; + + // Write schema from first batch + if (!schema_written_) { + int rc = co_await write_schema(batches[0]); + if (rc != 0) co_return rc; + } + + // Parallel compress all batches + std::vector> compress_tasks; + compress_tasks.reserve(batches.size()); + + for (auto& batch : batches) { + if (batch.valid()) { + compress_tasks.push_back(compress_batch(batch)); + } + } + + auto compressed = co_await coro::when_all(std::move(compress_tasks)); + + // Write in order (sequential to maintain file structure) + for (auto& cb : compressed) { + if (cb.header.empty()) { + co_return -1; + } + int rc = co_await write_compressed(cb); + if (rc != 0) co_return rc; + } + + co_return 0; +} + +// --------------------------------------------------------------------------- +// write_footer +// --------------------------------------------------------------------------- + +coro::CoroTask IpcWriter::write_footer() { + auto* blocks = static_cast*>(batch_blocks_); + auto* schema = static_cast(schema_copy_); + + if (!blocks || !schema) { + co_return -1; + } + + ArrowIpcFooter footer; + ArrowIpcFooterInit(&footer); + ArrowSchemaMove(schema, &footer.schema); + + for (const auto& block : *blocks) { + int rc = ArrowBufferAppend(&footer.record_batch_blocks, &block, + sizeof(FileBlock)); + if (rc != NANOARROW_OK) { + ArrowIpcFooterReset(&footer); + co_return rc; + } } - std::fclose(file_); + ArrowIpcEncoder encoder; + ArrowIpcEncoderInit(&encoder); + + ArrowError 
error; + int rc = ArrowIpcEncoderEncodeFooter(&encoder, &footer, &error); + if (rc != NANOARROW_OK) { + ArrowIpcEncoderReset(&encoder); + ArrowIpcFooterReset(&footer); + co_return rc; + } + + ArrowBuffer footer_buf; + ArrowBufferInit(&footer_buf); + rc = ArrowIpcEncoderFinalizeBuffer(&encoder, 0, &footer_buf); + ArrowIpcEncoderReset(&encoder); + if (rc != NANOARROW_OK) { + ArrowBufferReset(&footer_buf); + ArrowIpcFooterReset(&footer); + co_return rc; + } + + // Build footer: EOS marker + footer + footer_size + magic + std::vector footer_data; + footer_data.resize(8 + footer_buf.size_bytes + 4 + 6); + + std::int32_t eos_continuation = -1; + std::int32_t eos_size = 0; + std::memcpy(footer_data.data(), &eos_continuation, 4); + std::memcpy(footer_data.data() + 4, &eos_size, 4); + std::memcpy(footer_data.data() + 8, footer_buf.data, footer_buf.size_bytes); + + std::int32_t footer_size = static_cast(footer_buf.size_bytes); + std::memcpy(footer_data.data() + 8 + footer_buf.size_bytes, &footer_size, + 4); + std::memcpy(footer_data.data() + 8 + footer_buf.size_bytes + 4, "ARROW1", + 6); + + ArrowBufferReset(&footer_buf); + ArrowIpcFooterReset(&footer); + + // Write footer + auto result = co_await io::pwrite(fd_, footer_data.data(), + footer_data.size(), write_offset_); + + if (result < 0) { + co_return static_cast(result); + } + + write_offset_ += result; + co_return 0; +} + +// --------------------------------------------------------------------------- +// close +// --------------------------------------------------------------------------- + +coro::CoroTask IpcWriter::close() { + if (!is_open()) co_return 0; + + int rc = 0; + + // Write footer + if (schema_written_ && batch_blocks_) { + rc = co_await write_footer(); + } + + // Cleanup + if (schema_copy_) { + auto* schema = static_cast(schema_copy_); + if (schema->release) schema->release(schema); + delete schema; + } + if (batch_blocks_) { + delete static_cast*>(batch_blocks_); + } + + // Fsync and close + co_await 
io::fsync(fd_); + co_await io::close(fd_); + reset_state(); - return rc; + co_return rc; } } // namespace dftracer::utils::utilities::common::arrow diff --git a/src/dftracer/utils/utilities/common/arrow/parallel_reader.cpp b/src/dftracer/utils/utilities/common/arrow/parallel_reader.cpp new file mode 100644 index 00000000..fba3cdc3 --- /dev/null +++ b/src/dftracer/utils/utilities/common/arrow/parallel_reader.cpp @@ -0,0 +1,111 @@ +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + +#include +#include +#include +#include + +#include + +namespace dftracer::utils::utilities::common::arrow { + +using dftracer::utils::coro::CoroTask; +using dftracer::utils::coro::when_all; + +CoroTask read_arrow_file_async(std::string path) { + ArrowFileReadResult result; + result.path = path; + + try { + IpcReader reader; + int rc = reader.open(path); + if (rc != 0) { + result.success = false; + result.error = "Failed to open file: " + path; + co_return result; + } + + *result.batches = reader.read_all(); + for (const auto& batch : *result.batches) { + result.total_rows += batch.num_rows(); + } + + result.success = true; + } catch (const std::exception& e) { + result.success = false; + result.error = e.what(); + } + + co_return result; +} + +CoroTask read_arrow_files_parallel( + std::vector paths) { + ParallelReadResult result; + + if (paths.empty()) { + co_return result; + } + + std::vector> tasks; + tasks.reserve(paths.size()); + + for (auto& path : paths) { + tasks.push_back(read_arrow_file_async(std::move(path))); + } + + result.file_results = co_await when_all(std::move(tasks)); + + for (const auto& fr : result.file_results) { + if (fr.success) { + result.files_read++; + result.total_rows += fr.total_rows; + result.total_batches += fr.batches->size(); + } else { + result.files_failed++; + } + } + + co_return result; +} + +CoroTask read_arrow_files_streaming( + CoroScope& /*scope*/, std::vector paths, + FileResultCallback callback) { + if (paths.empty()) { + co_return 
ParallelReadResult{}; + } + + std::vector> tasks; + tasks.reserve(paths.size()); + + for (auto& path : paths) { + tasks.push_back(read_arrow_file_async(std::move(path))); + } + + auto results = co_await when_all(std::move(tasks)); + + ParallelReadResult summary; + bool cancelled = false; + + for (auto& result : results) { + if (result.success) { + summary.files_read++; + summary.total_rows += result.total_rows; + summary.total_batches += result.batches->size(); + } else { + summary.files_failed++; + } + + if (!cancelled && !callback(std::move(result))) { + cancelled = true; + } + } + + co_return summary; +} + +} // namespace dftracer::utils::utilities::common::arrow + +#endif // DFTRACER_UTILS_ENABLE_ARROW_IPC diff --git a/src/dftracer/utils/utilities/common/arrow/partition_router.cpp b/src/dftracer/utils/utilities/common/arrow/partition_router.cpp new file mode 100644 index 00000000..d7dad205 --- /dev/null +++ b/src/dftracer/utils/utilities/common/arrow/partition_router.cpp @@ -0,0 +1,623 @@ +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace dftracer::utils::utilities::common::arrow { + +namespace { + +std::string extract_string(const ArrowArrayView* view, int64_t idx) { + ArrowStringView sv = ArrowArrayViewGetStringUnsafe(view, idx); + return std::string(sv.data, sv.size_bytes); +} + +int64_t extract_int64(const ArrowArrayView* view, int64_t idx) { + return ArrowArrayViewGetIntUnsafe(view, idx); +} + +uint64_t extract_uint64(const ArrowArrayView* view, int64_t idx) { + return static_cast(ArrowArrayViewGetUIntUnsafe(view, idx)); +} + +double extract_double(const ArrowArrayView* view, int64_t idx) { + return ArrowArrayViewGetDoubleUnsafe(view, idx); +} + +bool is_null(const ArrowArrayView* view, int64_t idx) { + return ArrowArrayViewIsNull(view, idx); +} + +std::string value_to_string(const ArrowArrayView* view, int64_t idx) { + if (is_null(view, idx)) { + return 
"__null__"; + } + + switch (view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + return extract_string(view, idx); + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_INT8: + return std::to_string(extract_int64(view, idx)); + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_UINT8: + return std::to_string(extract_uint64(view, idx)); + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_FLOAT: + return std::to_string(extract_double(view, idx)); + case NANOARROW_TYPE_BOOL: + return extract_int64(view, idx) ? "true" : "false"; + default: + return "__unsupported__"; + } +} + +ColumnType nanoarrow_to_column_type(ArrowType type) { + switch (type) { + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_INT8: + return ColumnType::INT64; + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_UINT8: + return ColumnType::UINT64; + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_FLOAT: + return ColumnType::DOUBLE; + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + return ColumnType::STRING; + case NANOARROW_TYPE_BOOL: + return ColumnType::BOOL; + default: + return ColumnType::STRING; + } +} + +uint64_t fnv1a_hash(const std::string& s) { + uint64_t hash = 14695981039346656037ULL; + for (char c : s) { + hash ^= static_cast(c); + hash *= 1099511628211ULL; + } + return hash; +} + +} // namespace + +PartitionRouter::~PartitionRouter() {} + +PartitionRouter::PartitionRouter(PartitionRouter&& other) noexcept + : output_dir_(std::move(other.output_dir_)), + config_(std::move(other.config_)), + chunk_size_bytes_(other.chunk_size_bytes_), + compression_(other.compression_), + is_open_(other.is_open_), + writers_(std::move(other.writers_)), + predicates_(std::move(other.predicates_)) { + other.is_open_ = 
false;
+}
+
+// Move assignment: takes over `other`'s configuration, writers and
+// predicates and marks `other` as closed. Self-assignment is a no-op.
+PartitionRouter& PartitionRouter::operator=(PartitionRouter&& other) noexcept {
+    if (this != &other) {
+        output_dir_ = std::move(other.output_dir_);
+        config_ = std::move(other.config_);
+        chunk_size_bytes_ = other.chunk_size_bytes_;
+        compression_ = other.compression_;
+        is_open_ = other.is_open_;
+        writers_ = std::move(other.writers_);
+        predicates_ = std::move(other.predicates_);
+        other.is_open_ = false;
+    }
+    return *this;
+}
+
+// Prepares the router to write under `output_dir`, creating the directory
+// if needed. Returns 0 on success and -1 when the router is already open or
+// the directory cannot be created. Cached writers and registered predicates
+// are cleared, so predicates must be registered after this call.
+int PartitionRouter::open(const std::string& output_dir,
+                          const PartitionConfig& config,
+                          int64_t chunk_size_bytes,
+                          IpcCompression compression) {
+    if (is_open_) return -1;
+
+    std::error_code ec;
+    fs::create_directories(output_dir, ec);
+    if (ec) return -1;
+
+    output_dir_ = output_dir;
+    config_ = config;
+    chunk_size_bytes_ = chunk_size_bytes;
+    compression_ = compression;
+    writers_.clear();
+    predicates_.clear();
+
+    is_open_ = true;
+    return 0;
+}
+
+// Registers (or replaces) the evaluator stored under `view_name`.
+void PartitionRouter::register_predicate(const std::string& view_name,
+                                         PredicateEvaluator evaluator) {
+    predicates_[view_name] = std::move(evaluator);
+}
+
+// Directory of a partition: `output_dir_` itself for the empty key,
+// otherwise `output_dir_ / partition_key`.
+std::string PartitionRouter::partition_path(
+    const std::string& partition_key) const {
+    if (partition_key.empty()) {
+        return output_dir_;
+    }
+    return (fs::path(output_dir_) / partition_key).string();
+}
+
+// Returns the cached writer for `partition_key`, opening a new one on first
+// use. Yields nullptr (and caches nothing) when the new writer fails to open.
+coro::CoroTask<PartitionWriter*> PartitionRouter::get_or_create_writer(
+    const std::string& partition_key) {
+    auto it = writers_.find(partition_key);
+    if (it != writers_.end()) {
+        co_return it->second.get();
+    }
+
+    auto writer = std::make_unique<PartitionWriter>();
+    std::string path = partition_path(partition_key);
+    if (co_await writer->open(path, chunk_size_bytes_, compression_) != 0) {
+        co_return nullptr;
+    }
+
+    PartitionWriter* ptr = writer.get();
+    writers_[partition_key] = std::move(writer);
+    co_return ptr;
+}
+
+// Maps a row's partition values to a bucket index: the values are joined
+// with a trailing NUL after each one (so {"ab","c"} and {"a","bc"} differ),
+// hashed with fnv1a_hash() and reduced modulo `config_.num_buckets`.
+int PartitionRouter::compute_bucket(
+    const std::vector<std::string>& values) const {
+    // A bucket count of zero would make the modulo below a division by zero
+    // and a negative one would turn into a huge divisor after the cast;
+    // send every row to bucket 0 in that case.
+    if (config_.num_buckets <= 0) return 0;
+
+    std::string combined;
+    for (const auto& v : values) {
+        combined += v;
+        combined += '\0';
+    }
+    return static_cast<int>(fnv1a_hash(combined) %
+                            static_cast<uint64_t>(config_.num_buckets));
+}
+
+// Unpartitioned mode: forwards the whole batch to the single writer keyed by
+// the empty partition key. Returns -1 when that writer cannot be obtained.
+coro::CoroTask<int> PartitionRouter::route_none(ArrowExportResult& batch) {
+    PartitionWriter* writer = co_await get_or_create_writer("");
+    if (!writer) co_return -1;
+    co_return co_await writer->write_batch(batch);
+}
+
+coro::CoroTask PartitionRouter::route_column(ArrowExportResult& batch) { + ArrowSchema* schema = batch.get_schema(); + ArrowArray* array = batch.get_array(); + int64_t num_rows = batch.num_rows(); + + if (num_rows == 0) co_return 0; + + ArrowArrayView view; + ArrowError error; + int rc = ArrowArrayViewInitFromSchema(&view, schema, &error); + if (rc != NANOARROW_OK) { + ArrowArrayViewReset(&view); + co_return rc; + } + rc = ArrowArrayViewSetArray(&view, array, &error); + if (rc != NANOARROW_OK) { + ArrowArrayViewReset(&view); + co_return rc; + } + + std::vector partition_col_indices; + for (const auto& col_name : config_.partition_columns) { + for (int64_t i = 0; i < schema->n_children; i++) { + if (schema->children[i]->name == col_name) { + partition_col_indices.push_back(i); + break; + } + } + } + + if (partition_col_indices.size() != config_.partition_columns.size()) { + ArrowArrayViewReset(&view); + co_return -1; + } + + std::unordered_map> partition_rows; + + for (int64_t row = 0; row < num_rows; row++) { + std::string partition_key; + for (size_t i = 0; i < partition_col_indices.size(); i++) { + int64_t col_idx = partition_col_indices[i]; + const ArrowArrayView* col_view = view.children[col_idx]; + std::string value = value_to_string(col_view, row); + + if (i > 0) partition_key += "/"; + partition_key += config_.partition_columns[i] + "=" + value; + } + partition_rows[partition_key].push_back(row); + } + + std::vector col_specs; + col_specs.reserve(schema->n_children); + for (int64_t i = 0; i < schema->n_children; i++) { + ArrowType type = view.children[i]->storage_type; + col_specs.push_back( + {schema->children[i]->name, nanoarrow_to_column_type(type)}); + } + + for
(auto& [partition_key, rows] : partition_rows) { + RecordBatchBuilder builder; + builder.declare_schema(col_specs); + builder.reserve(rows.size()); + + for (int64_t row : rows) { + for (int64_t col = 0; col < schema->n_children; col++) { + const ArrowArrayView* col_view = view.children[col]; + + if (is_null(col_view, row)) { + builder.append_null(col); + } else { + switch (col_view->storage_type) { + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_INT8: + builder.append_int64(col, + extract_int64(col_view, row)); + break; + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_UINT8: + builder.append_uint64( + col, extract_uint64(col_view, row)); + break; + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_FLOAT: + builder.append_double( + col, extract_double(col_view, row)); + break; + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + builder.append_string( + col, extract_string(col_view, row)); + break; + case NANOARROW_TYPE_BOOL: + builder.append_bool( + col, extract_int64(col_view, row) != 0); + break; + default: + builder.append_null(col); + break; + } + } + } + builder.end_row(); + } + + auto sub_batch = builder.finish(); + PartitionWriter* writer = co_await get_or_create_writer(partition_key); + if (!writer) { + ArrowArrayViewReset(&view); + co_return -1; + } + rc = co_await writer->write_batch(sub_batch); + if (rc != 0) { + ArrowArrayViewReset(&view); + co_return rc; + } + } + + ArrowArrayViewReset(&view); + co_return 0; +} + +coro::CoroTask PartitionRouter::route_bucketed(ArrowExportResult& batch) { + ArrowSchema* schema = batch.get_schema(); + ArrowArray* array = batch.get_array(); + int64_t num_rows = batch.num_rows(); + + if (num_rows == 0) co_return 0; + + ArrowArrayView view; + ArrowError error; + int rc = ArrowArrayViewInitFromSchema(&view, schema, &error); + if (rc != NANOARROW_OK) { + ArrowArrayViewReset(&view); + 
co_return rc; + } + rc = ArrowArrayViewSetArray(&view, array, &error); + if (rc != NANOARROW_OK) { + ArrowArrayViewReset(&view); + co_return rc; + } + + std::vector partition_col_indices; + for (const auto& col_name : config_.partition_columns) { + for (int64_t i = 0; i < schema->n_children; i++) { + if (schema->children[i]->name == col_name) { + partition_col_indices.push_back(i); + break; + } + } + } + + if (partition_col_indices.size() != config_.partition_columns.size()) { + ArrowArrayViewReset(&view); + co_return -1; + } + + std::unordered_map> bucket_rows; + + for (int64_t row = 0; row < num_rows; row++) { + std::vector values; + values.reserve(partition_col_indices.size()); + for (int64_t col_idx : partition_col_indices) { + values.push_back(value_to_string(view.children[col_idx], row)); + } + int bucket = compute_bucket(values); + bucket_rows[bucket].push_back(row); + } + + std::vector col_specs; + col_specs.reserve(schema->n_children); + for (int64_t i = 0; i < schema->n_children; i++) { + col_specs.push_back( + {schema->children[i]->name, + nanoarrow_to_column_type(view.children[i]->storage_type)}); + } + + auto bucket_key = [&](int bucket) { + std::ostringstream ss; + ss << config_.partition_columns[0] << "_bucket=" << std::setw(2) + << std::setfill('0') << bucket; + return ss.str(); + }; + + for (auto& [bucket, rows] : bucket_rows) { + RecordBatchBuilder builder; + builder.declare_schema(col_specs); + builder.reserve(rows.size()); + + for (int64_t row : rows) { + for (int64_t col = 0; col < schema->n_children; col++) { + const ArrowArrayView* col_view = view.children[col]; + + if (is_null(col_view, row)) { + builder.append_null(col); + } else { + switch (col_view->storage_type) { + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_INT8: + builder.append_int64(col, + extract_int64(col_view, row)); + break; + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_UINT16: + 
case NANOARROW_TYPE_UINT8: + builder.append_uint64( + col, extract_uint64(col_view, row)); + break; + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_FLOAT: + builder.append_double( + col, extract_double(col_view, row)); + break; + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + builder.append_string( + col, extract_string(col_view, row)); + break; + case NANOARROW_TYPE_BOOL: + builder.append_bool( + col, extract_int64(col_view, row) != 0); + break; + default: + builder.append_null(col); + break; + } + } + } + builder.end_row(); + } + + auto sub_batch = builder.finish(); + PartitionWriter* writer = + co_await get_or_create_writer(bucket_key(bucket)); + if (!writer) { + ArrowArrayViewReset(&view); + co_return -1; + } + rc = co_await writer->write_batch(sub_batch); + if (rc != 0) { + ArrowArrayViewReset(&view); + co_return rc; + } + } + + ArrowArrayViewReset(&view); + co_return 0; +} + +coro::CoroTask PartitionRouter::route_view(ArrowExportResult& batch) { + ArrowSchema* schema = batch.get_schema(); + ArrowArray* array = batch.get_array(); + int64_t num_rows = batch.num_rows(); + + if (num_rows == 0) co_return 0; + + ArrowArrayView view; + ArrowError error; + int rc = ArrowArrayViewInitFromSchema(&view, schema, &error); + if (rc != NANOARROW_OK) { + ArrowArrayViewReset(&view); + co_return rc; + } + rc = ArrowArrayViewSetArray(&view, array, &error); + if (rc != NANOARROW_OK) { + ArrowArrayViewReset(&view); + co_return rc; + } + + std::unordered_map col_name_to_idx; + for (int64_t i = 0; i < schema->n_children; i++) { + col_name_to_idx[schema->children[i]->name] = i; + } + + std::unordered_map> view_rows; + + for (int64_t row = 0; row < num_rows; row++) { + std::unordered_map row_values; + for (int64_t col = 0; col < schema->n_children; col++) { + row_values[schema->children[col]->name] = + value_to_string(view.children[col], row); + } + + std::string matched_view; + for (const auto& [view_name, predicate] : config_.views) { + if 
(!predicate.has_value()) { + if (matched_view.empty()) { + matched_view = view_name; + } + continue; + } + + auto it = predicates_.find(view_name); + if (it != predicates_.end() && it->second(row_values)) { + matched_view = view_name; + break; + } + } + + if (!matched_view.empty()) { + view_rows[matched_view].push_back(row); + } + } + + std::vector col_specs; + col_specs.reserve(schema->n_children); + for (int64_t i = 0; i < schema->n_children; i++) { + col_specs.push_back( + {schema->children[i]->name, + nanoarrow_to_column_type(view.children[i]->storage_type)}); + } + + for (auto& [view_name, rows] : view_rows) { + RecordBatchBuilder builder; + builder.declare_schema(col_specs); + builder.reserve(rows.size()); + + for (int64_t row : rows) { + for (int64_t col = 0; col < schema->n_children; col++) { + const ArrowArrayView* col_view = view.children[col]; + + if (is_null(col_view, row)) { + builder.append_null(col); + } else { + switch (col_view->storage_type) { + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_INT8: + builder.append_int64(col, + extract_int64(col_view, row)); + break; + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_UINT8: + builder.append_uint64( + col, extract_uint64(col_view, row)); + break; + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_FLOAT: + builder.append_double( + col, extract_double(col_view, row)); + break; + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + builder.append_string( + col, extract_string(col_view, row)); + break; + case NANOARROW_TYPE_BOOL: + builder.append_bool( + col, extract_int64(col_view, row) != 0); + break; + default: + builder.append_null(col); + break; + } + } + } + builder.end_row(); + } + + auto sub_batch = builder.finish(); + PartitionWriter* writer = co_await get_or_create_writer(view_name); + if (!writer) { + ArrowArrayViewReset(&view); + co_return -1; + } + rc = 
co_await writer->write_batch(sub_batch); + if (rc != 0) { + ArrowArrayViewReset(&view); + co_return rc; + } + } + + ArrowArrayViewReset(&view); + co_return 0; +} + +coro::CoroTask PartitionRouter::write_batch(ArrowExportResult& batch) { + if (!is_open_ || !batch.valid()) co_return -1; + + switch (config_.mode) { + case PartitionConfig::Mode::NONE: + co_return co_await route_none(batch); + case PartitionConfig::Mode::COLUMN: + co_return co_await route_column(batch); + case PartitionConfig::Mode::BUCKETED: + co_return co_await route_bucketed(batch); + case PartitionConfig::Mode::VIEW: + co_return co_await route_view(batch); + } + co_return -1; +} + +coro::CoroTask PartitionRouter::close() { + RouterWriteStats stats; + + if (!is_open_) co_return stats; + + for (auto& [partition_key, writer] : writers_) { + auto partition_stats = co_await writer->close(); + stats.partitions[partition_key] = std::move(partition_stats); + stats.total_rows += stats.partitions[partition_key].total_rows; + stats.total_uncompressed_bytes += + stats.partitions[partition_key].total_uncompressed_bytes; + } + + writers_.clear(); + predicates_.clear(); + is_open_ = false; + + co_return stats; +} + +} // namespace dftracer::utils::utilities::common::arrow + +#endif // DFTRACER_UTILS_ENABLE_ARROW_IPC diff --git a/src/dftracer/utils/utilities/common/arrow/partition_writer.cpp b/src/dftracer/utils/utilities/common/arrow/partition_writer.cpp new file mode 100644 index 00000000..d642925a --- /dev/null +++ b/src/dftracer/utils/utilities/common/arrow/partition_writer.cpp @@ -0,0 +1,207 @@ +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + +#include +#include +#include + +#include +#include +#include + +namespace dftracer::utils::utilities::common::arrow { + +PartitionWriter::~PartitionWriter() {} + +PartitionWriter::PartitionWriter(PartitionWriter&& other) noexcept + : output_dir_(std::move(other.output_dir_)), + chunk_size_bytes_(other.chunk_size_bytes_), + compression_(other.compression_), + 
writer_(std::move(other.writer_)), + is_open_(other.is_open_), + file_index_(other.file_index_), + current_file_bytes_(other.current_file_bytes_), + current_file_rows_(other.current_file_rows_), + total_bytes_(other.total_bytes_), + total_rows_(other.total_rows_), + files_(std::move(other.files_)), + row_counts_(std::move(other.row_counts_)) { + other.is_open_ = false; + other.file_index_ = 0; + other.current_file_bytes_ = 0; + other.current_file_rows_ = 0; + other.total_bytes_ = 0; + other.total_rows_ = 0; +} + +PartitionWriter& PartitionWriter::operator=(PartitionWriter&& other) noexcept { + if (this != &other) { + output_dir_ = std::move(other.output_dir_); + chunk_size_bytes_ = other.chunk_size_bytes_; + compression_ = other.compression_; + writer_ = std::move(other.writer_); + is_open_ = other.is_open_; + file_index_ = other.file_index_; + current_file_bytes_ = other.current_file_bytes_; + current_file_rows_ = other.current_file_rows_; + total_bytes_ = other.total_bytes_; + total_rows_ = other.total_rows_; + files_ = std::move(other.files_); + row_counts_ = std::move(other.row_counts_); + + other.is_open_ = false; + other.file_index_ = 0; + other.current_file_bytes_ = 0; + other.current_file_rows_ = 0; + other.total_bytes_ = 0; + other.total_rows_ = 0; + } + return *this; +} + +std::string PartitionWriter::generate_filename() const { + std::ostringstream ss; + ss << "part-" << std::setw(5) << std::setfill('0') << file_index_ + << ".arrow"; + return (fs::path(output_dir_) / ss.str()).string(); +} + +coro::CoroTask PartitionWriter::open(const std::string& output_dir, + int64_t chunk_size_bytes, + IpcCompression compression) { + if (is_open_) co_return -1; + + std::error_code ec; + fs::create_directories(output_dir, ec); + if (ec) co_return -1; + + output_dir_ = output_dir; + chunk_size_bytes_ = chunk_size_bytes; + compression_ = compression; + file_index_ = 0; + current_file_bytes_ = 0; + current_file_rows_ = 0; + total_bytes_ = 0; + total_rows_ = 0; + 
files_.clear(); + row_counts_.clear(); + + std::string path = generate_filename(); + int rc = co_await writer_.open(path, compression_); + if (rc != 0) co_return rc; + + is_open_ = true; + co_return 0; +} + +int64_t PartitionWriter::calculate_uncompressed_size(ArrowExportResult& batch) { + ArrowSchema* schema = batch.get_schema(); + ArrowArray* array = batch.get_array(); + + ArrowArrayView view; + ArrowError error; + int rc = ArrowArrayViewInitFromSchema(&view, schema, &error); + if (rc != NANOARROW_OK) { + ArrowArrayViewReset(&view); + return 0; + } + + rc = ArrowArrayViewSetArray(&view, array, &error); + if (rc != NANOARROW_OK) { + ArrowArrayViewReset(&view); + return 0; + } + + // Calculate total buffer size recursively + int64_t total = 0; + struct BufferCounter { + static void count(ArrowArrayView* v, int64_t& total, bool is_root) { + if (!is_root) { + int64_t num_buffers = ArrowArrayViewGetNumBuffers(v); + for (int64_t i = 0; i < num_buffers; i++) { + ArrowBufferView buf = ArrowArrayViewGetBufferView(v, i); + total += buf.size_bytes; + } + } + for (int64_t i = 0; i < v->n_children; i++) { + count(v->children[i], total, false); + } + } + }; + BufferCounter::count(&view, total, true); + + ArrowArrayViewReset(&view); + return total; +} + +coro::CoroTask PartitionWriter::rotate_file() { + co_await writer_.close(); + + files_.push_back(generate_filename()); + row_counts_.push_back(current_file_rows_); + + file_index_++; + current_file_bytes_ = 0; + current_file_rows_ = 0; + + std::string path = generate_filename(); + co_return co_await writer_.open(path, compression_); +} + +coro::CoroTask PartitionWriter::write_batch(ArrowExportResult& batch) { + if (!is_open_ || !batch.valid()) co_return -1; + + int64_t batch_size = calculate_uncompressed_size(batch); + int64_t batch_rows = batch.num_rows(); + + // Check if we need to rotate before writing + // (only rotate if we've written something and adding this batch exceeds + // limit) + if (chunk_size_bytes_ > 0 && 
current_file_bytes_ > 0 && + current_file_bytes_ + batch_size > chunk_size_bytes_) { + int rc = co_await rotate_file(); + if (rc != 0) co_return rc; + } + + int rc = co_await writer_.write_batch(batch); + if (rc != 0) co_return rc; + + current_file_bytes_ += batch_size; + current_file_rows_ += batch_rows; + total_bytes_ += batch_size; + total_rows_ += batch_rows; + + co_return 0; +} + +coro::CoroTask PartitionWriter::close() { + PartitionWriteStats stats; + + if (!is_open_) co_return stats; + + co_await writer_.close(); + + // Record final file stats (only if rows were written) + if (current_file_rows_ > 0) { + files_.push_back(generate_filename()); + row_counts_.push_back(current_file_rows_); + } + + stats.files = std::move(files_); + stats.row_counts = std::move(row_counts_); + stats.total_rows = total_rows_; + stats.total_uncompressed_bytes = total_bytes_; + + is_open_ = false; + file_index_ = 0; + current_file_bytes_ = 0; + current_file_rows_ = 0; + total_bytes_ = 0; + total_rows_ = 0; + + co_return stats; +} + +} // namespace dftracer::utils::utilities::common::arrow + +#endif // DFTRACER_UTILS_ENABLE_ARROW_IPC diff --git a/src/dftracer/utils/utilities/common/json/json_value.cpp b/src/dftracer/utils/utilities/common/json/json_value.cpp index 989884ea..acfccaf8 100644 --- a/src/dftracer/utils/utilities/common/json/json_value.cpp +++ b/src/dftracer/utils/utilities/common/json/json_value.cpp @@ -7,9 +7,9 @@ namespace dftracer::utils::utilities::common::json { JsonValue JsonValue::at(const char* path) const { - if (!val_ || !path) return JsonValue(nullptr); + if (!valid_ || !path) return JsonValue(); - JsonValue current(val_); + JsonValue current = *this; const char* start = path; while (*start) { @@ -22,18 +22,11 @@ JsonValue JsonValue::at(const char* path) const { continue; } - char key_buf[256]; - if (key_len >= sizeof(key_buf)) { - std::string key_str(start, key_len); - current = current[key_str.c_str()]; - } else { - std::memcpy(key_buf, start, key_len); - 
key_buf[key_len] = '\0'; - current = current[key_buf]; - } + std::string_view key_sv(start, key_len); + current = current[key_sv]; if (!current.exists()) { - return JsonValue(nullptr); + return JsonValue(); } start = (*end == '.') ? end + 1 : end; @@ -76,23 +69,14 @@ coro::CoroTask StringJsonParserUtility::process( const StringJsonParserInput& input) { content_ = input.content; - yyjson_doc* doc = - yyjson_read(content_.content.data(), content_.content.size(), 0); - - yyjson_val* json_object = nullptr; - if (doc) { - json_object = yyjson_doc_get_root(doc); - owned_doc_ = std::shared_ptr(doc, [](yyjson_doc* d) { - if (d) yyjson_doc_free(d); - }); + auto result = + parser_.parse(content_.content.data(), content_.content.size()); + if (result.error()) { + co_return JsonValue(); } - - co_return JsonValue(json_object); + co_return JsonValue(result.value_unsafe()); } -void StringJsonParserUtility::reset() { - owned_doc_.reset(); - content_ = utilities::text::Text{}; -} +void StringJsonParserUtility::reset() { content_ = utilities::text::Text{}; } } // namespace dftracer::utils::utilities::common::json diff --git a/src/dftracer/utils/utilities/common/json/parser.cpp b/src/dftracer/utils/utilities/common/json/parser.cpp new file mode 100644 index 00000000..1f583f72 --- /dev/null +++ b/src/dftracer/utils/utilities/common/json/parser.cpp @@ -0,0 +1,73 @@ +#include + +namespace dftracer::utils::utilities::common::json { + +JsonParser::JsonParser(std::size_t capacity) : parser_(capacity) {} + +bool JsonParser::parse(std::string_view json_line) { + padded_json_ = simdjson::padded_string(json_line); + auto result = parser_.iterate(padded_json_); + if (result.error()) { + valid_ = false; + return false; + } + doc_ = std::move(result.value()); + active_ = simdjson::ondemand::document_reference(doc_); + valid_ = true; + return true; +} + +bool JsonParser::parse_padded(simdjson::padded_string_view json) { + auto result = parser_.iterate(json); + if (result.error()) { + valid_ = 
false; + return false; + } + doc_ = std::move(result.value()); + active_ = simdjson::ondemand::document_reference(doc_); + valid_ = true; + return true; +} + +void JsonParser::rewind() { + if (valid_) { + active_.rewind(); + } +} + +std::optional JsonParser::get_int64(std::string_view key) { + if (!valid_) return std::nullopt; + auto result = active_[key].get_int64(); + if (result.error()) return std::nullopt; + return result.value(); +} + +std::optional JsonParser::get_uint64(std::string_view key) { + if (!valid_) return std::nullopt; + auto result = active_[key].get_uint64(); + if (result.error()) return std::nullopt; + return result.value(); +} + +std::optional JsonParser::get_double(std::string_view key) { + if (!valid_) return std::nullopt; + auto result = active_[key].get_double(); + if (result.error()) return std::nullopt; + return result.value(); +} + +std::optional JsonParser::get_bool(std::string_view key) { + if (!valid_) return std::nullopt; + auto result = active_[key].get_bool(); + if (result.error()) return std::nullopt; + return result.value(); +} + +std::optional JsonParser::get_string(std::string_view key) { + if (!valid_) return std::nullopt; + auto result = active_[key].get_string(); + if (result.error()) return std::nullopt; + return result.value(); +} + +} // namespace dftracer::utils::utilities::common::json diff --git a/src/dftracer/utils/utilities/common/query/ast.cpp b/src/dftracer/utils/utilities/common/query/ast.cpp index 3ee47932..7c5914f1 100644 --- a/src/dftracer/utils/utilities/common/query/ast.cpp +++ b/src/dftracer/utils/utilities/common/query/ast.cpp @@ -87,6 +87,30 @@ void node_to_string(std::ostringstream& os, const QueryNode& node) { node.data); } +void collect_fields_impl(const QueryNode& node, + dftracer::utils::StringViewSet& out) { + std::visit( + [&out](auto&& n) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + out.insert(n.field.path); + } else if constexpr (std::is_same_v) { + out.insert(n.field.path); + } 
else if constexpr (std::is_same_v) { + out.insert(n.field.path); + } else if constexpr (std::is_same_v) { + collect_fields_impl(*n.left, out); + collect_fields_impl(*n.right, out); + } else if constexpr (std::is_same_v) { + collect_fields_impl(*n.left, out); + collect_fields_impl(*n.right, out); + } else if constexpr (std::is_same_v) { + collect_fields_impl(*n.operand, out); + } + }, + node.data); +} + } // namespace std::string to_string(const QueryNode& node) { @@ -95,4 +119,10 @@ std::string to_string(const QueryNode& node) { return os.str(); } +dftracer::utils::StringViewSet collect_fields(const QueryNode& node) { + dftracer::utils::StringViewSet fields; + collect_fields_impl(node, fields); + return fields; +} + } // namespace dftracer::utils::utilities::common::query diff --git a/src/dftracer/utils/utilities/common/query/query.cpp b/src/dftracer/utils/utilities/common/query/query.cpp index b3fac100..3db5b005 100644 --- a/src/dftracer/utils/utilities/common/query/query.cpp +++ b/src/dftracer/utils/utilities/common/query/query.cpp @@ -2,7 +2,8 @@ namespace dftracer::utils::utilities::common::query { -Query::Query(const Query& other) : source_(other.source_) { +Query::Query(const Query& other) + : source_(other.source_), fields_(other.fields_) { auto result = parse(source_); if (!result) throw QueryParseError(result.error()); root_ = std::move(*result); @@ -11,6 +12,7 @@ Query::Query(const Query& other) : source_(other.source_) { Query& Query::operator=(const Query& other) { if (this != &other) { source_ = other.source_; + fields_ = other.fields_; auto result = parse(source_); if (!result) throw QueryParseError(result.error()); root_ = std::move(*result); diff --git a/src/dftracer/utils/utilities/common/statistics/log2_histogram.cpp b/src/dftracer/utils/utilities/common/statistics/log2_histogram.cpp index d1ae12f4..1a731e1e 100644 --- a/src/dftracer/utils/utilities/common/statistics/log2_histogram.cpp +++ 
b/src/dftracer/utils/utilities/common/statistics/log2_histogram.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include @@ -197,62 +197,51 @@ std::string Log2Histogram::render_blocks(std::size_t max_width, return out.str(); } -yyjson_mut_val* Log2Histogram::to_yyjson(yyjson_mut_doc* doc) const { - yyjson_mut_val* arr = yyjson_mut_arr(doc); +std::string Log2Histogram::to_json() const { + std::ostringstream ss; + ss << '['; + bool first = true; for (std::size_t i = 0; i < NUM_BINS; ++i) { if (bins_[i] == 0) continue; - yyjson_mut_val* pair = yyjson_mut_arr(doc); - yyjson_mut_arr_add_uint(doc, pair, static_cast(i)); - yyjson_mut_arr_add_uint(doc, pair, bins_[i]); - yyjson_mut_arr_append(arr, pair); + if (!first) ss << ','; + first = false; + ss << '[' << i << ',' << bins_[i] << ']'; } - return arr; -} - -std::string Log2Histogram::to_json() const { - yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr); - yyjson_mut_val* arr = to_yyjson(doc); - yyjson_mut_doc_set_root(doc, arr); - - char* json_str = yyjson_mut_write(doc, YYJSON_WRITE_NOFLAG, nullptr); - std::string result(json_str ? 
json_str : "[]"); - if (json_str) free(json_str); - yyjson_mut_doc_free(doc); - return result; + ss << ']'; + return ss.str(); } Log2Histogram Log2Histogram::from_json(const std::string& json) { Log2Histogram hist; - yyjson_doc* doc = - yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG); - if (!doc) return hist; + simdjson::dom::parser parser; + auto result = parser.parse(json.data(), json.size()); + if (result.error()) return hist; - yyjson_val* root = yyjson_doc_get_root(doc); - if (!root || !yyjson_is_arr(root)) { - yyjson_doc_free(doc); - return hist; - } + auto root = result.value_unsafe(); + if (!root.is_array()) return hist; + + simdjson::dom::array root_arr; + if (root.get(root_arr)) return hist; - std::size_t idx, max; - yyjson_val* pair; - yyjson_arr_foreach(root, idx, max, pair) { - if (!yyjson_is_arr(pair) || yyjson_arr_size(pair) != 2) continue; - yyjson_val* bin_idx_val = yyjson_arr_get(pair, 0); - yyjson_val* count_val = yyjson_arr_get(pair, 1); - if (!yyjson_is_uint(bin_idx_val) || !yyjson_is_uint(count_val)) - continue; + for (auto pair : root_arr) { + if (!pair.is_array()) continue; + simdjson::dom::array arr; + if (pair.get(arr)) continue; + if (arr.size() != 2) continue; + + auto bin_idx_result = arr.at(0).get_uint64(); + auto count_result = arr.at(1).get_uint64(); + if (bin_idx_result.error() || count_result.error()) continue; std::size_t bin_idx = - static_cast(yyjson_get_uint(bin_idx_val)); - std::uint64_t count = yyjson_get_uint(count_val); + static_cast(bin_idx_result.value_unsafe()); + std::uint64_t count = count_result.value_unsafe(); if (bin_idx < NUM_BINS) { hist.bins_[bin_idx] += count; hist.total_count_ += count; } } - - yyjson_doc_free(doc); return hist; } diff --git a/src/dftracer/utils/utilities/common/statistics/timestamp_histogram.cpp b/src/dftracer/utils/utilities/common/statistics/timestamp_histogram.cpp new file mode 100644 index 00000000..487af91f --- /dev/null +++ 
b/src/dftracer/utils/utilities/common/statistics/timestamp_histogram.cpp
@@ -0,0 +1,198 @@
+#include
+
+#include
+#include
+
+namespace dftracer::utils::utilities::common::statistics {
+
+// Records one event: bumps the count of the bin that `timestamp_us` falls in,
+// inserting a new bin at its sorted position when none exists yet.
+void TimestampHistogram::add(std::uint64_t timestamp_us) {
+    std::uint64_t idx = bin_index(timestamp_us);
+    total_count_++;
+
+    auto it = std::lower_bound(
+        bins_.begin(), bins_.end(), idx,
+        [](const auto& p, std::uint64_t val) { return p.first < val; });
+
+    if (it != bins_.end() && it->first == idx) {
+        it->second++;
+    } else {
+        bins_.insert(it, {idx, 1});
+    }
+}
+
+// Folds `other` into this histogram: both bin lists are walked in index
+// order and the counts of bins present in both are added together.
+void TimestampHistogram::merge(const TimestampHistogram& other) {
+    if (other.bins_.empty()) return;
+
+    // NOTE(review): element type restored from usage; confirm it matches bins_.
+    std::vector<std::pair<std::uint64_t, std::uint64_t>> merged;
+    merged.reserve(bins_.size() + other.bins_.size());
+
+    auto a = bins_.begin();
+    auto b = other.bins_.begin();
+
+    while (a != bins_.end() && b != other.bins_.end()) {
+        if (a->first < b->first) {
+            merged.push_back(*a++);
+        } else if (a->first > b->first) {
+            merged.push_back(*b++);
+        } else {
+            merged.push_back({a->first, a->second + b->second});
+            ++a;
+            ++b;
+        }
+    }
+    while (a != bins_.end()) merged.push_back(*a++);
+    while (b != other.bins_.end()) merged.push_back(*b++);
+
+    bins_ = std::move(merged);
+    total_count_ += other.total_count_;
+}
+
+// Sums the counts of all bins from the bin of `ts_start_us` through the bin
+// of `ts_end_us - 1`. Returns 0 for an empty histogram or when
+// `ts_start_us >= ts_end_us`.
+std::uint64_t TimestampHistogram::count_in_range(
+    std::uint64_t ts_start_us, std::uint64_t ts_end_us) const {
+    if (bins_.empty() || ts_start_us >= ts_end_us) return 0;
+
+    std::uint64_t start_bin = bin_index(ts_start_us);
+    std::uint64_t end_bin = bin_index(ts_end_us - 1);
+
+    auto it = std::lower_bound(
+        bins_.begin(), bins_.end(), start_bin,
+        [](const auto& p, std::uint64_t val) { return p.first < val; });
+
+    std::uint64_t count = 0;
+    for (; it != bins_.end() && it->first <= end_bin; ++it) {
+        count += it->second;
+    }
+    return count;
+}
+
+// Share of all recorded events that count_in_range() reports for the range;
+// 0.0 when nothing has been recorded.
+double TimestampHistogram::selectivity(std::uint64_t ts_start_us,
+                                       std::uint64_t ts_end_us) const {
+    if (total_count_ == 0) return 0.0;
+    return static_cast<double>(count_in_range(ts_start_us, ts_end_us)) /
+           static_cast<double>(total_count_);
+}
+
+// Cuts [bucket_start_us, bucket_end_us) into `num_sub_buckets` slices of
+// equal width (at least 1; the last slice runs to `bucket_end_us`) and
+// returns each slice's fraction of the events counted across the slices.
+// With no events counted, every slice gets the uniform weight 1/N.
+std::vector<double> TimestampHistogram::expansion_weights(
+    std::uint64_t bucket_start_us, std::uint64_t bucket_end_us,
+    std::size_t num_sub_buckets) const {
+    std::vector<double> weights(num_sub_buckets, 0.0);
+    if (num_sub_buckets == 0 || bucket_start_us >= bucket_end_us)
+        return weights;
+
+    std::uint64_t sub_width =
+        (bucket_end_us - bucket_start_us) / num_sub_buckets;
+    if (sub_width == 0) sub_width = 1;
+
+    std::uint64_t total_in_range = 0;
+    for (std::size_t i = 0; i < num_sub_buckets; ++i) {
+        std::uint64_t sub_start = bucket_start_us + i * sub_width;
+        std::uint64_t sub_end = (i + 1 < num_sub_buckets)
+                                    ? bucket_start_us + (i + 1) * sub_width
+                                    : bucket_end_us;
+        std::uint64_t c = count_in_range(sub_start, sub_end);
+        weights[i] = static_cast<double>(c);
+        total_in_range += c;
+    }
+
+    if (total_in_range > 0) {
+        double inv = 1.0 / static_cast<double>(total_in_range);
+        for (auto& w : weights) w *= inv;
+    } else {
+        double uniform = 1.0 / static_cast<double>(num_sub_buckets);
+        for (auto& w : weights) w = uniform;
+    }
+
+    return weights;
+}
+
+// Varint encoding helpers
+namespace {
+
+// Appends `value` to `out` in 7-bit groups, least-significant group first;
+// every byte except the last has its high bit set.
+void encode_varint(std::vector<std::uint8_t>& out, std::uint64_t value) {
+    while (value >= 0x80) {
+        out.push_back(static_cast<std::uint8_t>(value | 0x80));
+        value >>= 7;
+    }
+    out.push_back(static_cast<std::uint8_t>(value));
+}
+
+// Reads one varint written by encode_varint() from [ptr, end), advancing
+// `ptr` past every byte consumed. If the input ends before a byte with a
+// clear high bit is seen, the bits gathered so far are returned.
+std::uint64_t decode_varint(const std::uint8_t*& ptr, const std::uint8_t* end) {
+    std::uint64_t result = 0;
+    unsigned shift = 0;
+    while (ptr < end) {
+        std::uint8_t byte = *ptr++;
+        // Only ten 7-bit groups fit in 64 bits, and shifting by 64 or more
+        // is undefined behaviour: extra groups are consumed but ignored.
+        if (shift < 64) {
+            result |= static_cast<std::uint64_t>(byte & 0x7F) << shift;
+            shift += 7;
+        }
+        if ((byte & 0x80) == 0) return result;
+    }
+    return result;
+}
+
+}  // namespace
+
+// Encodes the histogram as varints: total count, number of bins, then for
+// each bin the index delta from the previous bin followed by its count.
+std::vector<std::uint8_t> TimestampHistogram::serialize() const {
+    std::vector<std::uint8_t> out;
+    out.reserve(bins_.size() * 6 + 16);
+
+    encode_varint(out, total_count_);
+    encode_varint(out, bins_.size());
+
+    std::uint64_t prev_idx = 0;
+    for (const auto& [idx, count] : bins_) {
+        encode_varint(out, idx - prev_idx);
+        encode_varint(out, count);
+        prev_idx = idx;
+    }
+
+    return out;
+}
+ +TimestampHistogram TimestampHistogram::deserialize(const std::uint8_t* data, + std::size_t len) { + TimestampHistogram hist; + if (!data || len == 0) return hist; + + const auto* ptr = data; + const auto* end = data + len; + + hist.total_count_ = decode_varint(ptr, end); + std::uint64_t num_bins = decode_varint(ptr, end); + + hist.bins_.reserve(static_cast(num_bins)); + std::uint64_t prev_idx = 0; + for (std::uint64_t i = 0; i < num_bins && ptr < end; ++i) { + std::uint64_t delta = decode_varint(ptr, end); + std::uint64_t count = decode_varint(ptr, end); + prev_idx += delta; + hist.bins_.push_back({prev_idx, count}); + } + + return hist; +} + +} // namespace dftracer::utils::utilities::common::statistics diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.cpp new file mode 100644 index 00000000..56325907 --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.cpp @@ -0,0 +1,281 @@ +#include +#include + +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +namespace { + +using hash::Fnv1aHashBuilder; + +// Key for grouping entries during shrinking (merge) +struct MergeKey { + std::uint32_t cat_id; + std::uint32_t name_id; + std::uint64_t pid; + std::uint64_t tid; + std::uint32_t hhash_id; + std::uint32_t fhash_id; + std::uint64_t target_bucket; // computed from source bucket + + bool operator==(const MergeKey& other) const { + return cat_id == other.cat_id && name_id == other.name_id && + pid == other.pid && tid == other.tid && + hhash_id == other.hhash_id && fhash_id == other.fhash_id && + target_bucket == other.target_bucket; + } +}; + +struct MergeKeyHash { + std::size_t operator()(const MergeKey& k) const { + Fnv1aHashBuilder h; + h.update_value(k.cat_id); + h.update_value(k.name_id); + h.update_value(k.pid); + h.update_value(k.tid); + 
h.update_value(k.hhash_id); + h.update_value(k.fhash_id); + h.update_value(k.target_bucket); + return static_cast(h.finish()); + } +}; + +// Shrink: merge multiple source buckets into one target bucket +AggregationBatch shrink_batch(const AggregationBatch& input, + std::uint64_t source_interval_us, + std::uint64_t target_interval_us) { + AggregationBatch result; + result.batch_type = input.batch_type; + result.total_events_processed = input.total_events_processed; + result.total_files_processed = input.total_files_processed; + result.total_bytes_processed = input.total_bytes_processed; + result.has_approximated_entries = false; + result.global_extra_key_ids = input.global_extra_key_ids; + result.global_custom_metric_names = input.global_custom_metric_names; + + // Group entries by MergeKey + std::unordered_map merged; + + for (const auto& entry : input.entries) { + const auto& key = entry.key; + const auto& metrics = entry.metrics; + + // Compute target bucket from source bucket + std::uint64_t source_time = key.time_bucket * source_interval_us; + std::uint64_t target_bucket = source_time / target_interval_us; + + MergeKey mk{key.cat_id, key.name_id, key.pid, key.tid, + key.hhash_id, key.fhash_id, target_bucket}; + + auto it = merged.find(mk); + if (it == merged.end()) { + AggregationEntry new_entry; + new_entry.key = key; + new_entry.key.time_bucket = target_bucket; + new_entry.metrics = metrics; + new_entry.is_approximated = false; + merged.emplace(mk, std::move(new_entry)); + } else { + // Merge metrics + it->second.metrics.merge_from(metrics); + } + } + + result.entries.reserve(merged.size()); + for (auto& [_, entry] : merged) { + result.entries.push_back(std::move(entry)); + } + + return result; +} + +// Expand: split one source bucket into multiple target buckets +AggregationBatch expand_batch(const AggregationBatch& input, + std::uint64_t source_interval_us, + std::uint64_t target_interval_us) { + AggregationBatch result; + result.batch_type = 
input.batch_type; + result.total_events_processed = input.total_events_processed; + result.total_files_processed = input.total_files_processed; + result.total_bytes_processed = input.total_bytes_processed; + result.has_approximated_entries = true; + result.global_extra_key_ids = input.global_extra_key_ids; + result.global_custom_metric_names = input.global_custom_metric_names; + + for (const auto& entry : input.entries) { + const auto& key = entry.key; + const auto& metrics = entry.metrics; + + // Source bucket boundaries + std::uint64_t bucket_start = key.time_bucket * source_interval_us; + std::uint64_t bucket_end = bucket_start + source_interval_us; + + // Actual event span from ts/te + std::uint64_t ts = metrics.ts; + std::uint64_t te = metrics.te; + + // Clamp ts/te to bucket boundaries + ts = std::max(ts, bucket_start); + te = std::min(te, bucket_end); + + // Handle edge case: ts >= te (all events at same instant or invalid) + if (ts >= te) { + // Use original ts to determine target bucket + std::uint64_t original_ts = metrics.ts; + if (original_ts < bucket_start) original_ts = bucket_start; + if (original_ts >= bucket_end) original_ts = bucket_end - 1; + + std::uint64_t target_bucket = original_ts / target_interval_us; + + AggregationEntry new_entry; + new_entry.key = key; + new_entry.key.time_bucket = target_bucket; + new_entry.is_approximated = true; + new_entry.metrics = metrics; + new_entry.count_ci = + compute_poisson_ci(static_cast(metrics.count)); + + result.entries.push_back(std::move(new_entry)); + continue; + } + + std::uint64_t span = te - ts; + + // Compute first and last target buckets that overlap with [ts, te] + std::uint64_t first_target = ts / target_interval_us; + std::uint64_t last_target = (te - 1) / target_interval_us; + + // Distribute across overlapping sub-buckets + double total_weight = 0.0; + std::vector> bucket_weights; + + for (std::uint64_t tb = first_target; tb <= last_target; ++tb) { + std::uint64_t tb_start = tb * 
target_interval_us; + std::uint64_t tb_end = tb_start + target_interval_us; + + // Overlap with [ts, te] + std::uint64_t overlap_start = std::max(ts, tb_start); + std::uint64_t overlap_end = std::min(te, tb_end); + + if (overlap_start < overlap_end) { + double weight = + static_cast(overlap_end - overlap_start) / span; + bucket_weights.emplace_back(tb, weight); + total_weight += weight; + } + } + + // Normalize weights (should sum to ~1.0) + if (total_weight > 0.0) { + for (auto& [_, w] : bucket_weights) { + w /= total_weight; + } + } + + // Create sub-bucket entries + double count = static_cast(metrics.count); + std::uint64_t count_sum = 0; + + for (std::size_t i = 0; i < bucket_weights.size(); ++i) { + auto& [tb, weight] = bucket_weights[i]; + + AggregationEntry new_entry; + new_entry.key = key; + new_entry.key.time_bucket = tb; + new_entry.is_approximated = true; + + // Distribute count by weight + double sub_count = count * weight; + + // For the last bucket, adjust to ensure sum equals original + std::uint64_t sub_count_int; + if (i == bucket_weights.size() - 1) { + sub_count_int = metrics.count - count_sum; + } else { + sub_count_int = + static_cast(std::round(sub_count)); + count_sum += sub_count_int; + } + + // Create metrics for sub-bucket + new_entry.metrics = metrics; // Copy all fields + new_entry.metrics.count = sub_count_int; + + // Scale duration total by weight + new_entry.metrics.duration.total = static_cast( + std::round(metrics.duration.total * weight)); + new_entry.metrics.duration.count = sub_count_int; + + // Scale size total by weight + new_entry.metrics.size.total = static_cast( + std::round(metrics.size.total * weight)); + new_entry.metrics.size.count = sub_count_int; + + // Keep min/max conservative (can't know which sub-bucket had them) + // mean stays the same + // variance stays the same (conservative) + + // Compute confidence interval + new_entry.count_ci = compute_poisson_ci(sub_count); + + // Scale custom metrics by weight + if 
(new_entry.metrics.custom_metrics) { + for (auto& [name, stat] : *new_entry.metrics.custom_metrics) { + stat.total = static_cast( + std::round(stat.total * weight)); + stat.count = sub_count_int; + } + } + + // Only add if count > 0 + if (sub_count_int > 0) { + result.entries.push_back(std::move(new_entry)); + } + } + } + + return result; +} + +} // namespace + +AggregationBatch augment_batch(const AggregationBatch& input, + const AugmentationConfig& config) { + // Pass through if intervals match + if (config.source_interval_us == config.target_interval_us) { + AggregationBatch result; + result.batch_type = input.batch_type; + result.total_events_processed = input.total_events_processed; + result.total_files_processed = input.total_files_processed; + result.total_bytes_processed = input.total_bytes_processed; + result.has_approximated_entries = false; + result.global_extra_key_ids = input.global_extra_key_ids; + result.global_custom_metric_names = input.global_custom_metric_names; + + result.entries.reserve(input.entries.size()); + for (const auto& entry : input.entries) { + AggregationEntry new_entry; + new_entry.key = entry.key; + new_entry.metrics = entry.metrics; + new_entry.is_approximated = false; + result.entries.push_back(std::move(new_entry)); + } + + return result; + } + + // Shrink: target > source (fewer, larger buckets) + if (config.target_interval_us > config.source_interval_us) { + return shrink_batch(input, config.source_interval_us, + config.target_interval_us); + } + + // Expand: target < source (more, smaller buckets) + return expand_batch(input, config.source_interval_us, + config.target_interval_us); +} + +} // namespace dftracer::utils::utilities::composites::dft::aggregators diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.cpp new file mode 100644 index 00000000..66642760 --- /dev/null +++ 
b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.cpp @@ -0,0 +1,212 @@ +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +namespace { + +void apply_preaggregated_metric(MetricStats& stats, std::uint64_t ev_count, + const ArgsValueProxy& sum_val, + const ArgsValueProxy& min_val, + const ArgsValueProxy& max_val) { + if (!sum_val.exists()) return; + + const auto total = sum_val.get(); + stats.count += ev_count; + stats.total += total; + if (min_val.exists()) { + stats.min = std::min(stats.min, min_val.get()); + } + if (max_val.exists()) { + stats.max = std::max(stats.max, max_val.get()); + } + + if (stats.count > 0) { + stats.mean = + static_cast(stats.total) / static_cast(stats.count); + stats.m2 = 0.0; + } +} + +} // namespace + +std::uint64_t compute_time_bucket(std::uint64_t timestamp, + std::uint64_t duration, + const AggregationConfig& config) { + std::uint64_t midpoint = timestamp + (duration / 2); + + if (config.use_relative_time) { + midpoint -= config.reference_timestamp; + } + if (config.time_interval_us == 0) return midpoint; + return (midpoint / config.time_interval_us) * config.time_interval_us; +} + +AggregationKey build_aggregation_key(const DFTracerEvent& ev, + const AggregationConfig& config) { + auto& intern = aggregation_intern(); + + AggregationKey key; + key.cat_id = intern.get_or_insert(ev.cat); + key.name_id = intern.get_or_insert(ev.name); + key.pid = ev.pid; + key.tid = ev.tid; + + auto hhash_sv = ev.args["hhash"].get(); + if (!hhash_sv.empty()) { + key.hhash_id = intern.get_or_insert(hhash_sv); + } + auto fhash_sv = ev.args["fhash"].get(); + if (!fhash_sv.empty()) { + key.fhash_id = intern.get_or_insert(fhash_sv); + } + + key.time_bucket = compute_time_bucket(ev.ts, ev.dur, config); + + if (!config.extra_group_keys.empty()) { + key.extra_keys = std::make_unique< + std::vector>>(); + for (const auto& extra_key : config.extra_group_keys) { + 
std::string_view value = ev.args[extra_key].get(); + if (!value.empty()) { + key.extra_keys->emplace_back(intern.get_or_insert(extra_key), + intern.get_or_insert(value)); + } + } + } + + return key; +} + +void update_aggregation_entry(const DFTracerEvent& ev, + const AggregationConfig& config, + AggregationMap& aggregations, + const AggregationKey& key) { + auto it = aggregations.find(key); + if (it == aggregations.end()) { + it = aggregations + .emplace(key, AggregationMetrics(config.sketch_accuracy)) + .first; + } + auto& metrics = it->second; + + std::uint64_t ev_count = 0; + + if (ev.is_counter()) { + auto a_count = ev.args["dft_cnt"]; + if (!a_count.exists()) a_count = ev.args["count"]; + ev_count = a_count.exists() ? a_count.get() : 1; + metrics.count += ev_count; + + auto a_dur = ev.args["dur_sum"]; + if (!a_dur.exists()) a_dur = ev.args["dur"]; + auto a_dur_min = ev.args["dur_min"]; + if (!a_dur_min.exists()) a_dur_min = ev.args["dur"]; + auto a_dur_max = ev.args["dur_max"]; + if (!a_dur_max.exists()) a_dur_max = ev.args["dur"]; + apply_preaggregated_metric(metrics.duration, ev_count, a_dur, a_dur_min, + a_dur_max); + + auto a_size_sum = ev.args["ret_sum"]; + if (!a_size_sum.exists()) a_size_sum = ev.args["ret"]; + auto a_size_min = ev.args["ret_min"]; + if (!a_size_min.exists()) a_size_min = ev.args["ret"]; + auto a_size_max = ev.args["ret_max"]; + if (!a_size_max.exists()) a_size_max = ev.args["ret"]; + apply_preaggregated_metric(metrics.size, ev_count, a_size_sum, + a_size_min, a_size_max); + + metrics.update_timestamp(ev.ts, config.time_interval_us); + } else { + metrics.update_duration(ev.dur, config.compute_percentiles); + metrics.update_timestamp(ev.ts, ev.dur); + + auto ret = ev.args["ret"]; + if (ret.exists() && + internal::is_data_transfer_op(key.cat(), key.name())) { + std::uint64_t size = ret.get(); + metrics.update_size(size, config.compute_percentiles); + } + } + + auto track_metric_field = [&](std::string_view field) { + if (ev.is_counter()) 
{ + std::string sum_key = std::string(field) + "_sum"; + auto a_sum = ev.args[sum_key]; + if (!a_sum.exists()) a_sum = ev.args[field]; + std::string min_key = std::string(field) + "_min"; + auto a_min = ev.args[min_key]; + if (!a_min.exists()) a_min = ev.args[field]; + std::string max_key = std::string(field) + "_max"; + auto a_max = ev.args[max_key]; + if (!a_max.exists()) a_max = ev.args[field]; + if (a_sum.exists() || a_min.exists() || a_max.exists()) { + if (!metrics.custom_metrics) { + metrics.custom_metrics = + std::make_unique(); + } + auto& cm = *metrics.custom_metrics; + auto cm_it = cm.find(field); + if (cm_it == cm.end()) { + cm_it = cm.emplace(std::string(field), + MetricStats(metrics.sketch_accuracy)) + .first; + } + apply_preaggregated_metric(cm_it->second, ev_count, a_sum, + a_min, a_max); + } + } else { + auto field_val = ev.args[field]; + if (field_val.exists()) { + std::uint64_t value = field_val.get(); + metrics.update_custom_metric(field, value, + config.compute_percentiles); + } + } + }; + + for (const auto& field : config.custom_metric_fields) { + track_metric_field(field); + } + + if (config.track_default_args) { + auto is_reserved = [](std::string_view k) { + return k == "hhash" || k == "fhash" || k == "dft_cnt" || + k == "dur" || k == "dur_sum" || k == "dur_min" || + k == "dur_max" || k == "ret" || k == "ret_sum" || + k == "ret_min" || k == "ret_max"; + }; + + auto is_preagg_suffix = [](std::string_view k) { + return k.size() > 4 && (k.substr(k.size() - 4) == "_sum" || + k.substr(k.size() - 4) == "_min" || + k.substr(k.size() - 4) == "_max"); + }; + + auto is_extra_group_key = [&](std::string_view k) { + for (const auto& gk : config.extra_group_keys) { + if (gk == k) return true; + } + return false; + }; + + auto is_custom_field = [&](std::string_view k) { + for (const auto& cf : config.custom_metric_fields) { + if (cf == k) return true; + } + return false; + }; + + ev.args.for_each_member([&](std::string_view k, ArgsValueProxy v) { + if 
(is_reserved(k) || is_extra_group_key(k) || is_custom_field(k)) + return; + if (ev.is_counter() && is_preagg_suffix(k)) return; + if (!v.is_number()) return; + track_metric_field(k); + }); + } +} + +} // namespace dftracer::utils::utilities::composites::dft::aggregators diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.cpp new file mode 100644 index 00000000..e318f48b --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.cpp @@ -0,0 +1,54 @@ +#include +#include + +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +bool AggregationMergeOperator::FullMergeV2( + const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + AggregationMetrics result; + + if (merge_in.existing_value) { + try { + result = deserialize_agg_value( + std::string_view(merge_in.existing_value->data(), + merge_in.existing_value->size())); + } catch (...) { + return false; + } + } + + for (const auto& operand : merge_in.operand_list) { + try { + auto other = deserialize_agg_value( + std::string_view(operand.data(), operand.size())); + result.merge_from(other); + } catch (...) { + return false; + } + } + + merge_out->new_value = serialize_agg_value(result); + return true; +} + +bool AggregationMergeOperator::PartialMerge( + const ::rocksdb::Slice& /*key*/, const ::rocksdb::Slice& left_operand, + const ::rocksdb::Slice& right_operand, std::string* new_value, + ::rocksdb::Logger* /*logger*/) const { + try { + auto left = deserialize_agg_value( + std::string_view(left_operand.data(), left_operand.size())); + auto right = deserialize_agg_value( + std::string_view(right_operand.data(), right_operand.size())); + left.merge_from(right); + *new_value = serialize_agg_value(left); + return true; + } catch (...) 
{ + return false; + } +} + +} // namespace dftracer::utils::utilities::composites::dft::aggregators diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.cpp index 3dc0295e..76759b18 100644 --- a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.cpp +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.cpp @@ -5,65 +5,54 @@ namespace dftracer::utils::utilities::composites::dft::aggregators { -void MetricStats::update(std::uint64_t value, std::uint64_t count, - bool compute_percentiles) { +// Representation note: +// count, total -> plain integer running sums (bit-exact regardless of +// merge order; overflow guarded by u64 range for typical +// trace magnitudes). +// m2, m3, m4 -> REPURPOSED. Now hold raw power sums: +// m2 = sum_x^2 +// m3 = sum_x^3 +// m4 = sum_x^4 +// Instead of Welford central moments. Merge becomes +// plain addition, making it commutative + associative. +// Integer-valued inputs with v^k representable in +// double mantissa (<= 2^52) keep additions exact, so +// serial and MPI outputs match bit-for-bit. Stddev / +// skewness / kurtosis are computed at read time by +// converting power sums to central moments. +// mean -> Not maintained incrementally; filled in at emit time +// by the aggregator from (total / count). 
+void MetricStats::update(std::uint64_t value, bool compute_percentiles) { + count++; total += value; if (value < min) min = value; if (value > max) max = value; - double n = static_cast(count); - double delta = static_cast(value) - mean; - double delta_n = delta / n; - double delta_n2 = delta_n * delta_n; - double term1 = delta * delta_n * (n - 1); - - m4 += term1 * delta_n2 * (n * n - 3 * n + 3) + 6 * delta_n2 * m2 - - 4 * delta_n * m3; - m3 += term1 * delta_n * (n - 2) - 3 * delta_n * m2; - m2 += term1; - mean += delta_n; + const double v = static_cast(value); + const double v2 = v * v; + m2 += v2; + m3 += v2 * v; + m4 += v2 * v2; + mean = static_cast(total) / static_cast(count); if (compute_percentiles) { if (!sketch) { sketch = std::make_unique(sketch_accuracy_); } - sketch->add(static_cast(value)); + sketch->add(v); } } -void MetricStats::merge_from(const MetricStats& other, std::uint64_t n1, - std::uint64_t n2, std::uint64_t n) { +void MetricStats::merge_from(const MetricStats& other) { + count += other.count; total += other.total; min = std::min(min, other.min); max = std::max(max, other.max); - - if (n > 0) { - double delta = other.mean - mean; - double delta2 = delta * delta; - double delta3 = delta * delta2; - double delta4 = delta2 * delta2; - - double n1_d = static_cast(n1); - double n2_d = static_cast(n2); - double n_d = static_cast(n); - - double mean_new = (n1_d * mean + n2_d * other.mean) / n_d; - - m4 = m4 + other.m4 + - delta4 * n1_d * n2_d * (n1_d * n1_d - n1_d * n2_d + n2_d * n2_d) / - (n_d * n_d * n_d) + - 6 * delta2 * (n1_d * n1_d * other.m2 + n2_d * n2_d * m2) / - (n_d * n_d) + - 4 * delta * (n1_d * other.m3 - n2_d * m3) / n_d; - - m3 = m3 + other.m3 + - delta3 * n1_d * n2_d * (n1_d - n2_d) / (n_d * n_d) + - 3 * delta * (n1_d * other.m2 - n2_d * m2) / n_d; - - m2 = m2 + other.m2 + delta2 * n1_d * n2_d / n_d; - - mean = mean_new; - } + m2 += other.m2; + m3 += other.m3; + m4 += other.m4; + mean = count > 0 ? 
static_cast(total) / static_cast(count) + : 0.0; if (other.sketch) { if (!sketch) { @@ -73,32 +62,59 @@ void MetricStats::merge_from(const MetricStats& other, std::uint64_t n1, } } -double MetricStats::get_stddev(std::uint64_t count) const { +// Convert power sums (m2=sum_x^2, m3=sum_x^3, m4=sum_x^4) and +// (count, total) into the central moments needed for stddev / skewness +// / kurtosis. Well-known identities: +// mu = total / n +// M2 = sum_x^2 - n * mu^2 +// M3 = sum_x^3 - 3 * mu * sum_x^2 + 2 * n * mu^3 +// M4 = sum_x^4 - 4 * mu * sum_x^3 + 6 * mu^2 * sum_x^2 - 3 * n * mu^4 +static void central_moments(std::uint64_t count, std::uint64_t total, double m2, + double m3, double m4, double& M2, double& M3, + double& M4, double& n, double& mu) { + n = static_cast(count); + mu = static_cast(total) / n; + M2 = m2 - n * mu * mu; + M3 = m3 - 3.0 * mu * m2 + 2.0 * n * mu * mu * mu; + M4 = m4 - 4.0 * mu * m3 + 6.0 * mu * mu * m2 - 3.0 * n * mu * mu * mu * mu; + // Rounding can push nonneg moments slightly negative. + if (M2 < 0.0) M2 = 0.0; + if (M4 < 0.0) M4 = 0.0; +} + +double MetricStats::get_stddev() const { if (count < 2) return 0.0; - return std::sqrt(m2 / static_cast(count - 1)); + double M2, M3, M4, n, mu; + central_moments(count, total, m2, m3, m4, M2, M3, M4, n, mu); + const double var = M2 / (n - 1.0); + return var > 0.0 ? 
std::sqrt(var) : 0.0; } -double MetricStats::get_skewness(std::uint64_t count) const { - if (count < 3 || m2 == 0.0) return 0.0; - double n = static_cast(count); - return std::sqrt(n) * m3 / std::pow(m2, 1.5); +double MetricStats::get_skewness() const { + if (count < 3) return 0.0; + double M2, M3, M4, n, mu; + central_moments(count, total, m2, m3, m4, M2, M3, M4, n, mu); + if (M2 == 0.0) return 0.0; + return std::sqrt(n) * M3 / std::pow(M2, 1.5); } -double MetricStats::get_kurtosis(std::uint64_t count) const { - if (count < 4 || m2 == 0.0) return 0.0; - double n = static_cast(count); - return n * m4 / (m2 * m2) - 3.0; +double MetricStats::get_kurtosis() const { + if (count < 4) return 0.0; + double M2, M3, M4, n, mu; + central_moments(count, total, m2, m3, m4, M2, M3, M4, n, mu); + if (M2 == 0.0) return 0.0; + return n * M4 / (M2 * M2) - 3.0; } void AggregationMetrics::update_duration(std::uint64_t dur, bool compute_percentiles) { count++; - duration.update(dur, count, compute_percentiles); + duration.update(dur, compute_percentiles); } void AggregationMetrics::update_size(std::uint64_t sz, bool compute_percentiles) { - size.update(sz, count, compute_percentiles); + size.update(sz, compute_percentiles); } void AggregationMetrics::update_timestamp(std::uint64_t event_ts, @@ -122,42 +138,26 @@ void AggregationMetrics::update_timestamp_clamped(std::uint64_t event_ts, if (clamped_te > te) te = clamped_te; } -void AggregationMetrics::update_custom_metric(const std::string& name, +void AggregationMetrics::update_custom_metric(std::string_view name, std::uint64_t value, bool compute_percentiles) { if (!custom_metrics) { custom_metrics = std::make_unique(); } - if (custom_metrics->find(name) == custom_metrics->end()) { - custom_metrics->emplace(name, MetricStats(sketch_accuracy)); - } - (*custom_metrics)[name].update(value, count, compute_percentiles); -} - -double AggregationMetrics::get_stddev_duration() const { - return duration.get_stddev(count); -} - -double 
AggregationMetrics::get_stddev_size() const { - return size.get_stddev(count); -} - -double AggregationMetrics::get_custom_stddev(const std::string& name) const { - if (!custom_metrics) return 0.0; auto it = custom_metrics->find(name); - if (it == custom_metrics->end()) return 0.0; - return it->second.get_stddev(count); + if (it == custom_metrics->end()) { + auto [new_it, _] = custom_metrics->emplace( + std::string(name), MetricStats(sketch_accuracy)); + it = new_it; + } + it->second.update(value, compute_percentiles); } void AggregationMetrics::merge_from(const AggregationMetrics& other) { - std::uint64_t n1 = count; - std::uint64_t n2 = other.count; - std::uint64_t n = n1 + n2; - - count = n; + count += other.count; - duration.merge_from(other.duration, n1, n2, n); - size.merge_from(other.size, n1, n2, n); + duration.merge_from(other.duration); + size.merge_from(other.size); ts = std::min(ts, other.ts); te = std::max(te, other.te); @@ -167,7 +167,13 @@ void AggregationMetrics::merge_from(const AggregationMetrics& other) { custom_metrics = std::make_unique(); } for (const auto& [name, other_metric] : *other.custom_metrics) { - (*custom_metrics)[name].merge_from(other_metric, n1, n2, n); + auto it = custom_metrics->find(name); + if (it == custom_metrics->end()) { + auto [new_it, _] = + custom_metrics->emplace(name, MetricStats(sketch_accuracy)); + it = new_it; + } + it->second.merge_from(other_metric); } } } diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.cpp new file mode 100644 index 00000000..faa8b849 --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.cpp @@ -0,0 +1,453 @@ +#include +#include +#include +#include + +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +namespace { + +namespace hash = dftracer::utils::utilities::hash; + +using 
common::serialization::BinaryReader; +using common::serialization::put_be16; +using common::serialization::put_blob; +using common::serialization::put_double; +using common::serialization::put_str; +using common::serialization::put_u8; +using common::serialization::put_varint; +using common::serialization::write_double; +using common::serialization::write_str; +using common::serialization::write_varint; + +std::uint16_t compute_shard(std::string_view cat, std::string_view name, + std::uint64_t pid, std::uint64_t tid) { + struct Cache { + char cat_buf[64]; + char name_buf[64]; + std::size_t cat_len = SIZE_MAX; + std::size_t name_len = SIZE_MAX; + std::uint64_t pid = 0; + std::uint64_t tid = 0; + std::uint16_t shard = 0; + }; + thread_local Cache cache; + + if (cat.size() == cache.cat_len && name.size() == cache.name_len && + pid == cache.pid && tid == cache.tid && + std::memcmp(cache.cat_buf, cat.data(), cat.size()) == 0 && + std::memcmp(cache.name_buf, name.data(), name.size()) == 0) { + return cache.shard; + } + + hash::Fnv1aHashBuilder h; + h.update(cat); + h.update(name); + h.update_value(pid); + h.update_value(tid); + const auto shard = + static_cast(h.finish() % AGG_KEY_NUM_SHARDS); + + if (cat.size() <= sizeof(cache.cat_buf) && + name.size() <= sizeof(cache.name_buf)) { + std::memcpy(cache.cat_buf, cat.data(), cat.size()); + std::memcpy(cache.name_buf, name.data(), name.size()); + cache.cat_len = cat.size(); + cache.name_len = name.size(); + cache.pid = pid; + cache.tid = tid; + cache.shard = shard; + } else { + cache.cat_len = SIZE_MAX; + } + return shard; +} + +// Wire layout (FULL / FULL_WITH_SKETCH): +// fmt:u8, count:varint, total:varint, min:varint, max:varint, +// mean:f64, m2:f64, m3:f64, m4:f64, [sketch blob] +// m2/m3/m4 are raw power sums (sum_x^2/3/4); mean is redundantly persisted +// so consumers that don't need stddev can skip the power sums. 
+inline char* write_metric_stats(char* p, const MetricStats& ms) { + // COMPACT format can only represent "empty" (count=0) or a single + // event with value = total (count=1). Critically: count=1 total=0 is + // a VALID state (one event with value 0) that COMPACT cannot round- + // trip because the deserializer falls back to count=0 whenever the + // serialized varint is 0. Avoid COMPACT for that case. + const bool compact_empty = + ms.count == 0 && ms.total == 0 && ms.m2 == 0.0 && !ms.sketch; + const bool compact_single = ms.count == 1 && ms.total > 0 && + ms.m2 == static_cast(ms.total) * + static_cast(ms.total) && + !ms.sketch; + if (compact_empty || compact_single) { + *p++ = static_cast(METRIC_FMT_COMPACT); + return write_varint(p, ms.count == 0 ? 0 : ms.total); + } + *p++ = static_cast(METRIC_FMT_FULL); + p = write_varint(p, ms.count); + p = write_varint(p, ms.total); + p = write_varint(p, ms.min); + p = write_varint(p, ms.max); + p = write_double(p, ms.mean); + p = write_double(p, ms.m2); + // m3/m4 not persisted yet -- skewness/kurtosis recomputed in memory. + // p = write_double(p, ms.m3); + // p = write_double(p, ms.m4); + return p; +} + +// Upper bound for MetricStats (FULL fmt, no sketch): +// 1 (fmt) + 4*10 (varints) + 2*8 (doubles) = 57 bytes +constexpr std::size_t METRIC_STATS_MAX_BYTES_NO_SKETCH = 57; + +void serialize_metric_stats(std::string& out, const MetricStats& ms) { + if (!ms.sketch) { + const auto old_size = out.size(); + out.resize(old_size + METRIC_STATS_MAX_BYTES_NO_SKETCH); + char* begin = out.data() + old_size; + char* p = write_metric_stats(begin, ms); + out.resize(old_size + static_cast(p - begin)); + return; + } + put_u8(out, METRIC_FMT_FULL_WITH_SKETCH); + put_varint(out, ms.count); + put_varint(out, ms.total); + put_varint(out, ms.min); + put_varint(out, ms.max); + put_double(out, ms.mean); + put_double(out, ms.m2); + // m3/m4 not persisted yet. 
+ // put_double(out, ms.m3); + // put_double(out, ms.m4); + auto blob = ms.sketch->serialize(); + put_blob(out, blob); +} + +MetricStats deserialize_metric_stats(BinaryReader& r, double accuracy) { + auto fmt = r.u8(); + if (fmt == METRIC_FMT_COMPACT) { + MetricStats ms(accuracy); + auto val = r.varint(); + if (val > 0) { + ms.count = 1; + ms.total = val; + ms.min = val; + ms.max = val; + ms.mean = static_cast(val); + const double v = static_cast(val); + ms.m2 = v * v; + // In-memory skewness/kurtosis only; not persisted: + // ms.m3 = v * v * v; + // ms.m4 = v * v * v * v; + } + return ms; + } + MetricStats ms(accuracy); + ms.count = r.varint(); + ms.total = r.varint(); + ms.min = r.varint(); + ms.max = r.varint(); + ms.mean = r.f64(); + ms.m2 = r.f64(); + // ms.m3 = r.f64(); + // ms.m4 = r.f64(); + if (fmt == METRIC_FMT_FULL_WITH_SKETCH) { + auto blob = r.blob(); + ms.sketch = std::make_unique(DDSketch::deserialize( + reinterpret_cast(blob.data()), blob.size())); + } + return ms; +} + +} // namespace + +void serialize_agg_key_into(std::string& out, std::uint32_t /*config_hash*/, + AggMapType map_type, const AggregationKey& key) { + out.clear(); + auto& intern = aggregation_intern(); + auto cat = intern.resolve(key.cat_id); + auto name = intern.resolve(key.name_id); + put_be16(out, compute_shard(cat, name, key.pid, key.tid)); + put_u8(out, static_cast(map_type)); + put_varint(out, key.cat_id); + put_varint(out, key.name_id); + put_varint(out, key.pid); + put_varint(out, key.tid); + put_varint(out, key.hhash_id); + put_varint(out, key.fhash_id); + put_varint(out, key.time_bucket); + std::uint16_t num_extra = + key.extra_keys ? 
static_cast(key.extra_keys->size()) : 0; + put_be16(out, num_extra); + if (key.extra_keys) { + for (const auto& [k, v] : *key.extra_keys) { + put_varint(out, k); + put_varint(out, v); + } + } +} + +void serialize_agg_key_into( + std::string& out, std::uint32_t /*config_hash*/, AggMapType map_type, + std::string_view cat, std::string_view name, std::uint64_t pid, + std::uint64_t tid, std::string_view hhash, std::string_view fhash, + std::uint64_t time_bucket, + const std::vector>* + extra_keys) { + auto& intern = aggregation_intern(); + const std::uint16_t shard = compute_shard(cat, name, pid, tid); + const std::uint16_t num_extra = + extra_keys ? static_cast(extra_keys->size()) : 0; + + // All fields are varints now — conservative upper bound + std::size_t total = 2 + 1 + 7 * 5 + 2 + num_extra * 2 * 5; + + out.clear(); + out.reserve(total); + + put_be16(out, shard); + out.push_back(static_cast(map_type)); + put_varint(out, intern.get_or_insert(cat)); + put_varint(out, intern.get_or_insert(name)); + put_varint(out, pid); + put_varint(out, tid); + put_varint(out, hhash.empty() ? 0 : intern.get_or_insert(hhash)); + put_varint(out, fhash.empty() ? 
0 : intern.get_or_insert(fhash)); + put_varint(out, time_bucket); + put_be16(out, num_extra); + if (extra_keys) { + for (const auto& [k, v] : *extra_keys) { + put_varint(out, intern.get_or_insert(k)); + put_varint(out, intern.get_or_insert(v)); + } + } +} + +std::string serialize_agg_key(std::uint32_t config_hash, AggMapType map_type, + const AggregationKey& key) { + std::string out; + out.reserve(47); + serialize_agg_key_into(out, config_hash, map_type, key); + return out; +} + +DeserializedAggKey deserialize_agg_key(std::string_view data) { + BinaryReader r(data); + (void)r.be16(); + auto map_type = static_cast(r.u8()); + AggregationKey key; + key.cat_id = static_cast(r.varint()); + key.name_id = static_cast(r.varint()); + key.pid = r.varint(); + key.tid = r.varint(); + key.hhash_id = static_cast(r.varint()); + key.fhash_id = static_cast(r.varint()); + key.time_bucket = r.varint(); + auto num_extra = r.be16(); + if (num_extra > 0) { + key.extra_keys = std::make_unique< + std::vector>>(); + key.extra_keys->reserve(num_extra); + for (std::uint16_t i = 0; i < num_extra; ++i) { + auto k = static_cast(r.varint()); + auto v = static_cast(r.varint()); + key.extra_keys->emplace_back(k, v); + } + } + return {0, map_type, std::move(key)}; +} + +void serialize_agg_value_into(std::string& out, const AggregationMetrics& m) { + // Fast path: no sketches anywhere. Pre-size to a conservative upper + // bound and write directly via pointer, then shrink. 
+ bool has_sketch = m.duration.sketch || m.size.sketch; + if (!has_sketch && m.custom_metrics) { + for (const auto& [_, ms] : *m.custom_metrics) { + if (ms.sketch) { + has_sketch = true; + break; + } + } + } + + if (!has_sketch) { + std::size_t custom_bytes = 0; + if (m.custom_metrics) { + for (const auto& [name, _] : *m.custom_metrics) { + custom_bytes += + 2 + name.size() + METRIC_STATS_MAX_BYTES_NO_SKETCH; + } + } + const std::size_t max_total = + 10 /*count*/ + METRIC_STATS_MAX_BYTES_NO_SKETCH /*dur*/ + + METRIC_STATS_MAX_BYTES_NO_SKETCH /*size*/ + 10 + 10 + + 10 /*ts/te/parent*/ + 10 /*num_custom*/ + custom_bytes; + out.resize(max_total); + char* begin = out.data(); + char* p = begin; + p = write_varint(p, m.count); + p = write_metric_stats(p, m.duration); + p = write_metric_stats(p, m.size); + p = write_varint(p, m.ts); + p = write_varint(p, m.te); + p = write_varint(p, m.parent_pid); + const std::uint32_t num_custom = + m.custom_metrics + ? static_cast(m.custom_metrics->size()) + : 0; + p = write_varint(p, num_custom); + if (m.custom_metrics) { + for (const auto& [name, ms] : *m.custom_metrics) { + p = write_str(p, name); + p = write_metric_stats(p, ms); + } + } + out.resize(static_cast(p - begin)); + return; + } + + out.clear(); + put_varint(out, m.count); + serialize_metric_stats(out, m.duration); + serialize_metric_stats(out, m.size); + put_varint(out, m.ts); + put_varint(out, m.te); + put_varint(out, m.parent_pid); + + std::uint32_t num_custom = + m.custom_metrics ? 
static_cast(m.custom_metrics->size()) + : 0; + put_varint(out, num_custom); + if (m.custom_metrics) { + for (const auto& [name, ms] : *m.custom_metrics) { + put_str(out, name); + serialize_metric_stats(out, ms); + } + } +} + +std::string serialize_agg_value(const AggregationMetrics& m) { + std::string out; + out.reserve(256); + serialize_agg_value_into(out, m); + return out; +} + +AggregationMetrics deserialize_agg_value(std::string_view data) { + BinaryReader r(data); + AggregationMetrics m; + m.count = r.varint(); + m.duration = deserialize_metric_stats(r, m.sketch_accuracy); + m.size = deserialize_metric_stats(r, m.sketch_accuracy); + m.ts = r.varint(); + m.te = r.varint(); + m.parent_pid = r.varint(); + + auto num_custom = r.varint(); + if (num_custom > 0) { + m.custom_metrics = std::make_unique(); + for (std::uint32_t i = 0; i < num_custom; ++i) { + auto name = r.str(); + auto ms = deserialize_metric_stats(r, m.sketch_accuracy); + m.custom_metrics->emplace(std::string(name), std::move(ms)); + } + } + return m; +} + +namespace { +std::atomic& intern_flushed_watermark() { + static std::atomic watermark{0}; + return watermark; +} +} // namespace + +void load_intern_dictionary(dftracer::utils::rocksdb::RocksDatabase& db) { + namespace rcf = dftracer::utils::rocksdb::cf; + auto& intern = aggregation_intern(); + auto it = db.new_iterator(rcf::AGGREGATION); + std::uint32_t max_id_plus_one = 0; + for (it->Seek({AGG_INTERN_DICT_PREFIX, AGG_INTERN_DICT_PREFIX_LEN}); + it->Valid(); it->Next()) { + auto key_slice = it->key(); + if (key_slice.size() < AGG_INTERN_DICT_PREFIX_LEN) break; + if (static_cast(key_slice[0]) != 0xFF || + static_cast(key_slice[1]) != 0xFD) + break; + + // Decode the id encoded as varint after the prefix. RocksDB key order + // is lex, which is NOT varint-numeric order past 127, so we cannot + // infer the id from iteration order. Read it explicitly. 
+ common::serialization::BinaryReader key_reader( + std::string_view(key_slice.data() + AGG_INTERN_DICT_PREFIX_LEN, + key_slice.size() - AGG_INTERN_DICT_PREFIX_LEN)); + std::uint32_t id = 0; + try { + id = static_cast(key_reader.varint()); + } catch (const std::exception&) { + continue; + } + + auto val_slice = it->value(); + intern.insert_at_id( + id, std::string_view(val_slice.data(), val_slice.size())); + if (id + 1u > max_id_plus_one) max_id_plus_one = id + 1u; + } + intern_flushed_watermark().store(max_id_plus_one, + std::memory_order_relaxed); +} + +void flush_intern_dictionary( + dftracer::utils::rocksdb::RocksDatabase& db, + dftracer::utils::rocksdb::RocksDatabase::Batch& batch) { + namespace rcf = dftracer::utils::rocksdb::cf; + auto& intern = aggregation_intern(); + auto current = static_cast(intern.size()); + auto flushed = intern_flushed_watermark().load(std::memory_order_relaxed); + if (current <= flushed) return; + + for (std::uint32_t id = flushed; id < current; ++id) { + std::string key(AGG_INTERN_DICT_PREFIX, AGG_INTERN_DICT_PREFIX_LEN); + common::serialization::put_varint(key, id); + auto sv = intern.resolve(id); + db.put(batch, rcf::AGGREGATION, key, + std::string_view(sv.data(), sv.size())); + } + + // CAS to advance watermark; another thread may have already advanced it + while (flushed < current) { + if (intern_flushed_watermark().compare_exchange_weak( + flushed, current, std::memory_order_relaxed)) + break; + if (flushed >= current) break; + } +} + +void flush_intern_dictionary( + dftracer::utils::utilities::indexer::IndexBatchSink& sink) { + auto& intern = aggregation_intern(); + auto current = static_cast(intern.size()); + auto flushed = intern_flushed_watermark().load(std::memory_order_relaxed); + if (current <= flushed) return; + + std::string key; + for (std::uint32_t id = flushed; id < current; ++id) { + key.assign(AGG_INTERN_DICT_PREFIX, AGG_INTERN_DICT_PREFIX_LEN); + common::serialization::put_varint(key, id); + auto sv = 
intern.resolve(id); + sink.insert_aggregation_put(key, + std::string_view(sv.data(), sv.size())); + } + + while (flushed < current) { + if (intern_flushed_watermark().compare_exchange_weak( + flushed, current, std::memory_order_relaxed)) + break; + if (flushed >= current) break; + } +} + +} // namespace dftracer::utils::utilities::composites::dft::aggregators diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.cpp new file mode 100644 index 00000000..5db29fcb --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.cpp @@ -0,0 +1,461 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +namespace rcf = dftracer::utils::rocksdb::cf; + +namespace { + +inline bool is_reserved_arg(std::string_view k) { + if (k.empty()) return false; + switch (k[0]) { + case 'h': + return k == "hhash"; + case 'f': + return k == "fhash"; + case 'd': + return k == "dur" || k == "dur_sum" || k == "dur_min" || + k == "dur_max" || k == "dft_cnt"; + case 'r': + return k == "ret" || k == "ret_sum" || k == "ret_min" || + k == "ret_max"; + } + return false; +} + +inline bool is_preagg_suffix(std::string_view k) { + if (k.size() <= 4) return false; + std::string_view tail = k.substr(k.size() - 4); + return tail == "_sum" || tail == "_min" || tail == "_max"; +} + +} // namespace + +namespace { + +/// Derive a unique per-file batch_id from a staging prefix + the file +/// path. Uses FNV1a so concurrent visitors processing different files +/// land in disjoint subdirectories under the staging root. 
+std::string make_per_file_batch_id(std::string_view prefix, + std::string_view file_path) { + std::uint64_t fnv_basis = 1469598103934665603ULL; + std::uint64_t fnv_prime = 1099511628211ULL; + std::uint64_t h = fnv_basis; + for (unsigned char c : file_path) { + h ^= c; + h *= fnv_prime; + } + char hex[17]; + std::snprintf(hex, sizeof(hex), "%016llx", + static_cast(h)); + std::string out; + out.reserve(prefix.size() + 1 + 16); + out.append(prefix); + out.push_back('_'); + out.append(hex, 16); + return out; +} + +} // namespace + +AggregationVisitor::AggregationVisitor( + std::shared_ptr db, std::uint32_t config_hash, + AggregationConfig config, std::string file_path) + : db_(std::move(db)), + config_hash_(config_hash), + config_(std::move(config)), + file_path_(std::move(file_path)) { + if (config_.track_process_parents || !config_.boundary_events.empty()) { + tracker_ = std::make_shared(); + } + local_buffer_.reserve(65536); + key_buf_.reserve(128); + val_buf_.reserve(256); +} + +AggregationVisitor::AggregationVisitor(std::string staging_dir, + std::string batch_id_prefix, + std::uint32_t config_hash, + AggregationConfig config, + std::string file_path) + : sst_staging_dir_(std::move(staging_dir)), + sst_batch_prefix_(make_per_file_batch_id(batch_id_prefix, file_path)), + config_hash_(config_hash), + config_(std::move(config)), + file_path_(std::move(file_path)) { + if (config_.track_process_parents || !config_.boundary_events.empty()) { + tracker_ = std::make_shared(); + } + local_buffer_.reserve(65536); + key_buf_.reserve(128); + val_buf_.reserve(256); + // First SST writer; rotated after each flush in seal_local_buffer. 
+ sst_sink_ = std::make_unique( + sst_staging_dir_, + sst_batch_prefix_ + "_" + std::to_string(sst_flush_counter_++)); +} + +void AggregationVisitor::begin(std::size_t /*num_checkpoints*/) {} + +void AggregationVisitor::on_checkpoint(std::size_t /*checkpoint_idx*/) {} + +void AggregationVisitor::on_event(const EventRecord& record) { + const auto& ev = record.ev; + if (ev.is_metadata()) { + return; + } + + if (tracker_) { + tracker_->extract_from_event(ev.name, ev.pid, ev.ts, ev.dur, ev.args, + config_); + } + + bool is_preaggregated_system = false; + if (ev.is_system()) { + auto cnt = ev.args["count"]; + auto dft_cnt = ev.args["dft_cnt"]; + bool is_preaggregated = cnt.is_number() || dft_cnt.is_number(); + if (!is_preaggregated) { + handle_system_event(record); + return; + } + is_preaggregated_system = true; + } + + AggMapType map_type = AggMapType::EVENT; + if (is_preaggregated_system) { + map_type = AggMapType::SYSTEM; + } else if (ev.is_profile()) { + map_type = AggMapType::PROFILE; + } + + auto hhash = ev.args["hhash"].get(); + auto fhash = ev.args["fhash"].get(); + auto time_bucket = compute_time_bucket(ev.ts, ev.dur, config_); + + if (time_bucket < min_time_bucket_) min_time_bucket_ = time_bucket; + if (time_bucket > max_time_bucket_) max_time_bucket_ = time_bucket; + + std::vector> extra_keys_vec; + std::vector>* extra_ptr = + nullptr; + if (!config_.extra_group_keys.empty()) { + for (const auto& extra_key : config_.extra_group_keys) { + auto value = ev.args[extra_key].get(); + if (!value.empty()) { + extra_keys_vec.emplace_back(extra_key, value); + observed_extra_keys_.emplace(extra_key); + } + } + if (!extra_keys_vec.empty()) extra_ptr = &extra_keys_vec; + } + + serialize_agg_key_into(key_buf_, config_hash_, map_type, ev.cat, ev.name, + ev.pid, ev.tid, hhash, fhash, time_bucket, + extra_ptr); + + AggregationMetrics* entry_ptr; + if (last_entry_ != nullptr && last_key_ == key_buf_) { + entry_ptr = last_entry_; + } else { + auto [it, inserted] = + 
local_buffer_.try_emplace(key_buf_, config_.sketch_accuracy); + entry_ptr = &it->second; + last_entry_ = entry_ptr; + last_key_ = it->first; + } + auto& entry = *entry_ptr; + const bool compute_percentiles = config_.compute_percentiles; + + std::uint64_t ev_count = 1; + if (ev.is_counter()) { + auto a_count = ev.args["dft_cnt"]; + if (!a_count.exists()) a_count = ev.args["count"]; + ev_count = a_count.exists() ? a_count.get() : 1; + entry.count += ev_count; + + auto a_dur = ev.args["dur_sum"]; + if (!a_dur.exists()) a_dur = ev.args["dur"]; + if (a_dur.exists()) { + MetricStats tmp(config_.sketch_accuracy); + tmp.count = ev_count; + tmp.total = a_dur.get(); + auto a_dur_min = ev.args["dur_min"]; + if (!a_dur_min.exists()) a_dur_min = a_dur; + tmp.min = a_dur_min.get(); + auto a_dur_max = ev.args["dur_max"]; + if (!a_dur_max.exists()) a_dur_max = a_dur; + tmp.max = a_dur_max.get(); + if (tmp.count > 0) { + tmp.mean = static_cast(tmp.total) / + static_cast(tmp.count); + } + entry.duration.merge_from(tmp); + } + + auto a_size = ev.args["ret_sum"]; + if (!a_size.exists()) a_size = ev.args["ret"]; + if (a_size.exists()) { + MetricStats tmp(config_.sketch_accuracy); + tmp.count = ev_count; + tmp.total = a_size.get(); + auto a_min = ev.args["ret_min"]; + if (!a_min.exists()) a_min = a_size; + tmp.min = a_min.get(); + auto a_max = ev.args["ret_max"]; + if (!a_max.exists()) a_max = a_size; + tmp.max = a_max.get(); + if (tmp.count > 0) { + tmp.mean = static_cast(tmp.total) / + static_cast(tmp.count); + } + entry.size.merge_from(tmp); + } + + entry.update_timestamp(ev.ts, config_.time_interval_us); + } else { + entry.update_duration(ev.dur, compute_percentiles); + entry.update_timestamp(ev.ts, ev.dur); + + auto ret = ev.args["ret"]; + if (ret.exists() && internal::is_data_transfer_op(ev.cat, ev.name)) { + entry.update_size(ret.get(), compute_percentiles); + } + } + + if (config_.track_default_args) { + const bool is_counter_ev = ev.is_counter(); + 
ev.args.for_each_member([&](std::string_view k, ArgsValueProxy v) { + if (!v.is_number()) return; + if (is_reserved_arg(k)) return; + if (is_counter_ev && is_preagg_suffix(k)) return; + for (const auto& gk : config_.extra_group_keys) { + if (gk == k) return; + } + for (const auto& cf : config_.custom_metric_fields) { + if (cf == k) return; + } + entry.update_custom_metric(k, v.get(), + compute_percentiles); + }); + } + + for (const auto& field : config_.custom_metric_fields) { + if (ev.is_counter()) { + std::string sum_key = std::string(field) + "_sum"; + auto a_sum = ev.args[sum_key]; + if (!a_sum.exists()) a_sum = ev.args[field]; + std::string min_key = std::string(field) + "_min"; + auto a_min = ev.args[min_key]; + if (!a_min.exists()) a_min = ev.args[field]; + std::string max_key = std::string(field) + "_max"; + auto a_max = ev.args[max_key]; + if (!a_max.exists()) a_max = ev.args[field]; + if (a_sum.exists() && a_sum.is_number()) { + if (!entry.custom_metrics) { + entry.custom_metrics = std::make_unique(); + } + auto& cm = *entry.custom_metrics; + auto cm_it = cm.find(field); + if (cm_it == cm.end()) { + cm_it = cm.emplace(std::string(field), + MetricStats(config_.sketch_accuracy)) + .first; + } + auto& stats = cm_it->second; + stats.count += ev_count; + stats.total += a_sum.get(); + if (a_min.exists() && a_min.is_number()) { + stats.min = std::min(stats.min, a_min.get()); + } + if (a_max.exists() && a_max.is_number()) { + stats.max = std::max(stats.max, a_max.get()); + } + if (stats.count > 0) { + stats.mean = static_cast(stats.total) / + static_cast(stats.count); + } + observed_custom_metrics_.insert(field); + } + } else { + auto field_val = ev.args[field]; + if (field_val.exists() && field_val.is_number()) { + entry.update_custom_metric( + field, field_val.get(), compute_percentiles); + } + } + } + + events_processed_++; + + if (local_buffer_.size() >= FLUSH_THRESHOLD) { + seal_local_buffer(); + } +} + +void AggregationVisitor::handle_system_event(const 
EventRecord& record) { + const auto& ev = record.ev; + + auto hhash = ev.args["hhash"].get(); + auto time_bucket = compute_time_bucket(ev.ts, ev.dur, config_); + + if (time_bucket < min_time_bucket_) min_time_bucket_ = time_bucket; + if (time_bucket > max_time_bucket_) max_time_bucket_ = time_bucket; + + serialize_system_key_into(system_key_buf_, hhash, time_bucket); + + auto [it, inserted] = + system_buffer_.try_emplace(system_key_buf_, config_.sketch_accuracy); + auto& entry = it->second; + + entry.count++; + entry.update_timestamp(ev.ts); + + const bool compute_percentiles = config_.compute_percentiles; + + ev.args.for_each_member([&](std::string_view k, ArgsValueProxy v) { + if (!v.is_number()) return; + if (k == "hhash" || k == "fhash") return; + + double val = v.get(); + entry.update_metric(k, val, compute_percentiles); + observed_system_metrics_.insert(std::string(k)); + }); + + events_processed_++; + + if (system_buffer_.size() >= FLUSH_THRESHOLD) { + seal_local_buffer(); + } +} + +void AggregationVisitor::seal_local_buffer() { + if (local_buffer_.empty() && system_buffer_.empty()) return; + + for (const auto& [key, metrics] : local_buffer_) { + if (metrics.custom_metrics) { + for (const auto& [name, _] : *metrics.custom_metrics) { + observed_custom_metrics_.insert(name); + } + } + } + + if (sst_sink_) { + // Distributed mode: flush the current in-memory maps into the + // active per-flush SstWriterContext, then rotate to a fresh one + // so the next flush (or on_file_complete) writes to its own SST + // with a fresh, strictly-ascending key space. 
+ for (auto& [k, m] : local_buffer_) { + serialize_agg_value_into(val_buf_, m); + sst_sink_->insert_aggregation_merge(k, val_buf_); + } + local_buffer_.clear(); + last_entry_ = nullptr; + last_key_ = {}; + + for (auto& [k, m] : system_buffer_) { + serialize_system_value_into(system_val_buf_, m); + sst_sink_->insert_system_metrics_merge(k, system_val_buf_); + } + system_buffer_.clear(); + + flush_intern_dictionary(*sst_sink_); + + // Commit this flush's SSTs and open a new SstWriterContext for + // the next flush. Only rotate if something was actually written; + // an empty commit produces no paths and no-ops. + auto a = sst_sink_->commit(); + if (!a.empty()) sst_artifacts_.push_back(std::move(a)); + sst_sink_ = std::make_unique( + sst_staging_dir_, + sst_batch_prefix_ + "_" + std::to_string(sst_flush_counter_++)); + return; + } + + // Legacy mode: flush to a RocksDatabase batch; commit at + // on_file_complete. + if (!db_) return; + auto batch = db_->begin_batch(); + for (auto& [k, m] : local_buffer_) { + serialize_agg_value_into(val_buf_, m); + db_->merge(batch, rcf::AGGREGATION, k, val_buf_); + } + local_buffer_.clear(); + last_entry_ = nullptr; + last_key_ = {}; + + for (auto& [k, m] : system_buffer_) { + serialize_system_value_into(system_val_buf_, m); + db_->merge(batch, rcf::SYSTEM_METRICS, k, system_val_buf_); + } + system_buffer_.clear(); + + flush_intern_dictionary(*db_, batch); + pending_batches_.push_back(std::move(batch)); +} + +coro::CoroTask AggregationVisitor::on_file_complete() { + seal_local_buffer(); + + if (sst_sink_) { + // Commit any final residue (the rotated-to-fresh SstWriterContext + // that seal_local_buffer left behind). An empty commit returns + // empty paths which we skip. 
+        auto a = sst_sink_->commit();
+        if (!a.empty()) sst_artifacts_.push_back(std::move(a));
+        sst_sink_.reset();
+        co_return;
+    }
+
+    if (pending_batches_.empty()) co_return;
+    for (auto& batch : pending_batches_) {
+        db_->commit_batch(batch);
+    }
+    pending_batches_.clear();
+}
+
+/// Drain both in-memory buffers into the caller-supplied batch.
+///
+/// Every entry of local_buffer_ and system_buffer_ is serialized and added
+/// to `batch` as a merge operand (AGGREGATION / SYSTEM_METRICS column
+/// families), the buffers are cleared, and any not-yet-flushed intern
+/// dictionary entries are appended. The batch is not committed here.
+///
+/// Legacy-only helper, used by aggregator_utility for draining a
+/// batch-write phase. SST mode never calls this. Does nothing when no
+/// database is attached.
+///
+/// @param batch  Batch that receives the merge operands.
+void AggregationVisitor::flush_to_batch(rocksdb::RocksDatabase::Batch& batch) {
+    if (!db_) return;
+    for (auto& [k, m] : local_buffer_) {
+        serialize_agg_value_into(val_buf_, m);
+        db_->merge(batch, rcf::AGGREGATION, k, val_buf_);
+    }
+    local_buffer_.clear();
+    // on_event keeps a one-entry cache (last_entry_ / last_key_) pointing
+    // into local_buffer_. Drop it together with the map, exactly as
+    // seal_local_buffer does; otherwise the next event with the same key
+    // would update an entry that no longer exists.
+    last_entry_ = nullptr;
+    last_key_ = {};
+
+    for (auto& [k, m] : system_buffer_) {
+        serialize_system_value_into(system_val_buf_, m);
+        db_->merge(batch, rcf::SYSTEM_METRICS, k, system_val_buf_);
+    }
+    system_buffer_.clear();
+
+    flush_intern_dictionary(*db_, batch);
+}
+
+ChunkAggregationOutput AggregationVisitor::take_output() {
+    if (tracker_) {
+        tracker_->finalize();
+    }
+
+    ChunkAggregationOutput output;
+    output.file_path = std::move(file_path_);
+    output.events_processed = events_processed_;
+    output.success = true;
+    output.local_tracker = std::move(tracker_);
+    output.min_time_bucket = min_time_bucket_;
+    output.max_time_bucket = max_time_bucket_;
+
+    return output;
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.cpp index be37ccd2..3fae1eca 100644 --- a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.cpp @@ -1,9 +1,9 @@ +#include #include #include #include #include -#include namespace dftracer::utils::utilities::composites::dft::aggregators { @@ -21,9 +21,15 @@ coro::CoroTask AggregatorSummaryUtility::process(
std::printf("Total events aggregated: %llu\n", static_cast(total_events)); - std::unordered_map category_counts; + StringViewMap category_counts; for (const auto& [key, metrics] : aggregations) { - category_counts[std::string(key.cat())] += metrics.count; + auto cat = key.cat(); + auto it = category_counts.find(cat); + if (it == category_counts.end()) { + category_counts.emplace(std::string(cat), metrics.count); + } else { + it->second += metrics.count; + } } std::printf("\nEvents by category:\n"); diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.cpp index 465d4029..475d2913 100644 --- a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.cpp @@ -1,26 +1,28 @@ #include #include +#include +#include +#include #include +#include +#include #include #include -#include -#include -#include -#include -#include -#include +#include +#include #include +#include #ifdef DFTRACER_UTILS_ENABLE_ARROW #include #endif -#include +#include #include #include -#include #include +#include namespace dftracer::utils::utilities::composites::dft::aggregators { @@ -53,13 +55,8 @@ AggregatorInput& AggregatorInput::with_force_rebuild(bool force) { return *this; } -AggregatorInput& AggregatorInput::with_chunk_size_mb(std::size_t mb) { - chunk_size_mb = mb; - return *this; -} - -AggregatorInput& AggregatorInput::with_batch_size_mb(std::size_t mb) { - batch_size_mb = mb; +AggregatorInput& AggregatorInput::with_parallelism(std::size_t n) { + parallelism = n; return *this; } @@ -80,25 +77,35 @@ using common::arrow::RecordBatchBuilder; ArrowExportResult AggregationBatch::to_arrow() const { RecordBatchBuilder builder; - // Discover the union of extra key IDs and custom metric names. 
- std::set extra_key_id_set; - std::set> custom_metric_name_set; - for (const auto& [key, metrics] : entries) { - if (key.extra_keys && !key.extra_keys->empty()) { - for (const auto& [k, v] : *key.extra_keys) { - extra_key_id_set.insert(k); + // Use precomputed global columns if available, otherwise discover locally. + std::vector local_extra_key_ids; + std::vector local_custom_metric_names; + if (!global_extra_key_ids || !global_custom_metric_names) { + std::set extra_key_id_set; + std::set> custom_metric_name_set; + for (const auto& entry : entries) { + if (entry.key.extra_keys && !entry.key.extra_keys->empty()) { + for (const auto& [k, v] : *entry.key.extra_keys) { + extra_key_id_set.insert(k); + } } - } - if (metrics.custom_metrics && !metrics.custom_metrics->empty()) { - for (const auto& [name, _] : *metrics.custom_metrics) { - custom_metric_name_set.insert(name); + if (entry.metrics.custom_metrics && + !entry.metrics.custom_metrics->empty()) { + for (const auto& [name, _] : *entry.metrics.custom_metrics) { + custom_metric_name_set.insert(name); + } } } + local_extra_key_ids.assign(extra_key_id_set.begin(), + extra_key_id_set.end()); + local_custom_metric_names.assign(custom_metric_name_set.begin(), + custom_metric_name_set.end()); } - std::vector extra_key_ids(extra_key_id_set.begin(), - extra_key_id_set.end()); - std::vector custom_metric_names( - custom_metric_name_set.begin(), custom_metric_name_set.end()); + const auto& extra_key_ids = + global_extra_key_ids ? *global_extra_key_ids : local_extra_key_ids; + const auto& custom_metric_names = global_custom_metric_names + ? 
*global_custom_metric_names + : local_custom_metric_names; // Build schema: batch_type + fixed columns + extra keys + custom metrics std::vector schema = { @@ -114,11 +121,16 @@ ArrowExportResult AggregationBatch::to_arrow() const { {"size_std", ColumnType::DOUBLE}, {"ts", ColumnType::UINT64}, {"te", ColumnType::UINT64}, }; + // Add CI columns when batch has approximated entries + if (has_approximated_entries) { + schema.push_back({"count_ci_lower", ColumnType::DOUBLE}); + schema.push_back({"count_ci_upper", ColumnType::DOUBLE}); + } for (auto id : extra_key_ids) { schema.push_back({std::string(aggregation_intern().resolve(id)), ColumnType::STRING}); } - // Custom metric suffixed names — need owned strings for ColumnSpec + // Custom metric suffixed names struct MetricSuffix { const char* suffix; ColumnType type; @@ -141,7 +153,9 @@ ArrowExportResult AggregationBatch::to_arrow() const { builder.declare_schema(schema); builder.reserve(entries.size()); - for (const auto& [key, metrics] : entries) { + for (const auto& entry : entries) { + const auto& key = entry.key; + const auto& metrics = entry.metrics; std::size_t ci = 0; builder.append_int64(ci++, static_cast(batch_type)); builder.append_string(ci++, key.cat()); @@ -157,12 +171,12 @@ ArrowExportResult AggregationBatch::to_arrow() const { metrics.count > 0 ? metrics.duration.min : 0); builder.append_uint64(ci++, metrics.duration.max); builder.append_double(ci++, metrics.duration.mean); - builder.append_double(ci++, metrics.get_stddev_duration()); + builder.append_double(ci++, metrics.duration.get_stddev()); builder.append_uint64(ci++, metrics.size.total); builder.append_uint64(ci++, metrics.count > 0 ? 
metrics.size.min : 0); builder.append_uint64(ci++, metrics.size.max); builder.append_double(ci++, metrics.size.mean); - builder.append_double(ci++, metrics.get_stddev_size()); + builder.append_double(ci++, metrics.size.get_stddev()); builder.append_uint64(ci++, metrics.ts); builder.append_uint64(ci++, metrics.te); @@ -192,7 +206,7 @@ ArrowExportResult AggregationBatch::to_arrow() const { builder.append_uint64(ci++, metrics.count > 0 ? ms.min : 0); builder.append_uint64(ci++, ms.max); builder.append_double(ci++, ms.mean); - builder.append_double(ci++, ms.get_stddev(metrics.count)); + builder.append_double(ci++, ms.get_stddev()); continue; } } @@ -200,187 +214,513 @@ ArrowExportResult AggregationBatch::to_arrow() const { builder.append_null(ci++); } + // Add CI columns when batch has approximated entries + if (has_approximated_entries) { + builder.append_double(ci++, entry.count_ci.lower); + builder.append_double(ci++, entry.count_ci.upper); + } + builder.end_row(); } return builder.finish(); } + +// --------------------------------------------------------------------------- +// AggregationBatch::to_dfanalyzer_arrow +// --------------------------------------------------------------------------- + +namespace { + +// IO category constants matching dfanalyzer IOCategory enum +enum class IOCategory : std::int8_t { + READ = 1, + WRITE = 2, + METADATA = 3, + PCTL = 4, + IPC = 5, + OTHER = 6, + SYNC = 7, +}; + +IOCategory get_io_category(std::string_view func_name) { + // Read functions + if (func_name == "read" || func_name == "pread" || func_name == "readv" || + func_name == "preadv" || func_name == "fread") { + return IOCategory::READ; + } + // Write functions + if (func_name == "write" || func_name == "pwrite" || + func_name == "writev" || func_name == "pwritev" || + func_name == "fwrite") { + return IOCategory::WRITE; + } + // Sync functions + if (func_name == "fsync" || func_name == "fdatasync" || + func_name == "msync" || func_name == "sync") { + return 
IOCategory::SYNC; + } + // Metadata functions + if (func_name == "open" || func_name == "open64" || func_name == "close" || + func_name == "fopen" || func_name == "fopen64" || + func_name == "fclose" || func_name == "stat" || func_name == "fstat" || + func_name == "lstat" || func_name == "fstatat" || + func_name == "__xstat" || func_name == "__xstat64" || + func_name == "__lxstat" || func_name == "__lxstat64" || + func_name == "__fxstat" || func_name == "__fxstat64" || + func_name == "access" || func_name == "lseek" || + func_name == "lseek64" || func_name == "fseek" || + func_name == "ftell" || func_name == "seek" || func_name == "fcntl" || + func_name == "ftruncate" || func_name == "mkdir" || + func_name == "rmdir" || func_name == "unlink" || + func_name == "remove" || func_name == "rename" || func_name == "link" || + func_name == "readlink" || func_name == "opendir" || + func_name == "closedir" || func_name == "readdir") { + return IOCategory::METADATA; + } + return IOCategory::OTHER; +} + +std::string resolve_hash( + const std::unordered_map* hash_table, + std::string_view hash) { + if (!hash_table || hash.empty()) return std::string(hash); + auto it = hash_table->find(std::string(hash)); + if (it != hash_table->end()) return it->second; + return std::string(hash); +} + +std::string build_proc_name(std::string_view host_name, std::string_view hhash, + std::uint64_t pid, std::uint64_t tid) { + std::string result = "app#"; + if (!host_name.empty()) { + result.append(host_name); + } else if (!hhash.empty()) { + result.append(hhash); + } else { + result.append("unknown"); + } + result.push_back('#'); + result.append(std::to_string(pid)); + result.push_back('#'); + result.append(std::to_string(tid)); + return result; +} + +} // namespace + +ArrowExportResult AggregationBatch::to_dfanalyzer_arrow( + const DfanalyzerContext& ctx) const { + RecordBatchBuilder builder; + + // Bucket width in microseconds + auto bucket_width_us = + static_cast(ctx.time_granularity * 
ctx.time_resolution); + + if (batch_type == AggregationBatchType::SYSTEM) { + // System metrics schema + std::vector schema = { + {"host_hash", ColumnType::STRING}, + {"time_range", ColumnType::INT64}, + {"sys_cpu_iowait_pct", ColumnType::DOUBLE}, + {"sys_cpu_user_pct", ColumnType::DOUBLE}, + {"sys_cpu_system_pct", ColumnType::DOUBLE}, + {"sys_cpu_idle_pct", ColumnType::DOUBLE}, + {"sys_core_iowait_pct_max", ColumnType::DOUBLE}, + {"sys_core_iowait_pct_p95", ColumnType::DOUBLE}, + {"sys_mem_dirty", ColumnType::DOUBLE}, + {"sys_mem_cached", ColumnType::DOUBLE}, + {"sys_mem_available", ColumnType::DOUBLE}, + }; + builder.declare_schema(schema); + builder.reserve(entries.size()); + + for (const auto& entry : entries) { + const auto& key = entry.key; + const auto& metrics = entry.metrics; + std::size_t ci = 0; + + builder.append_string(ci++, key.hhash()); + auto time_range = + bucket_width_us > 0 + ? static_cast( + (key.time_bucket - ctx.time_origin) / bucket_width_us) + : 0; + builder.append_int64(ci++, time_range); + + // Extract system metrics from custom_metrics + auto get_metric = [&](const char* name) -> double { + if (!metrics.custom_metrics) return 0.0; + auto it = metrics.custom_metrics->find(name); + if (it == metrics.custom_metrics->end()) return 0.0; + return it->second.mean; + }; + auto get_metric_max = [&](const char* name) -> double { + if (!metrics.custom_metrics) return 0.0; + auto it = metrics.custom_metrics->find(name); + if (it == metrics.custom_metrics->end()) return 0.0; + return static_cast(it->second.max); + }; + + builder.append_double(ci++, get_metric("iowait_pct")); + builder.append_double(ci++, get_metric("user_pct")); + builder.append_double(ci++, get_metric("system_pct")); + builder.append_double(ci++, get_metric("idle_pct")); + builder.append_double(ci++, get_metric_max("iowait_pct")); + builder.append_double(ci++, + get_metric("iowait_pct")); // p95 approx + builder.append_double(ci++, get_metric("Dirty")); + builder.append_double(ci++, 
get_metric("Cached")); + builder.append_double(ci++, get_metric("MemAvailable")); + + builder.end_row(); + } + } else { + // Events/Profiles schema + std::vector schema = { + {"cat", ColumnType::STRING}, + {"func_name", ColumnType::STRING}, + {"pid", ColumnType::INT64}, + {"tid", ColumnType::INT64}, + {"file_hash", ColumnType::STRING}, + {"host_hash", ColumnType::STRING}, + {"file_name", ColumnType::STRING}, + {"host_name", ColumnType::STRING}, + {"proc_name", ColumnType::STRING}, + {"io_cat", ColumnType::INT64}, + {"acc_pat", ColumnType::INT64}, + {"count", ColumnType::INT64}, + {"time", ColumnType::DOUBLE}, + {"size", ColumnType::INT64}, + {"time_min", ColumnType::DOUBLE}, + {"time_max", ColumnType::DOUBLE}, + {"size_min", ColumnType::INT64}, + {"size_max", ColumnType::INT64}, + {"time_range", ColumnType::INT64}, + {"time_start", ColumnType::INT64}, + {"time_end", ColumnType::INT64}, + }; + builder.declare_schema(schema); + builder.reserve(entries.size()); + + for (const auto& entry : entries) { + const auto& key = entry.key; + const auto& metrics = entry.metrics; + std::size_t ci = 0; + + auto fhash = key.fhash(); + auto hhash = key.hhash(); + auto file_name = resolve_hash(ctx.file_hashes, fhash); + auto host_name = resolve_hash(ctx.host_hashes, hhash); + auto proc_name = + build_proc_name(host_name, hhash, key.pid, key.tid); + auto io_cat = get_io_category(key.name()); + + builder.append_string(ci++, key.cat()); + builder.append_string(ci++, key.name()); + builder.append_int64(ci++, static_cast(key.pid)); + builder.append_int64(ci++, static_cast(key.tid)); + builder.append_string(ci++, fhash); + builder.append_string(ci++, hhash); + builder.append_string(ci++, file_name); + builder.append_string(ci++, host_name); + builder.append_string(ci++, proc_name); + builder.append_int64(ci++, static_cast(io_cat)); + builder.append_int64(ci++, 0); // acc_pat always 0 + + builder.append_int64(ci++, + static_cast(metrics.count)); + // time: duration in seconds (dur_total is 
in us) + builder.append_double(ci++, + static_cast(metrics.duration.total) / + ctx.time_resolution); + // size: nullable (0 means null) + if (metrics.size.total > 0) { + builder.append_int64( + ci++, static_cast(metrics.size.total)); + } else { + builder.append_null(ci++); + } + // time_min/max in seconds + builder.append_double( + ci++, metrics.count > 0 + ? static_cast(metrics.duration.min) / + ctx.time_resolution + : 0.0); + builder.append_double(ci++, + static_cast(metrics.duration.max) / + ctx.time_resolution); + // size_min/max: nullable + if (metrics.size.total > 0 && metrics.count > 0) { + builder.append_int64( + ci++, static_cast(metrics.size.min)); + builder.append_int64( + ci++, static_cast(metrics.size.max)); + } else { + builder.append_null(ci++); + builder.append_null(ci++); + } + + // time_range: normalized bucket index + auto time_range = + bucket_width_us > 0 + ? static_cast( + (key.time_bucket - ctx.time_origin) / bucket_width_us) + : 0; + builder.append_int64(ci++, time_range); + // time_start/end: relative to time_origin (still in us) + builder.append_int64( + ci++, static_cast(metrics.ts - ctx.time_origin)); + builder.append_int64( + ci++, static_cast(metrics.te - ctx.time_origin)); + + builder.end_row(); + } + } + + return builder.finish(); +} #endif // DFTRACER_UTILS_ENABLE_ARROW // --------------------------------------------------------------------------- -// AggregatorUtility::process +// AggregatorUtility::process - parallel, RocksDB-backed, fused pipeline // --------------------------------------------------------------------------- coro::AsyncGenerator AggregatorUtility::process( const AggregatorInput& input) { - // Resolve index directory — create a temp one if not specified. 
- std::string effective_index_dir = input.index_dir; - std::string temp_index_dir; - if (effective_index_dir.empty()) { - try { - auto temp_path = fs::temp_directory_path(); - temp_path /= "dftracer_idx_" + std::to_string(std::time(nullptr)) + - "_" + std::to_string(getpid()); - temp_index_dir = temp_path.string(); - fs::create_directories(temp_index_dir); - } catch (const fs::filesystem_error&) { - temp_index_dir = "/tmp/dftracer_idx_" + - std::to_string(std::time(nullptr)) + "_" + - std::to_string(getpid()); - fs::create_directories(temp_index_dir); - } - effective_index_dir = temp_index_dir; + if (!has_context()) { + DFTRACER_UTILS_LOG_ERROR( + "AggregatorUtility requires CoroScope context. " + "Use Runtime::scope() to run this utility."); + co_return; } + CoroScope& scope = context(); - // Discover input files. - filesystem::PatternDirectoryScannerUtility scanner; - filesystem::PatternDirectoryScannerUtilityInput scan_input{ - input.directory, {".pfw", ".pfw.gz"}, false}; - auto matched_entries = co_await scanner.process(scan_input); - - std::vector input_files; - input_files.reserve(matched_entries.size()); - for (const auto& entry : matched_entries) { - input_files.push_back(entry.path.string()); + // Determine parallelism + std::size_t parallelism = input.parallelism; + if (parallelism == 0) { + parallelism = dftracer_utils_hardware_concurrency(); } - if (input_files.empty()) { + // Resolve files and index path with aggregation cache check + indexing::IndexResolverUtility resolver; + indexing::ResolverInput resolver_input; + resolver_input.directory = input.directory; + resolver_input.index_dir = input.index_dir; + resolver_input.require_aggregation = !input.force_rebuild; + resolver_input.aggregation_config = input.config; + auto scan_result = co_await scope.spawn(resolver, resolver_input); + + if (scan_result.all_files.empty()) { DFTRACER_UTILS_LOG_WARN("No .pfw or .pfw.gz files found in: %s", input.directory.c_str()); - co_yield AggregationBatch{}; 
co_return; } - // Sequential pipeline: index → metadata → chunk map → aggregate → merge. - // Parallelism at the file/chunk level is left to the caller (e.g. the - // CLI binary uses CoroScope workers; Python callers use the Runtime). - EventAggregatorUtility merger; - std::atomic global_chunk_idx{0}; - - if (input.force_rebuild && !input_files.empty()) { - const std::string shared_index_path = - composites::dft::internal::determine_index_path( - input_files.front(), effective_index_dir); - if (fs::exists(shared_index_path)) { - fs::remove_all(shared_index_path); - } + DFTRACER_UTILS_LOG_INFO( + "Found %zu files (%zu need checkpoint, %zu need aggregation, %zu " + "cached)", + scan_result.all_files.size(), scan_result.needs_checkpoint.size(), + scan_result.needs_aggregation.size(), scan_result.cached.size()); + + const auto& shared_index_path = scan_result.index_path; + + // Force rebuild: clear existing index + if (input.force_rebuild && fs::exists(shared_index_path)) { + DFTRACER_UTILS_LOG_INFO("Clearing shared index store: %s", + shared_index_path.c_str()); + fs::remove_all(shared_index_path); } - for (const auto& file_path : input_files) { - bool is_compressed = - file_path.size() >= 3 && - file_path.compare(file_path.size() - 3, 3, ".gz") == 0; - - std::string idx_path; - if (is_compressed) { - idx_path = composites::dft::internal::determine_index_path( - file_path, effective_index_dir); - auto idx_input = indexer::IndexBuildConfig::for_file(file_path) - .with_checkpoint_size(input.checkpoint_size) - .with_force_rebuild(false) - .with_index_dir(effective_index_dir); - co_await indexer::IndexBuilderUtility{}.process(idx_input); - } + // Open RocksDB-backed aggregator with merge operator + auto agg_db = EventAggregator::open_with_merge_operator(shared_index_path); + auto merger = std::make_unique(agg_db, 0); - // Collect file metadata (line count, size, etc.). 
- auto meta_input = - composites::dft::MetadataCollectorUtilityInput::from_file(file_path) - .with_checkpoint_size(input.checkpoint_size) - .with_force_rebuild(false) - .with_index(idx_path); - auto metadata = - co_await composites::dft::MetadataCollectorUtility{}.process( - meta_input); - - if (!metadata.success) { - DFTRACER_UTILS_LOG_WARN("Skipping file (metadata failed): %s", - file_path.c_str()); - continue; - } + // Build list of files needing work (checkpoint or aggregation) + std::vector files_needing_work; + files_needing_work.reserve(scan_result.needs_checkpoint.size() + + scan_result.needs_aggregation.size()); + for (auto& item : scan_result.needs_checkpoint) { + files_needing_work.push_back(std::move(item.file_path)); + } + for (auto& item : scan_result.needs_aggregation) { + files_needing_work.push_back(std::move(item.file_path)); + } - // Partition the file into byte-range chunks. - FileChunkMapperUtility file_mapper; - auto file_chunks = co_await file_mapper.process( - FileChunkMapperInput::from_metadata(metadata) - .with_config(input.config) - .with_checkpoint_size(input.checkpoint_size) - .with_target_chunk_size(input.chunk_size_mb) - .with_batch_size(input.batch_size_mb * 1024 * 1024)); - - int start_idx = - global_chunk_idx.fetch_add(static_cast(file_chunks.size())); - for (int i = 0; i < static_cast(file_chunks.size()); ++i) { - file_chunks[i].chunk_index = start_idx + i; + // Index and aggregate in parallel using fused pipeline + if (!files_needing_work.empty()) { + auto agg_config_ptr = std::make_shared(input.config); + + auto batch_config = std::make_shared(); + batch_config->file_paths = std::move(files_needing_work); + batch_config->index_dir = input.index_dir; + batch_config->checkpoint_size = input.checkpoint_size; + batch_config->parallelism = parallelism; + batch_config->force_rebuild = false; // Already handled above + batch_config->use_batch_write = true; + + // Attach AggregationVisitor to each file during parsing + 
batch_config->dft_visitor_factory = + [agg_db, agg_config_ptr](const std::string& file_path) + -> std::vector> { + std::vector> + visitors; + visitors.push_back(std::make_unique( + agg_db, 0, *agg_config_ptr, file_path)); + return visitors; + }; + + auto batch_result = co_await indexer::IndexBatchBuilderUtility::process( + &scope, std::move(batch_config)); + + // Drain visitors and merge results + std::vector processed_files; + for (auto& file_visitors : batch_result.extra_visitors) { + for (auto& visitor : file_visitors) { + auto* agg_visitor = + dynamic_cast(visitor.get()); + if (agg_visitor) { + for (const auto& k : agg_visitor->observed_extra_keys()) + merger->add_observed_extra_key(k); + for (const auto& m : agg_visitor->observed_custom_metrics()) + merger->add_observed_custom_metric(m); + auto output = agg_visitor->take_output(); + processed_files.push_back(output.file_path); + merger->merge_chunk(std::move(output)); + } + } + file_visitors.clear(); } - for (auto& chunk : file_chunks) { - ChunkAggregatorUtility agg; - auto output = co_await agg.process(chunk); - merger.merge_chunk(std::move(output)); + // Write global config and per-file markers for cache detection + if (!processed_files.empty()) { + namespace rcf = dftracer::utils::rocksdb::cf; + indexer::IndexDatabase idx_db( + shared_index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + + auto batch = agg_db->begin_batch(); + + // Write global config (0xFFFE key) + AggGlobalConfig global_cfg; + global_cfg.time_interval_us = input.config.time_interval_us; + global_cfg.config_hash = input.config.compute_hash(); + agg_db->put(batch, rcf::AGGREGATION, + std::string_view(AGG_GLOBAL_CONFIG_KEY, 2), + serialize_agg_global_config(global_cfg)); + + // Write per-file markers (0xFFFF + file_id keys) + for (const auto& file_path : processed_files) { + int file_id = idx_db.find_file(file_path); + if (file_id >= 0) { + agg_db->put(batch, rcf::AGGREGATION, + make_agg_file_key(file_id), ""); + } + 
} + + agg_db->commit_batch(batch); } } - // Finalize the merged aggregation map. - auto agg_results = merger.finalize(); - - // Resolve process-parent associations and boundary events. - AssociationResolverInput resolver_input; - resolver_input.trackers = std::move(agg_results.trackers); - resolver_input.aggregations = std::move(agg_results); - resolver_input.config = input.config; + // Get observed columns for consistent Arrow schema + auto obs = merger->observed_columns(); + auto global_extra_key_ids = + std::make_shared>(obs.extra_key_ids); + auto global_custom_metric_names = + std::make_shared>(obs.custom_metric_names); - AssociationResolverUtility resolver; - auto resolver_output = co_await resolver.process(resolver_input); + // Stable, deterministic schema ordering + std::sort(global_extra_key_ids->begin(), global_extra_key_ids->end()); + std::sort(global_custom_metric_names->begin(), + global_custom_metric_names->end()); - // Yield resolved aggregations in bounded batches, separated by type. 
+ // Yield batches by scanning the merged aggregator const std::size_t batch_sz = input.event_batch_size; - const auto& resolved = resolver_output.aggregations; - - auto yield_map = [&](AggregationMap& map, AggregationBatchType type) - -> coro::AsyncGenerator { - AggregationBatch batch; - batch.batch_type = type; - batch.total_events_processed = resolved.total_events_processed; - batch.total_files_processed = resolved.total_files_processed; - batch.total_bytes_processed = resolved.total_bytes_processed; - for (auto& [key, metrics] : map) { - batch.entries.emplace_back(std::move(key), std::move(metrics)); - if (batch.entries.size() >= batch_sz) { - co_yield std::move(batch); - batch = AggregationBatch{}; - batch.batch_type = type; - batch.total_events_processed = resolved.total_events_processed; - batch.total_files_processed = resolved.total_files_processed; - batch.total_bytes_processed = resolved.total_bytes_processed; - } + + const std::size_t total_events = merger->total_events(); + const std::size_t total_files = merger->total_files(); + + auto make_batch = [&](AggregationBatchType type) { + AggregationBatch b; + b.batch_type = type; + b.total_events_processed = total_events; + b.total_files_processed = total_files; + b.global_extra_key_ids = global_extra_key_ids.get(); + b.global_custom_metric_names = global_custom_metric_names.get(); + return b; + }; + + // Collect entries grouped by type (scan callback is synchronous) + std::vector event_entries; + std::vector profile_entries; + std::vector system_entries; + + std::size_t total_keys = 0; + merger->scan([&](AggMapType map_type, const AggregationKey& key, + AggregationMetrics& metrics) { + total_keys++; + switch (map_type) { + case AggMapType::EVENT: + event_entries.emplace_back(key, std::move(metrics)); + break; + case AggMapType::PROFILE: + profile_entries.emplace_back(key, std::move(metrics)); + break; + case AggMapType::SYSTEM: + system_entries.emplace_back(key, std::move(metrics)); + break; } - if 
(!batch.entries.empty()) { - co_yield std::move(batch); + return true; + }); + + // Setup augmentation if needed + std::optional aug_config; + if (scan_result.needs_augmentation) { + aug_config = AugmentationConfig{scan_result.stored_time_interval_us, + input.config.time_interval_us}; + DFTRACER_UTILS_LOG_INFO("Augmenting time interval: %lu us -> %lu us", + scan_result.stored_time_interval_us, + input.config.time_interval_us); + } + + auto yield_batch = [&](AggregationBatch batch) -> AggregationBatch { + if (aug_config) { + return augment_batch(batch, *aug_config); } + return batch; }; - // Events - auto event_gen = yield_map(resolver_output.aggregations.aggregations, - AggregationBatchType::EVENT); - while (auto b = co_await event_gen.next()) co_yield std::move(*b); - - // Profiles - auto profile_gen = - yield_map(resolver_output.aggregations.profile_aggregations, - AggregationBatchType::PROFILE); - while (auto b = co_await profile_gen.next()) co_yield std::move(*b); - - // System - auto system_gen = - yield_map(resolver_output.aggregations.system_aggregations, - AggregationBatchType::SYSTEM); - while (auto b = co_await system_gen.next()) co_yield std::move(*b); - - // Clean up the temporary index directory if we created it. 
- if (!temp_index_dir.empty()) { - std::error_code ec; - fs::remove_all(temp_index_dir, ec); + // Yield event batches + for (std::size_t i = 0; i < event_entries.size(); i += batch_sz) { + AggregationBatch batch = make_batch(AggregationBatchType::EVENT); + std::size_t end = std::min(i + batch_sz, event_entries.size()); + for (std::size_t j = i; j < end; ++j) { + batch.entries.push_back(std::move(event_entries[j])); + } + co_yield yield_batch(std::move(batch)); + } + + // Yield profile batches + for (std::size_t i = 0; i < profile_entries.size(); i += batch_sz) { + AggregationBatch batch = make_batch(AggregationBatchType::PROFILE); + std::size_t end = std::min(i + batch_sz, profile_entries.size()); + for (std::size_t j = i; j < end; ++j) { + batch.entries.push_back(std::move(profile_entries[j])); + } + co_yield yield_batch(std::move(batch)); } + + // Yield system batches + for (std::size_t i = 0; i < system_entries.size(); i += batch_sz) { + AggregationBatch batch = make_batch(AggregationBatchType::SYSTEM); + std::size_t end = std::min(i + batch_sz, system_entries.size()); + for (std::size_t j = i; j < end; ++j) { + batch.entries.push_back(std::move(system_entries[j])); + } + co_yield yield_batch(std::move(batch)); + } + + DFTRACER_UTILS_LOG_INFO("Aggregation complete: %zu keys", total_keys); } } // namespace dftracer::utils::utilities::composites::dft::aggregators diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.cpp index c9fc8f00..5df75775 100644 --- a/src/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.cpp @@ -109,7 +109,7 @@ coro::CoroTask AssociationResolverUtility::process( void AssociationResolverUtility::compute_trace_metadata( const AssociationTracker& tracker, - const 
EventAggregatorUtilityOutput& /*aggregations*/, + const EventAggregatorOutput& /*aggregations*/, AssociationResolverOutput& output) { const auto& intervals = tracker.get_all_intervals(); diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.cpp index 67ac119a..eab0cc9a 100644 --- a/src/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.cpp +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.cpp @@ -1,15 +1,15 @@ +#include #include #include namespace dftracer::utils::utilities::composites::dft::aggregators { -void AssociationTracker::extract_from_event(const JsonValue& json, - const JsonValue& args, +void AssociationTracker::extract_from_event(std::string_view name, + std::uint64_t pid, std::uint64_t ts, + std::uint64_t dur, + const ArgsMap& args, const AggregationConfig& config) { - std::string_view name = json["name"].get(); - std::uint64_t pid = json["pid"].get(); - if (config.track_process_parents && pid > 0) { all_pids_.insert(pid); } @@ -38,9 +38,6 @@ void AssociationTracker::extract_from_event(const JsonValue& json, final_value = std::to_string(counter); } - std::uint64_t ts = json["ts"].get(); - std::uint64_t dur = json["dur"].get(); - BoundaryInterval interval; interval.name = boundary_config.output_name; interval.value = final_value; @@ -132,4 +129,116 @@ void AssociationTracker::merge(const AssociationTracker& other) { } } +namespace { +namespace rocks = dftracer::utils::rocksdb; + +void put_be64(std::string& out, std::uint64_t v) { + rocks::KeyCodec::append_be64(out, v); +} +void put_be32(std::string& out, std::uint32_t v) { + rocks::KeyCodec::append_be32(out, v); +} +void put_str(std::string& out, const std::string& s) { + put_be32(out, static_cast(s.size())); + out.append(s); +} +std::uint64_t read_be64(const char*& p) { + auto v = rocks::KeyCodec::decode_be64(std::string_view(p, 8)); + 
p += 8; + return v; +} +std::uint32_t read_be32(const char*& p) { + auto v = rocks::KeyCodec::decode_be32(std::string_view(p, 4)); + p += 4; + return v; +} +std::string read_str(const char*& p) { + auto len = read_be32(p); + std::string s(p, len); + p += len; + return s; +} +} // namespace + +std::string AssociationTracker::serialize() const { + std::string out; + out.reserve(4096); + + put_be32(out, static_cast(all_pids_.size())); + for (auto pid : all_pids_) put_be64(out, pid); + + put_be32(out, static_cast(process_parents_.size())); + for (const auto& [child, parent] : process_parents_) { + put_be64(out, child); + put_be64(out, parent); + } + + put_be32(out, static_cast(all_intervals_.size())); + for (const auto& iv : all_intervals_) { + put_str(out, iv.name); + put_str(out, iv.value); + put_be64(out, iv.start_ts); + put_be64(out, iv.end_ts); + } + + put_be32(out, static_cast(process_intervals_.size())); + for (const auto& [pid, intervals] : process_intervals_) { + put_be64(out, pid); + put_be32(out, static_cast(intervals.size())); + for (const auto& iv : intervals) { + put_str(out, iv.name); + put_str(out, iv.value); + put_be64(out, iv.start_ts); + put_be64(out, iv.end_ts); + } + } + + return out; +} + +AssociationTracker AssociationTracker::deserialize(std::string_view data) { + AssociationTracker t; + const char* p = data.data(); + + auto num_pids = read_be32(p); + for (std::uint32_t i = 0; i < num_pids; ++i) + t.all_pids_.insert(read_be64(p)); + + auto num_parents = read_be32(p); + for (std::uint32_t i = 0; i < num_parents; ++i) { + auto child = read_be64(p); + auto parent = read_be64(p); + t.process_parents_[child] = parent; + } + + auto num_intervals = read_be32(p); + t.all_intervals_.reserve(num_intervals); + for (std::uint32_t i = 0; i < num_intervals; ++i) { + BoundaryInterval iv; + iv.name = read_str(p); + iv.value = read_str(p); + iv.start_ts = read_be64(p); + iv.end_ts = read_be64(p); + t.all_intervals_.push_back(std::move(iv)); + } + + auto 
num_pid_intervals = read_be32(p); + for (std::uint32_t i = 0; i < num_pid_intervals; ++i) { + auto pid = read_be64(p); + auto count = read_be32(p); + auto& vec = t.process_intervals_[pid]; + vec.reserve(count); + for (std::uint32_t j = 0; j < count; ++j) { + BoundaryInterval iv; + iv.name = read_str(p); + iv.value = read_str(p); + iv.start_ts = read_be64(p); + iv.end_ts = read_be64(p); + vec.push_back(std::move(iv)); + } + } + + return t; +} + } // namespace dftracer::utils::utilities::composites::dft::aggregators diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.cpp index 4618338d..9ba717cf 100644 --- a/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.cpp @@ -1,190 +1,15 @@ #include -#include -#include +#include #include #include -#include -#include -#include -#include #include -#include -#include -#include #include namespace dftracer::utils::utilities::composites::dft::aggregators { using dftracer::utils::utilities::composites::dft::DFTracerEvent; -namespace { - -void apply_preaggregated_metric(MetricStats& stats, std::uint64_t count, - const JsonValue& sum_val, - const JsonValue& min_val, - const JsonValue& max_val) { - if (!sum_val.exists()) return; - - const auto total = sum_val.get(); - stats.total += total; - if (min_val.exists()) { - stats.min = std::min(stats.min, min_val.get()); - } - if (max_val.exists()) { - stats.max = std::max(stats.max, max_val.get()); - } - - if (count > 0) { - stats.mean = - static_cast(stats.total) / static_cast(count); - // Counter rows carry pre-aggregated totals/min/max. Higher moments - // are not available from the source trace, so stddev/skew/kurtosis - // remain 0 unless enough information is accumulated elsewhere. 
- stats.m2 = 0.0; - stats.m3 = 0.0; - stats.m4 = 0.0; - } -} - -} // namespace - -std::uint64_t ChunkAggregatorUtility::compute_time_bucket( - std::uint64_t timestamp, std::uint64_t duration, - const AggregationConfig& config) const { - std::uint64_t midpoint = timestamp + (duration / 2); - - if (config.use_relative_time) { - midpoint -= config.reference_timestamp; - } - if (config.time_interval_us == 0) return midpoint; - return (midpoint / config.time_interval_us) * config.time_interval_us; -} - -AggregationKey ChunkAggregatorUtility::build_key( - const DFTracerEvent& ev, const AggregationConfig& config) const { - auto& intern = aggregation_intern(); - - AggregationKey key; - key.cat_id = intern.get_or_insert(ev.cat); - key.name_id = intern.get_or_insert(ev.name); - key.pid = ev.pid; - key.tid = ev.tid; - - auto hhash_sv = ev.args["hhash"].get(); - if (!hhash_sv.empty()) { - key.hhash_id = intern.get_or_insert(hhash_sv); - } - auto fhash_sv = ev.args["fhash"].get(); - if (!fhash_sv.empty()) { - key.fhash_id = intern.get_or_insert(fhash_sv); - } - - key.time_bucket = compute_time_bucket(ev.ts, ev.dur, config); - - if (!config.extra_group_keys.empty()) { - key.extra_keys = std::make_unique< - std::vector>>(); - for (const auto& extra_key : config.extra_group_keys) { - std::string_view value = ev.args[extra_key].get(); - if (!value.empty()) { - key.extra_keys->emplace_back(intern.get_or_insert(extra_key), - intern.get_or_insert(value)); - } - } - } - - return key; -} - -void ChunkAggregatorUtility::update_entry(const DFTracerEvent& ev, - const AggregationConfig& config, - AggregationMap& aggregations, - const AggregationKey& key) { - auto it = aggregations.find(key); - if (it == aggregations.end()) { - it = aggregations - .emplace(key, AggregationMetrics(config.sketch_accuracy)) - .first; - } - auto& metrics = it->second; - - if (ev.is_counter()) { - // Profile/system events carry pre-aggregated data in args. 
- // Use args.count for the event count, args.dur_sum for total duration, - // etc. - JsonValue a_count = ev.args["dft_cnt"]; - if (!a_count.exists()) a_count = ev.args["count"]; - std::uint64_t ev_count = - a_count.exists() ? a_count.get() : 1; - metrics.count += ev_count; - - JsonValue a_dur = ev.args["dur_sum"]; - if (!a_dur.exists()) a_dur = ev.args["dur"]; - JsonValue a_dur_min = ev.args["dur_min"]; - if (!a_dur_min.exists()) a_dur_min = ev.args["dur"]; - JsonValue a_dur_max = ev.args["dur_max"]; - if (!a_dur_max.exists()) a_dur_max = ev.args["dur"]; - apply_preaggregated_metric(metrics.duration, metrics.count, a_dur, - a_dur_min, a_dur_max); - - JsonValue a_size_sum = ev.args["ret_sum"]; - if (!a_size_sum.exists()) a_size_sum = ev.args["ret"]; - JsonValue a_size_min = ev.args["ret_min"]; - if (!a_size_min.exists()) a_size_min = ev.args["ret"]; - JsonValue a_size_max = ev.args["ret_max"]; - if (!a_size_max.exists()) a_size_max = ev.args["ret"]; - apply_preaggregated_metric(metrics.size, metrics.count, a_size_sum, - a_size_min, a_size_max); - - metrics.update_timestamp(ev.ts, config.time_interval_us); - } else { - // Regular events: count += 1, use event's own dur/size. 
- metrics.update_duration(ev.dur, config.compute_percentiles); - metrics.update_timestamp(ev.ts, ev.dur); - - JsonValue ret = ev.args["ret"]; - if (ret.exists() && - internal::is_data_transfer_op(key.cat(), key.name())) { - std::uint64_t size = ret.get(); - metrics.update_size(size, config.compute_percentiles); - } - } - - if (!config.custom_metric_fields.empty()) { - for (const auto& field : config.custom_metric_fields) { - if (ev.is_counter()) { - // Profile/system: read pre-aggregated field_sum/min/max - std::string sum_key = field + "_sum"; - JsonValue a_sum = ev.args[sum_key]; - if (!a_sum.exists()) a_sum = ev.args[field]; - std::string min_key = field + "_min"; - JsonValue a_min = ev.args[min_key]; - if (!a_min.exists()) a_min = ev.args[field]; - std::string max_key = field + "_max"; - JsonValue a_max = ev.args[max_key]; - if (!a_max.exists()) a_max = ev.args[field]; - if (a_sum.exists() || a_min.exists() || a_max.exists()) { - if (!metrics.custom_metrics) { - metrics.custom_metrics = - std::make_unique(); - } - auto& ms = (*metrics.custom_metrics)[field]; - apply_preaggregated_metric(ms, metrics.count, a_sum, a_min, - a_max); - } - } else { - JsonValue field_val = ev.args[field]; - if (field_val.exists()) { - std::uint64_t value = field_val.get(); - metrics.update_custom_metric(field, value, - config.compute_percentiles); - } - } - } - } -} - coro::CoroTask ChunkAggregatorUtility::process( const ChunkAggregatorInput& input) { ChunkAggregationOutput output; @@ -217,8 +42,11 @@ coro::CoroTask ChunkAggregatorUtility::process( rc.start_byte = input.start_byte; rc.end_byte = input.end_byte; rc.buffer_size = input.batch_size; + if (input.query) { + rc.query = input.query->source(); + } - auto line_gen = trace_reader.read_lines(rc); + auto json_gen = trace_reader.read_json(rc); AggregationMap local_aggregations; AggregationMap local_profiles; @@ -230,51 +58,24 @@ coro::CoroTask ChunkAggregatorUtility::process( local_tracker = std::make_shared(); } - char 
yy_buf[common::json::YYJSON_LINE_POOL_SIZE]; - yyjson_alc yy_alc; - yyjson_alc_pool_init(&yy_alc, yy_buf, sizeof(yy_buf)); - - while (auto opt = co_await line_gen.next()) { - const char* line_start = opt->content.data(); - std::size_t line_len = opt->content.size(); - if (line_len == 0) continue; + while (auto opt = co_await json_gen.next()) { + DFTracerEvent ev; + if (!DFTracerEvent::parse_ondemand(*opt->parser, ev)) continue; + if (ev.is_metadata()) continue; - yyjson_doc* doc = - yyjson_read_opts(const_cast(line_start), line_len, - YYJSON_READ_NOFLAG, &yy_alc, nullptr); - if (!doc) continue; - - yyjson_val* root = yyjson_doc_get_root(doc); - if (root && yyjson_is_obj(root)) { - bool pass = true; - if (input.query) { - JsonValue json(root); - std::string_view ph = json["ph"].get(); - if (ph != "M") { - pass = input.query->evaluate(json); - } - } - if (pass) { - DFTracerEvent ev; - if (DFTracerEvent::parse(root, ev) && !ev.is_metadata()) { - if (local_tracker) { - JsonValue json(root); - local_tracker->extract_from_event(json, ev.args, - input.config); - } - auto key = build_key(ev, input.config); - if (ev.is_system()) { - update_entry(ev, input.config, local_system, key); - } else if (ev.is_profile()) { - update_entry(ev, input.config, local_profiles, key); - } else { - update_entry(ev, input.config, local_aggregations, key); - } - output.events_processed++; - } - } + if (local_tracker) { + local_tracker->extract_from_event(ev.name, ev.pid, ev.ts, ev.dur, + ev.args, input.config); + } + auto key = build_aggregation_key(ev, input.config); + if (ev.is_system()) { + update_aggregation_entry(ev, input.config, local_system, key); + } else if (ev.is_profile()) { + update_aggregation_entry(ev, input.config, local_profiles, key); + } else { + update_aggregation_entry(ev, input.config, local_aggregations, key); } - yyjson_doc_free(doc); + output.events_processed++; } if (local_tracker) { diff --git 
a/src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.cpp new file mode 100644 index 00000000..a840a2a0 --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.cpp @@ -0,0 +1,468 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +namespace rcf = dftracer::utils::rocksdb::cf; +namespace rocks = dftracer::utils::rocksdb; + +static constexpr std::string_view TIME_BOUNDS_DB_KEY = "__time_bounds__"; + +EventAggregator::EventAggregator() : rocksdb_mode_(false) {} + +EventAggregator::EventAggregator(std::shared_ptr db, + std::uint32_t config_hash) + : rocksdb_mode_(true), db_(std::move(db)), config_hash_(config_hash) { + load_intern_dictionary(*db_); +} + +void EventAggregator::merge_chunk(ChunkAggregationOutput&& chunk_output) { + if (rocksdb_mode_) { + merge_chunk_rocksdb(std::move(chunk_output)); + } else { + merge_chunk_memory(std::move(chunk_output)); + } +} + +void EventAggregator::merge_chunk_memory( + ChunkAggregationOutput&& chunk_output) { + if (!chunk_output.success) return; + + state_.total_events_processed += chunk_output.events_processed; + state_.total_bytes_processed += chunk_output.bytes_processed; + unique_files_.insert(chunk_output.file_path); + + auto merge_into = [](AggregationMap& dst, AggregationMap& src) { + for (auto& [key, metrics] : src) { + auto it = dst.find(key); + if (it == dst.end()) { + dst.emplace(key, std::move(metrics)); + } else { + it->second.merge_from(metrics); + } + } + }; + merge_into(state_.aggregations, chunk_output.aggregations); + merge_into(state_.profile_aggregations, chunk_output.profile_aggregations); + merge_into(state_.system_aggregations, chunk_output.system_aggregations); + + if (chunk_output.local_tracker) { + 
state_.trackers.push_back(std::move(chunk_output.local_tracker)); + } + + update_time_bounds(chunk_output.min_time_bucket); + update_time_bounds(chunk_output.max_time_bucket); +} + +void EventAggregator::merge_chunk_rocksdb( + ChunkAggregationOutput&& chunk_output) { + if (!chunk_output.success) return; + + total_events_ += chunk_output.events_processed; + total_bytes_ += chunk_output.bytes_processed; + + update_time_bounds(chunk_output.min_time_bucket); + update_time_bounds(chunk_output.max_time_bucket); + + unique_files_.insert(std::move(chunk_output.file_path)); + + if (chunk_output.local_tracker) { + trackers_.push_back(std::move(chunk_output.local_tracker)); + } +} + +void EventAggregator::add_observed_extra_key(const std::string& key) { + auto& intern = aggregation_intern(); + observed_extra_key_ids_.insert(intern.get_or_insert(key)); +} + +void EventAggregator::add_observed_custom_metric(const std::string& name) { + observed_custom_metric_names_.insert(name); +} + +EventAggregatorOutput EventAggregator::finalize() { + if (rocksdb_mode_) { + EventAggregatorOutput output; + output.total_events_processed = total_events_.load(); + output.total_bytes_processed = total_bytes_.load(); + output.total_files_processed = unique_files_.size(); + output.trackers = std::move(trackers_); + + scan([&output](AggMapType map_type, const AggregationKey& key, + AggregationMetrics& metrics) { + switch (map_type) { + case AggMapType::PROFILE: + output.profile_aggregations.emplace(key, + std::move(metrics)); + break; + case AggMapType::SYSTEM: + output.system_aggregations.emplace(key, std::move(metrics)); + break; + default: + output.aggregations.emplace(key, std::move(metrics)); + break; + } + return true; + }); + + output.success = true; + + auto min_tb = min_time_bucket_.load(std::memory_order_relaxed); + auto max_tb = max_time_bucket_.load(std::memory_order_relaxed); + if (min_tb != UINT64_MAX && max_tb != 0 && min_tb <= max_tb) { + std::string time_bounds_val = 
rocks::KeyCodec::encode_be64(min_tb); + time_bounds_val += rocks::KeyCodec::encode_be64(max_tb); + db_->put(TIME_BOUNDS_DB_KEY, time_bounds_val, rcf::AGGREGATION); + } + + DFTRACER_UTILS_LOG_INFO( + "Aggregation complete: %zu unique keys, %zu total events, %zu " + "files", + output.aggregations.size(), output.total_events_processed, + output.total_files_processed); + + return output; + } + + state_.total_files_processed = unique_files_.size(); + state_.success = true; + + DFTRACER_UTILS_LOG_INFO( + "Aggregation complete: %zu unique keys, %zu total events, %zu files", + state_.aggregations.size(), state_.total_events_processed, + state_.total_files_processed); + + return std::move(state_); +} + +std::size_t EventAggregator::scan(ScanCallback callback) const { + if (!rocksdb_mode_) { + std::size_t count = 0; + auto scan_map = [&](const AggregationMap& map, AggMapType map_type) { + for (auto& [key, metrics] : map) { + count++; + auto& mutable_metrics = + const_cast(metrics); + if (!callback(map_type, key, mutable_metrics)) return false; + } + return true; + }; + if (!scan_map(state_.aggregations, AggMapType::EVENT)) return count; + if (!scan_map(state_.profile_aggregations, AggMapType::PROFILE)) + return count; + scan_map(state_.system_aggregations, AggMapType::SYSTEM); + return count; + } + + return scan_shard_range(0, AGG_KEY_NUM_SHARDS, callback); +} + +std::size_t EventAggregator::scan_shard_range_raw_fn(std::uint16_t shard_begin, + std::uint16_t shard_end, + RawScanCallbackFn fn, + void* ctx) const { + if (!rocksdb_mode_ || !db_) return 0; + + char begin_key[2]; + begin_key[0] = static_cast(shard_begin >> 8); + begin_key[1] = static_cast(shard_begin); + + auto it = db_->new_iterator(rcf::AGGREGATION); + std::size_t count = 0; + for (it->Seek({begin_key, 2}); it->Valid(); it->Next()) { + auto key_slice = it->key(); + if (key_slice.size() < 3) continue; + std::uint16_t shard = (static_cast(key_slice[0]) << 8) | + static_cast(key_slice[1]); + if (shard >= 
AGG_KEY_NUM_SHARDS) break; + if (shard >= shard_end) break; + + auto val_slice = it->value(); + count++; + if (!fn(ctx, std::string_view(key_slice.data(), key_slice.size()), + std::string_view(val_slice.data(), val_slice.size()))) + break; + } + return count; +} + +std::size_t EventAggregator::scan_shard_range(std::uint16_t shard_begin, + std::uint16_t shard_end, + ScanCallback callback) const { + if (!rocksdb_mode_ || !db_) return 0; + + char begin_key[2]; + begin_key[0] = static_cast(shard_begin >> 8); + begin_key[1] = static_cast(shard_begin); + + auto it = db_->new_iterator(rcf::AGGREGATION); + std::size_t count = 0; + for (it->Seek({begin_key, 2}); it->Valid(); it->Next()) { + auto key_slice = it->key(); + if (key_slice.size() < 3) continue; + std::uint16_t shard = (static_cast(key_slice[0]) << 8) | + static_cast(key_slice[1]); + if (shard >= AGG_KEY_NUM_SHARDS) break; + if (shard >= shard_end) break; + + auto val_slice = it->value(); + auto deserialized = deserialize_agg_key( + std::string_view(key_slice.data(), key_slice.size())); + auto metrics = deserialize_agg_value( + std::string_view(val_slice.data(), val_slice.size())); + + count++; + if (!callback(deserialized.map_type, deserialized.key, metrics)) break; + } + return count; +} + +namespace { + +std::string serialize_observed_columns( + const std::set& extra_key_ids, + const std::set& custom_metric_names) { + namespace rocks = dftracer::utils::rocksdb; + auto& intern = aggregation_intern(); + std::string out; + auto put_str = [&](std::string_view s) { + rocks::KeyCodec::append_be32(out, static_cast(s.size())); + out.append(s.data(), s.size()); + }; + + rocks::KeyCodec::append_be32( + out, static_cast(extra_key_ids.size())); + for (auto id : extra_key_ids) put_str(intern.resolve(id)); + + rocks::KeyCodec::append_be32( + out, static_cast(custom_metric_names.size())); + for (const auto& name : custom_metric_names) put_str(name); + + return out; +} + +void deserialize_observed_columns(std::string_view 
data, + std::set& extra_key_ids, + std::set& custom_metric_names) { + namespace rocks = dftracer::utils::rocksdb; + auto& intern = aggregation_intern(); + std::size_t off = 0; + auto read_u32 = [&]() -> std::uint32_t { + if (off + 4 > data.size()) return 0; + auto v = rocks::KeyCodec::decode_be32(data.substr(off, 4)); + off += 4; + return v; + }; + auto read_str = [&]() -> std::string_view { + auto len = read_u32(); + if (off + len > data.size()) return {}; + auto sv = data.substr(off, len); + off += len; + return sv; + }; + + auto n_extra = read_u32(); + for (std::uint32_t i = 0; i < n_extra; ++i) { + auto sv = read_str(); + if (!sv.empty()) extra_key_ids.insert(intern.get_or_insert(sv)); + } + + auto n_metrics = read_u32(); + for (std::uint32_t i = 0; i < n_metrics; ++i) { + auto sv = read_str(); + if (!sv.empty()) custom_metric_names.emplace(sv); + } +} + +} // namespace + +static constexpr std::string_view COLUMNS_DB_KEY = "__observed_columns__"; + +EventAggregator::ObservedColumns EventAggregator::observed_columns() { + if (rocksdb_mode_ && db_) { + std::string val; + if (db_->get(COLUMNS_DB_KEY, &val, rcf::AGGREGATION).ok() && + !val.empty()) { + deserialize_observed_columns(val, observed_extra_key_ids_, + observed_custom_metric_names_); + } + + auto serialized = serialize_observed_columns( + observed_extra_key_ids_, observed_custom_metric_names_); + db_->put(COLUMNS_DB_KEY, serialized, rcf::AGGREGATION); + } + + ObservedColumns result; + result.extra_key_ids.assign(observed_extra_key_ids_.begin(), + observed_extra_key_ids_.end()); + result.custom_metric_names.assign(observed_custom_metric_names_.begin(), + observed_custom_metric_names_.end()); + return result; +} + +std::vector> +EventAggregator::take_trackers() { + return std::move(trackers_); +} + +static constexpr std::string_view TRACKER_DB_KEY = "__tracker__"; + +std::unique_ptr EventAggregator::build_global_tracker() { + auto tracker = std::make_unique(); + + for (const auto& t : trackers_) { + if (t) 
tracker->merge(*t); + } + trackers_.clear(); + + if (rocksdb_mode_ && db_) { + std::string val; + if (db_->get(TRACKER_DB_KEY, &val, rcf::AGGREGATION).ok() && + !val.empty()) { + tracker->merge(AssociationTracker::deserialize(val)); + } + } + + tracker->finalize(); + + if (rocksdb_mode_ && db_) { + db_->put(TRACKER_DB_KEY, tracker->serialize(), rcf::AGGREGATION); + } + + return tracker; +} + +std::shared_ptr +EventAggregator::open_with_merge_operator(const std::string& index_path) { + auto agg_merge_op = std::make_shared(); + auto sys_merge_op = std::make_shared(); + auto cf_override = [agg_merge_op, sys_merge_op]( + const std::string& cf_name, + ::rocksdb::ColumnFamilyOptions& opts) { + if (cf_name == rcf::AGGREGATION) { + opts.merge_operator = agg_merge_op; + + ::rocksdb::BlockBasedTableOptions bbt; + bbt.block_size = 32 * 1024; + bbt.format_version = 5; + bbt.index_block_restart_interval = 16; + bbt.whole_key_filtering = false; + opts.table_factory.reset(::rocksdb::NewBlockBasedTableFactory(bbt)); + + opts.level0_file_num_compaction_trigger = 2; + opts.max_bytes_for_level_multiplier = 20; + +#ifdef DFTRACER_UTILS_ENABLE_ZSTD + opts.compression = ::rocksdb::kZSTD; + opts.compression_opts.level = 9; + opts.compression_opts.max_dict_bytes = 262144; + opts.compression_opts.zstd_max_train_bytes = 1048576; + opts.compression_opts.enabled = true; + opts.bottommost_compression = ::rocksdb::kZSTD; + opts.bottommost_compression_opts.level = 9; + opts.bottommost_compression_opts.max_dict_bytes = 262144; + opts.bottommost_compression_opts.zstd_max_train_bytes = 1048576; + opts.bottommost_compression_opts.enabled = true; +#elif defined(DFTRACER_UTILS_ENABLE_LZ4) + opts.compression = ::rocksdb::kLZ4Compression; + opts.bottommost_compression = ::rocksdb::kLZ4Compression; +#else + opts.compression = ::rocksdb::kZlibCompression; + opts.bottommost_compression = ::rocksdb::kZlibCompression; +#endif + } else if (cf_name == rcf::SYSTEM_METRICS) { + opts.merge_operator = 
sys_merge_op; +#ifdef DFTRACER_UTILS_ENABLE_ZSTD + opts.compression = ::rocksdb::kZSTD; + opts.bottommost_compression = ::rocksdb::kZSTD; +#elif defined(DFTRACER_UTILS_ENABLE_LZ4) + opts.compression = ::rocksdb::kLZ4Compression; + opts.bottommost_compression = ::rocksdb::kLZ4Compression; +#else + opts.compression = ::rocksdb::kZlibCompression; + opts.bottommost_compression = ::rocksdb::kZlibCompression; +#endif + } + }; + auto& mgr = rocksdb::RocksDBManager::instance(); + mgr.reset(index_path); + return mgr.get_or_open( + index_path, rocksdb::RocksDatabase::OpenMode::ReadWrite, cf_override); +} + +std::shared_ptr +EventAggregator::open_read_only_with_merge_operator( + const std::string& index_path) { + auto agg_merge_op = std::make_shared(); + auto sys_merge_op = std::make_shared(); + auto cf_override = [agg_merge_op, sys_merge_op]( + const std::string& cf_name, + ::rocksdb::ColumnFamilyOptions& opts) { + if (cf_name == rcf::AGGREGATION) { + opts.merge_operator = agg_merge_op; + } else if (cf_name == rcf::SYSTEM_METRICS) { + opts.merge_operator = sys_merge_op; + } + }; + auto& mgr = rocksdb::RocksDBManager::instance(); + mgr.reset(index_path); + return mgr.get_or_open( + index_path, rocksdb::RocksDatabase::OpenMode::ReadOnly, cf_override); +} + +void EventAggregator::update_time_bounds(std::uint64_t time_bucket) { + std::uint64_t old_min = min_time_bucket_.load(std::memory_order_relaxed); + while (time_bucket < old_min && + !min_time_bucket_.compare_exchange_weak(old_min, time_bucket, + std::memory_order_relaxed)) { + } + + std::uint64_t old_max = max_time_bucket_.load(std::memory_order_relaxed); + while (time_bucket > old_max && + !max_time_bucket_.compare_exchange_weak(old_max, time_bucket, + std::memory_order_relaxed)) { + } +} + +std::uint64_t EventAggregator::min_time_bucket() const { + return min_time_bucket_.load(std::memory_order_relaxed); +} + +std::uint64_t EventAggregator::max_time_bucket() const { + return 
max_time_bucket_.load(std::memory_order_relaxed); +} + +EventAggregator::TimeBoundsResult EventAggregator::query_time_bounds() const { + TimeBoundsResult result; + + if (rocksdb_mode_ && db_) { + std::string val; + if (db_->get(TIME_BOUNDS_DB_KEY, &val, rcf::AGGREGATION).ok() && + val.size() >= 16) { + result.min_time_bucket = rocks::KeyCodec::decode_be64( + std::string_view(val).substr(0, 8)); + result.max_time_bucket = rocks::KeyCodec::decode_be64( + std::string_view(val).substr(8, 8)); + result.valid = true; + return result; + } + } + + std::uint64_t min_val = min_time_bucket_.load(std::memory_order_relaxed); + std::uint64_t max_val = max_time_bucket_.load(std::memory_order_relaxed); + result.min_time_bucket = min_val; + result.max_time_bucket = max_val; + result.valid = + (min_val != UINT64_MAX && max_val != 0 && min_val <= max_val); + return result; +} + +} // namespace dftracer::utils::utilities::composites::dft::aggregators diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.cpp deleted file mode 100644 index 5eb27e5f..00000000 --- a/src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.cpp +++ /dev/null @@ -1,56 +0,0 @@ -#include -#include -#include - -namespace dftracer::utils::utilities::composites::dft::aggregators { - -void EventAggregatorUtility::merge_chunk( - ChunkAggregationOutput&& chunk_output) { - if (!chunk_output.success) return; - - state_.total_events_processed += chunk_output.events_processed; - state_.total_bytes_processed += chunk_output.bytes_processed; - unique_files_.insert(chunk_output.file_path); - - auto merge_into = [](AggregationMap& dst, AggregationMap& src) { - for (auto& [key, metrics] : src) { - auto it = dst.find(key); - if (it == dst.end()) { - dst.emplace(key, std::move(metrics)); - } else { - it->second.merge_from(metrics); - } - } - }; - merge_into(state_.aggregations, 
chunk_output.aggregations); - merge_into(state_.profile_aggregations, chunk_output.profile_aggregations); - merge_into(state_.system_aggregations, chunk_output.system_aggregations); - - if (chunk_output.local_tracker) { - state_.trackers.push_back(std::move(chunk_output.local_tracker)); - } -} - -EventAggregatorUtilityOutput EventAggregatorUtility::finalize() { - state_.total_files_processed = unique_files_.size(); - state_.success = true; - - DFTRACER_UTILS_LOG_INFO( - "Aggregation complete: %zu unique keys, %zu total events, %zu files", - state_.aggregations.size(), state_.total_events_processed, - state_.total_files_processed); - - return std::move(state_); -} - -coro::CoroTask EventAggregatorUtility::process( - const EventAggregatorUtilityInput& input) { - for (auto& output : const_cast&>( - input.chunk_outputs)) { - merge_chunk(std::move(output)); - } - - co_return finalize(); -} - -} // namespace dftracer::utils::utilities::composites::dft::aggregators diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.cpp index a0d60260..ba0606d8 100644 --- a/src/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.cpp @@ -1,449 +1,706 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include #include #include +#include +#include +#include #include #include #include -#include +#include #include +#include +#include #include +#include #include #include namespace dftracer::utils::utilities::composites::dft::aggregators { -std::uint64_t PerfettoTraceWriterUtility::generate_synthetic_tid( - const AggregationKey& key) const { - dftracer::utils::utilities::hash::HasherUtility hasher; - std::string key_str = - std::string(key.cat()) + ":" + std::string(key.name()) + ":" + - 
std::to_string(key.pid) + ":" + std::to_string(key.time_bucket); +namespace { - if (!key.fhash().empty()) { - key_str += ":"; - key_str += key.fhash(); +class JsonBuffer { + public: + explicit JsonBuffer(std::size_t capacity) + : data_(std::make_unique(capacity)), + capacity_(capacity), + size_(0) {} + + void append(const char* ptr, std::size_t len) { + std::memcpy(data_.get() + size_, ptr, len); + size_ += len; + } + + void append(std::string_view sv) { append(sv.data(), sv.size()); } + + void push_back(char c) { data_[size_++] = c; } + + template + void append_literal(const char (&lit)[N]) { + append(lit, N - 1); + } + + void append_u64(std::uint64_t v) { + auto res = + std::to_chars(data_.get() + size_, data_.get() + capacity_, v); + size_ = static_cast(res.ptr - data_.get()); + } + + void append_i64(std::int64_t v) { + auto res = + std::to_chars(data_.get() + size_, data_.get() + capacity_, v); + size_ = static_cast(res.ptr - data_.get()); } - if (key.extra_keys) { - auto& intern = aggregation_intern(); - for (const auto& [k, v] : *key.extra_keys) { - key_str += ":"; - key_str += intern.resolve(k); - key_str += "="; - key_str += intern.resolve(v); + void append_double(double value) { + int n; + if (std::abs(value - std::round(value)) < 1e-9) { + n = std::snprintf(data_.get() + size_, capacity_ - size_, "%lld", + static_cast(std::round(value))); + } else { + n = std::snprintf(data_.get() + size_, capacity_ - size_, "%.2f", + value); } + size_ += static_cast(n); } - // CPU-bound hash — .get() intentional - std::size_t hash = hasher.process(key_str).get().value; - return 1000000000ULL + (hash % 1000000ULL); -} + int format(const char* fmt, ...) 
__attribute__((format(printf, 2, 3))) { + va_list ap; + va_start(ap, fmt); + int n = std::vsnprintf(data_.get() + size_, capacity_ - size_, fmt, ap); + va_end(ap); + size_ += static_cast(n); + return n; + } -void PerfettoTraceWriterUtility::append_json_string( - std::string& buffer, std::string_view str) const { - for (char c : str) { - switch (c) { - case '"': - buffer += "\\\""; - break; - case '\\': - buffer += "\\\\"; - break; - case '\b': - buffer += "\\b"; - break; - case '\f': - buffer += "\\f"; - break; - case '\n': - buffer += "\\n"; - break; - case '\r': - buffer += "\\r"; - break; - case '\t': - buffer += "\\t"; - break; - default: - if (c >= 32 && c < 127) { - buffer += c; - } else { - char hex[7]; - std::snprintf(hex, sizeof(hex), "\\u%04x", - (unsigned char)c); - buffer += hex; - } - break; + void append_json_escaped(std::string_view str) { + const char* start = str.data(); + const char* end = start + str.size(); + const char* safe_start = start; + for (const char* p = start; p < end; ++p) { + unsigned char c = static_cast(*p); + if (c >= 32 && c < 127 && c != '"' && c != '\\') continue; + if (p > safe_start) { + append(safe_start, static_cast(p - safe_start)); + } + switch (c) { + case '"': + append_literal("\\\""); + break; + case '\\': + append_literal("\\\\"); + break; + case '\b': + append_literal("\\b"); + break; + case '\f': + append_literal("\\f"); + break; + case '\n': + append_literal("\\n"); + break; + case '\r': + append_literal("\\r"); + break; + case '\t': + append_literal("\\t"); + break; + default: + format("\\u%04x", c); + break; + } + safe_start = p + 1; + } + if (end > safe_start) { + append(safe_start, static_cast(end - safe_start)); } } -} -void PerfettoTraceWriterUtility::append_double(std::string& buffer, - double value) const { - char temp[64]; - if (std::abs(value - std::round(value)) < 1e-9) { - std::snprintf(temp, sizeof(temp), "%lld", - static_cast(std::round(value))); - } else { - std::snprintf(temp, sizeof(temp), "%.2f", 
value); + const char* data() const { return data_.get(); } + std::size_t size() const { return size_; } + std::size_t capacity() const { return capacity_; } + std::size_t remaining() const { return capacity_ - size_; } + bool empty() const { return size_ == 0; } + void clear() { size_ = 0; } + ByteView view() const { return ByteView(data_.get(), size_); } + + private: + std::unique_ptr data_; + std::size_t capacity_; + std::size_t size_; +}; + +class ByteReader { + public: + explicit ByteReader(std::string_view data) : data_(data), off_(0) {} + + std::uint8_t u8() { return static_cast(data_[off_++]); } + + std::uint16_t be16() { + auto hi = static_cast(data_[off_++]); + auto lo = static_cast(data_[off_++]); + return static_cast((hi << 8) | lo); } - buffer += temp; -} -void PerfettoTraceWriterUtility::append_metric_stats( - std::string& buffer, const MetricStats& stats, std::uint64_t count, - bool compute_statistics, bool compute_percentiles, - const std::vector& percentiles) const { - char temp[256]; + void skip(std::size_t n) { off_ += n; } - std::snprintf(temp, sizeof(temp), "\"sum\":%llu", - static_cast(stats.total)); - buffer += temp; + std::uint64_t varint() { + std::uint64_t v = 0; + unsigned shift = 0; + while (off_ < data_.size()) { + auto b = static_cast(data_[off_++]); + v |= static_cast(b & 0x7F) << shift; + if ((b & 0x80) == 0) return v; + shift += 7; + } + return v; + } - buffer += ",\"avg\":"; - append_double(buffer, stats.mean); + std::uint64_t be64() { + std::uint64_t v = 0; + for (int i = 0; i < 8; ++i) { + v = (v << 8) | static_cast(data_[off_++]); + } + return v; + } - if (stats.min != std::numeric_limits::max()) { - std::snprintf(temp, sizeof(temp), ",\"min\":%llu", - static_cast(stats.min)); - buffer += temp; + double f64() { + std::uint64_t bits = be64(); + double v; + std::memcpy(&v, &bits, 8); + return v; } - if (stats.max > 0) { - std::snprintf(temp, sizeof(temp), ",\"max\":%llu", - static_cast(stats.max)); - buffer += temp; + 
std::string_view str() { + auto len_hi = static_cast(data_[off_++]); + auto len_lo = static_cast(data_[off_++]); + std::size_t len = (static_cast(len_hi) << 8) | len_lo; + auto s = data_.substr(off_, len); + off_ += len; + return s; } - if (compute_statistics && count >= 2) { - buffer += ",\"std\":"; - append_double(buffer, stats.get_stddev(count)); + void skip_blob() { + std::uint32_t len = 0; + for (int i = 0; i < 4; ++i) { + len = (len << 8) | static_cast(data_[off_++]); + } + off_ += len; } - if (compute_statistics && count >= 3) { - buffer += ",\"skw\":"; - append_double(buffer, stats.get_skewness(count)); + + std::size_t offset() const { return off_; } + + private: + std::string_view data_; + std::size_t off_; +}; + +inline void emit_metric_stats_from_bytes(ByteReader& r, std::string_view prefix, + bool compute_statistics, + JsonBuffer& buf) { + auto fmt = r.u8(); + if (fmt == METRIC_FMT_COMPACT) { + auto val = r.varint(); + if (val == 0) return; + buf.append_literal(",\""); + buf.append(prefix); + buf.append_literal("_sum\":"); + buf.append_u64(val); + buf.append_literal(",\""); + buf.append(prefix); + buf.append_literal("_min\":"); + buf.append_u64(val); + buf.append_literal(",\""); + buf.append(prefix); + buf.append_literal("_max\":"); + buf.append_u64(val); + return; } - if (compute_statistics && count >= 4) { - buffer += ",\"krt\":"; - append_double(buffer, stats.get_kurtosis(count)); + // FULL or FULL_WITH_SKETCH + auto count = r.varint(); + auto total = r.varint(); + auto min = r.varint(); + auto max = r.varint(); + (void)r.f64(); // mean + auto m2 = r.f64(); + if (fmt == METRIC_FMT_FULL_WITH_SKETCH) { + r.skip_blob(); } - if (compute_percentiles && stats.sketch && !stats.sketch->empty()) { - for (double p : percentiles) { - double percentile_value = stats.sketch->quantile(p); - int p_percent = static_cast(p * 100); - std::snprintf(temp, sizeof(temp), ",\"p%d\":", p_percent); - buffer += temp; - append_double(buffer, percentile_value); - } + 
buf.append_literal(",\""); + buf.append(prefix); + buf.append_literal("_sum\":"); + buf.append_u64(total); + + if (min != std::numeric_limits::max()) { + buf.append_literal(",\""); + buf.append(prefix); + buf.append_literal("_min\":"); + buf.append_u64(min); + } + if (max > 0) { + buf.append_literal(",\""); + buf.append(prefix); + buf.append_literal("_max\":"); + buf.append_u64(max); } -} -void PerfettoTraceWriterUtility::append_event_args( - std::string& buffer, const AggregationKey& key, - const AggregationMetrics& metrics, bool compute_statistics, - bool compute_percentiles, const std::vector& percentiles, - std::uint64_t real_tid) const { - char temp[512]; - - buffer += "\"hhash\":\""; - append_json_string(buffer, key.hhash()); - buffer += "\""; - - if (real_tid > 0) { - std::snprintf(temp, sizeof(temp), ",\"real_tid\":%llu", - static_cast(real_tid)); - buffer += temp; + if (compute_statistics && count >= 2) { + // `m2` holds the raw power sum + // `sum_x^2`, not Welford's central M2. Convert to central + // moment then to sample variance. Clamp at zero for float + // cancellation. + const double n = static_cast(count); + const double sum_x = static_cast(total); + const double central = m2 - sum_x * sum_x / n; + const double var = (central > 0.0 ? central : 0.0) / (n - 1.0); + const double stddev = var > 0.0 ? 
std::sqrt(var) : 0.0; + buf.append_literal(",\""); + buf.append(prefix); + buf.append_literal("_std\":"); + buf.append_double(stddev); } +} - if (!key.fhash().empty()) { - buffer += ",\"fhash\":\""; - append_json_string(buffer, key.fhash()); - buffer += "\""; +inline void skip_metric_stats(ByteReader& r) { + auto fmt = r.u8(); + if (fmt == METRIC_FMT_COMPACT) { + r.varint(); + return; } + // FULL: count, total, min, max (varints), 2 doubles (mean, m2) + r.varint(); + r.varint(); + r.varint(); + r.varint(); + r.skip(16); + if (fmt == METRIC_FMT_FULL_WITH_SKETCH) r.skip_blob(); +} - if (key.extra_keys) { - auto& intern = aggregation_intern(); - for (const auto& [k, v] : *key.extra_keys) { - buffer += ",\""; - append_json_string(buffer, intern.resolve(k)); - buffer += "\":\""; - append_json_string(buffer, intern.resolve(v)); - buffer += "\""; +// Compress `data` into a standalone gzip member so the result can be written +// at any offset in a concatenated-gzip file. +coro::CoroTask compress_to_gzip_member(int level, ByteView data, + std::vector& out) { + out.clear(); + compression::zlib::ManualStreamingCompressorUtility comp( + level, compression::zlib::CompressionFormat::GZIP); + if (data.size() > 0) { + auto gen = comp.compress(data); + while (auto view = co_await gen.next()) { + const auto* p = + reinterpret_cast(view->data()); + out.insert(out.end(), p, p + view->size()); } } - - std::snprintf(temp, sizeof(temp), ",\"count\":%llu", - static_cast(metrics.count)); - buffer += temp; - - buffer += ",\"dur\":{"; - append_metric_stats(buffer, metrics.duration, metrics.count, - compute_statistics, compute_percentiles, percentiles); - buffer += "}"; - - if (metrics.size.total > 0) { - buffer += ",\"size\":{"; - append_metric_stats(buffer, metrics.size, metrics.count, - compute_statistics, compute_percentiles, - percentiles); - buffer += "}"; + auto fin = comp.finalize_stream(); + while (auto view = co_await fin.next()) { + const auto* p = reinterpret_cast(view->data()); + 
out.insert(out.end(), p, p + view->size()); } + co_return true; +} - if (metrics.custom_metrics) - for (const auto& [metric_name, metric_stats] : - *metrics.custom_metrics) { - buffer += ",\""; - append_json_string(buffer, metric_name); - buffer += "\":{"; - append_metric_stats(buffer, metric_stats, metrics.count, - compute_statistics, compute_percentiles, - percentiles); - buffer += "}"; +coro::CoroTask write_shard_events( + std::size_t worker_idx, std::uint16_t shard_begin, std::uint16_t shard_end, + std::size_t flush_threshold, std::size_t buffer_capacity, + fileio::parallel::ParallelWriter* writer, + const PerfettoTraceWriterInput* input) { + using namespace dftracer::utils::utilities; + + JsonBuffer buf(buffer_capacity); + std::vector compressed; + + auto flush_buffer = [&]() -> coro::CoroTask { + if (buf.empty()) co_return 0; + int rc; + if (input->compress) { + co_await compress_to_gzip_member(input->compression_level, + buf.view(), compressed); + rc = co_await writer->write_chunk( + worker_idx, + ByteView(reinterpret_cast(compressed.data()), + compressed.size())); + } else { + rc = co_await writer->write_chunk(worker_idx, buf.view()); } + buf.clear(); + co_return rc; + }; + + std::size_t local_keys = 0; + std::vector pending_chunks; + input->aggregator->scan_shard_range_raw( + shard_begin, shard_end, + [&](std::string_view key_bytes, std::string_view value_bytes) { + local_keys++; + + // Layout: shard(2) map_type(1) cat(varint ID) name(varint ID) + // pid(varint) tid(varint) hhash(varint ID) fhash(varint ID) + // time_bucket(varint) num_extra(2) [k(varint ID) v(varint + // ID)]* + auto& intern = aggregation_intern(); + ByteReader kr(key_bytes); + kr.skip(2); // shard + (void)kr.u8(); // map_type + auto cat = intern.resolve(static_cast(kr.varint())); + auto name = intern.resolve(static_cast(kr.varint())); + auto pid = kr.varint(); + auto tid = kr.varint(); + auto hhash_id = static_cast(kr.varint()); + auto hhash = + hhash_id ? 
intern.resolve(hhash_id) : std::string_view{}; + auto fhash_id = static_cast(kr.varint()); + auto fhash = + fhash_id ? intern.resolve(fhash_id) : std::string_view{}; + auto time_bucket = kr.varint(); + auto num_extra = kr.be16(); + + // For REGULAR, pre-parse ts/te by skipping through value bytes. + std::uint64_t regular_ts = 0, regular_te = 0; + if (input->format == PerfettoEventFormat::REGULAR) { + ByteReader tmp(value_bytes); + tmp.varint(); // count + skip_metric_stats(tmp); // duration + skip_metric_stats(tmp); // size + regular_ts = tmp.varint(); + regular_te = tmp.varint(); + } + + // Emit event header + if (input->format == PerfettoEventFormat::COUNTER) { + buf.append_literal("{\"name\":\""); + buf.append_json_escaped(name); + buf.append_literal("\",\"cat\":\""); + buf.append_json_escaped(cat); + buf.append_literal("\",\"ts\":"); + buf.append_u64(time_bucket); + buf.append_literal(",\"ph\":\"C\",\"pid\":"); + buf.append_u64(pid); + buf.append_literal(",\"tid\":"); + buf.append_u64(tid); + buf.append_literal(",\"args\":{"); + } else if (input->format == PerfettoEventFormat::REGULAR) { + std::uint64_t duration = regular_te - regular_ts; + buf.append_literal("{\"name\":\""); + buf.append_json_escaped(name); + buf.append_literal("\",\"cat\":\""); + buf.append_json_escaped(cat); + buf.append_literal("\",\"ts\":"); + buf.append_u64(regular_ts); + buf.append_literal(",\"dur\":"); + buf.append_u64(duration); + buf.append_literal(",\"ph\":\"X\",\"pid\":"); + buf.append_u64(pid); + buf.append_literal(",\"tid\":"); + buf.append_u64(tid); + buf.append_literal(",\"args\":{"); + } + + // hhash + buf.append_literal("\"hhash\":\""); + buf.append_json_escaped(hhash); + buf.append_literal("\""); + + // fhash + if (!fhash.empty()) { + buf.append_literal(",\"fhash\":\""); + buf.append_json_escaped(fhash); + buf.append_literal("\""); + } + + // extra keys (varint intern IDs) + for (std::uint16_t i = 0; i < num_extra; ++i) { + auto ek = + 
intern.resolve(static_cast(kr.varint())); + auto ev = + intern.resolve(static_cast(kr.varint())); + buf.append_literal(",\""); + buf.append_json_escaped(ek); + buf.append_literal("\":\""); + buf.append_json_escaped(ev); + buf.append_literal("\""); + } + + // Value bytes: count, dur, size, ts, te, parent_pid, num_custom, + // customs + ByteReader vr(value_bytes); + auto count = vr.varint(); + + buf.append_literal(",\"dft_cnt\":"); + buf.append_u64(count); + + emit_metric_stats_from_bytes(vr, "dur", input->compute_statistics, + buf); + emit_metric_stats_from_bytes(vr, "ret", input->compute_statistics, + buf); + + auto m_ts = vr.varint(); + auto m_te = vr.varint(); + auto m_parent_pid = vr.varint(); + + // Custom metrics come AFTER ts/te in the stream but BEFORE ts/te + // in the JSON output order, so emit them now. + auto num_custom = vr.varint(); + for (std::uint64_t i = 0; i < num_custom; ++i) { + auto cname = vr.str(); + emit_metric_stats_from_bytes(vr, cname, + input->compute_statistics, buf); + } + + buf.append_literal(",\"ts\":"); + buf.append_u64(m_ts); + buf.append_literal(",\"te\":"); + buf.append_u64(m_te); + + // Compute effective parent_pid (tracker may override) + std::uint64_t effective_parent = m_parent_pid; + if (input->tracker && input->agg_config && + input->agg_config->track_process_parents && + input->tracker->has_process_tree()) { + auto pp = input->tracker->get_parent_pid(pid); + if (pp != 0) effective_parent = pp; + } + + // Boundary associations (emitted between ts/te and parent_pid) + if (input->tracker && input->agg_config && + !input->agg_config->boundary_events.empty() && + input->tracker->has_boundary_events()) { + auto mid = (m_ts + m_te) / 2; + auto bpid = effective_parent > 0 ? 
effective_parent : pid; + auto assoc = + input->tracker->get_boundary_associations(bpid, mid); + for (const auto& [an, av] : assoc) { + buf.append_literal(",\""); + buf.append_json_escaped(an); + buf.append_literal("\":\""); + buf.append_json_escaped(av); + buf.append_literal("\""); + } + } + + if (effective_parent > 0) { + buf.append_literal(",\"parent_pid\":"); + buf.append_u64(effective_parent); + } - buffer += ",\"ts\":"; - std::snprintf(temp, sizeof(temp), "%llu", - static_cast(metrics.ts)); - buffer += temp; - buffer += ",\"te\":"; - std::snprintf(temp, sizeof(temp), "%llu", - static_cast(metrics.te)); - buffer += temp; - - if (metrics.boundary_associations) - for (const auto& [assoc_name, assoc_value] : - *metrics.boundary_associations) { - buffer += ",\""; - append_json_string(buffer, assoc_name); - buffer += "\":\""; - append_json_string(buffer, assoc_value); - buffer += "\""; + buf.append_literal("}}\n"); + + if (buf.size() >= flush_threshold) { + pending_chunks.emplace_back(buf.data(), buf.size()); + buf.clear(); + } + return true; + }); + + for (auto& s : pending_chunks) { + if (input->compress) { + co_await compress_to_gzip_member( + input->compression_level, + ByteView(reinterpret_cast(s.data()), + s.size()), + compressed); + auto rc = co_await writer->write_chunk( + worker_idx, + ByteView(reinterpret_cast(compressed.data()), + compressed.size())); + if (rc != 0) co_return false; + } else { + auto rc = co_await writer->write_chunk( + worker_idx, + ByteView(reinterpret_cast(s.data()), + s.size())); + if (rc != 0) co_return false; } + } + + if (co_await flush_buffer() != 0) co_return false; - if (metrics.parent_pid > 0) { - std::snprintf(temp, sizeof(temp), ",\"parent_pid\":%llu", - static_cast(metrics.parent_pid)); - buffer += temp; + if (input->keys_written) { + input->keys_written->fetch_add(local_keys, std::memory_order_relaxed); } + co_return true; } +} // namespace + coro::CoroTask PerfettoTraceWriterUtility::process( const 
PerfettoTraceWriterInput& input) { - const auto& aggregations = input.resolver_output.aggregations.aggregations; - const auto& root_pids = input.resolver_output.root_pids; + using namespace dftracer::utils::utilities; + + constexpr std::size_t HEADER_BUFFER_BYTES = 4 * 1024 * 1024; + constexpr std::size_t DEFAULT_FLUSH_BYTES = 12 * 1024 * 1024; + constexpr std::size_t BUFFER_HEADROOM_BYTES = 4 * 1024 * 1024; + + auto layout_info = fileio::parallel::detect_layout(input.output_path); + const std::size_t executor_threads = + this->context().get_executor()->get_num_threads(); + const std::size_t baseline = + std::min(executor_threads, AGG_KEY_NUM_SHARDS); + // Mirror make_writer's padded-layout gate so sizing picks the matching + // flush_threshold. + const bool uses_padded = + layout_info.layout == fileio::parallel::FileLayout::STRIPED && + input.compress && + layout_info.stripe_size >= fileio::parallel::MIN_PADDED_STRIPE_BYTES; + const auto sizing = fileio::parallel::compute_writer_sizing( + layout_info, baseline, DEFAULT_FLUSH_BYTES, BUFFER_HEADROOM_BYTES, + uses_padded); + const std::size_t num_workers = sizing.num_workers; + const std::size_t flush_threshold = sizing.flush_threshold; + const std::size_t buffer_capacity = sizing.buffer_capacity; + fileio::parallel::WriterConfig wcfg; + wcfg.layout = layout_info.layout; + wcfg.stripe_size = layout_info.stripe_size; + wcfg.gzip = input.compress; + auto writer = fileio::parallel::make_writer(wcfg); + if (co_await writer->open(input.output_path, num_workers, input.compress, + &this->context()) != 0) { + co_return false; + } - std::string buffer; - buffer.reserve(1024 * 1024); + auto write_section = [&](ByteView data, + bool is_footer) -> coro::CoroTask { + std::vector compressed; + ByteView payload = data; + if (input.compress) { + co_await compress_to_gzip_member(input.compression_level, data, + compressed); + payload = + ByteView(reinterpret_cast(compressed.data()), + compressed.size()); + } + int rc = is_footer ? 
co_await writer->write_footer(payload) + : co_await writer->write_header(payload); + co_return rc == 0; + }; - buffer += "[\n"; + JsonBuffer header(HEADER_BUFFER_BYTES); + if (input.emit_header) header.append_literal("[\n"); - if (input.resolver_output.trace_duration > 0 || - !input.resolver_output.boundary_ranges.empty()) { - buffer += + if (input.emit_header && + (input.trace_duration > 0 || !input.boundary_ranges.empty())) { + header.append_literal( "{\"name\":\"trace_metadata\",\"cat\":\"metadata\",\"ph\":" - "\"M\",\"args\":{"; - - char temp[512]; - std::snprintf(temp, sizeof(temp), "\"trace_duration\":%llu", - static_cast( - input.resolver_output.trace_duration)); - buffer += temp; + "\"M\",\"args\":{"); + header.format("\"trace_duration\":%llu", + static_cast(input.trace_duration)); - if (!input.resolver_output.boundary_ranges.empty()) { - buffer += ",\"boundary_ranges\":{"; + if (!input.boundary_ranges.empty()) { + header.append_literal(",\"boundary_ranges\":{"); bool first_boundary = true; for (const auto& [boundary_name, value_map] : - input.resolver_output.boundary_ranges) { - if (!first_boundary) { - buffer += ","; - } + input.boundary_ranges) { + if (!first_boundary) header.append_literal(","); first_boundary = false; - - buffer += "\""; - append_json_string(buffer, boundary_name); - buffer += "\":{"; - + header.append_literal("\""); + header.append_json_escaped(boundary_name); + header.append_literal("\":{"); bool first_value = true; for (const auto& [value, time_range] : value_map) { - if (!first_value) { - buffer += ","; - } + if (!first_value) header.append_literal(","); first_value = false; - - buffer += "\""; - append_json_string(buffer, value); - buffer += "\":{"; - - std::snprintf( - temp, sizeof(temp), "\"ts\":%llu,\"te\":%llu", + header.append_literal("\""); + header.append_json_escaped(value); + header.append_literal("\":{"); + header.format( + "\"ts\":%llu,\"te\":%llu", static_cast(time_range.ts), static_cast(time_range.te)); - buffer += 
temp; - - buffer += "}"; + header.append_literal("}"); } - - buffer += "}"; + header.append_literal("}"); } - - buffer += "}"; + header.append_literal("}"); } - - buffer += "}}\n"; + header.append_literal("}}\n"); } - if (!root_pids.empty()) { - for (std::uint64_t pid : root_pids) { - buffer += + if (input.emit_header) { + for (std::uint64_t pid : input.root_pids) { + header.format( "{\"name\":\"root_process\",\"cat\":\"dftracer\",\"ph\":" - "\"M\",\"pid\":"; - buffer += std::to_string(pid); - buffer += ",\"tid\":"; - buffer += std::to_string(pid); - buffer += ",\"args\":{\"is_root\":\"true\"}}\n"; + "\"M\",\"pid\":%llu,\"tid\":%llu," + "\"args\":{\"is_root\":\"true\"}}\n", + static_cast(pid), + static_cast(pid)); } } - for (const auto& [key, metrics] : aggregations) { - char temp[512]; - - if (input.format == PerfettoEventFormat::COUNTER) { - buffer += "{\"name\":\""; - append_json_string(buffer, key.name()); - buffer += "\",\"cat\":\""; - append_json_string(buffer, key.cat()); - std::snprintf(temp, sizeof(temp), - "\",\"ts\":%llu,\"ph\":\"C\",\"pid\":%llu," - "\"tid\":%llu,\"args\":{", - static_cast(key.time_bucket), - static_cast(key.pid), - static_cast(key.tid)); - buffer += temp; - append_event_args(buffer, key, metrics, input.compute_statistics, - input.compute_percentiles, input.percentiles); - buffer += "}}\n"; - - } else if (input.format == PerfettoEventFormat::REGULAR) { - std::uint64_t duration = metrics.te - metrics.ts; - - buffer += "{\"name\":\""; - append_json_string(buffer, key.name()); - buffer += "\",\"cat\":\""; - append_json_string(buffer, key.cat()); - std::snprintf( - temp, sizeof(temp), - "\",\"ts\":%llu,\"dur\":%llu,\"ph\":\"X\",\"pid\":%llu," - "\"tid\":%llu,\"args\":{", - static_cast(metrics.ts), - static_cast(duration), - static_cast(key.pid), - static_cast(key.tid)); - buffer += temp; - append_event_args(buffer, key, metrics, input.compute_statistics, - input.compute_percentiles, input.percentiles); - buffer += "}}\n"; - - } else { - 
std::string event_id = - std::string(key.cat()) + ":" + std::string(key.name()) + ":" + - std::to_string(key.pid) + ":" + std::to_string(key.tid) + ":" + - std::to_string(key.time_bucket); - if (!key.fhash().empty()) { - event_id += ":"; - event_id += key.fhash(); - } - if (key.extra_keys) { - auto& intern = aggregation_intern(); - for (const auto& [k, v] : *key.extra_keys) { - event_id += ":"; - event_id += intern.resolve(k); - event_id += "="; - event_id += intern.resolve(v); - } + if (!co_await write_section(header.view(), false)) co_return false; + + std::atomic worker_success{true}; + const std::uint16_t range_begin = input.shard_begin; + const std::uint16_t range_end = + input.shard_end == 0 ? AGG_KEY_NUM_SHARDS : input.shard_end; + const std::uint16_t range_width = + range_end > range_begin + ? static_cast(range_end - range_begin) + : std::uint16_t{0}; + std::uint16_t shards_per_worker = + num_workers > 0 ? static_cast(range_width / num_workers) + : std::uint16_t{0}; + + co_await this->context().scope( + [&](CoroScope& child) -> coro::CoroTask { + for (std::size_t i = 0; i < num_workers; ++i) { + auto shard_begin = static_cast( + range_begin + i * shards_per_worker); + auto shard_end = + (i + 1 == num_workers) + ? 
range_end + : static_cast( + range_begin + (i + 1) * shards_per_worker); + const auto* input_ptr = &input; + auto* success_ptr = &worker_success; + auto* writer_ptr = writer.get(); + child.spawn([i, shard_begin, shard_end, flush_threshold, + buffer_capacity, writer_ptr, input_ptr, + success_ptr](CoroScope&) -> coro::CoroTask { + auto ok = co_await write_shard_events( + i, shard_begin, shard_end, flush_threshold, + buffer_capacity, writer_ptr, input_ptr); + if (!ok) success_ptr->store(false); + }); } + co_return; + }); - buffer += "{\"name\":\""; - append_json_string(buffer, key.name()); - buffer += "\",\"cat\":\""; - append_json_string(buffer, key.cat()); - std::snprintf(temp, sizeof(temp), - "\",\"ts\":%llu,\"ph\":\"b\",\"pid\":%llu," - "\"tid\":%llu,\"id\":\"", - static_cast(metrics.ts), - static_cast(key.pid), - static_cast(key.tid)); - buffer += temp; - append_json_string(buffer, event_id); - buffer += "\",\"args\":{"; - append_event_args(buffer, key, metrics, input.compute_statistics, - input.compute_percentiles, input.percentiles); - buffer += "}}\n"; - - buffer += "{\"name\":\""; - append_json_string(buffer, key.name()); - buffer += "\",\"cat\":\""; - append_json_string(buffer, key.cat()); - std::snprintf(temp, sizeof(temp), - "\",\"ts\":%llu,\"ph\":\"e\",\"pid\":%llu," - "\"tid\":%llu,\"id\":\"", - static_cast(metrics.te), - static_cast(key.pid), - static_cast(key.tid)); - buffer += temp; - append_json_string(buffer, event_id); - buffer += "\"}\n"; - } + if (!worker_success.load()) { + co_await writer->close(); + co_return false; } - buffer += "]\n"; - - try { - if (input.compress) { - using namespace dftracer::utils::utilities; - - compression::zlib::ManualStreamingCompressorUtility compressor( - input.compression_level, - compression::zlib::CompressionFormat::GZIP); - - fileio::StreamingFileWriterUtility writer(input.output_path); + if (input.emit_footer) { + const char footer[] = "]\n"; + if (!co_await write_section( + ByteView(reinterpret_cast(footer), 
2), true)) + co_return false; + } - { - auto gen = compressor.compress(ByteView(buffer)); - while (auto chunk = co_await gen.next()) { - co_await writer.process(*chunk); - } - } - { - auto gen = compressor.finalize_stream(); - while (auto chunk = co_await gen.next()) { - co_await writer.process(*chunk); - } - } + if (co_await writer->close() != 0) co_return false; - writer.close(); - } else { - ssize_t fd = co_await ::dftracer::utils::io::open( - input.output_path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644); - if (fd < 0) { - DFTRACER_UTILS_LOG_ERROR("Failed to open output file: %s", - input.output_path.c_str()); - co_return false; - } - co_await ::dftracer::utils::io::write(static_cast(fd), - buffer.data(), buffer.size()); - co_await ::dftracer::utils::io::close(static_cast(fd)); + if (input.merge_on_sharded && + layout_info.layout == fileio::parallel::FileLayout::SHARDED) { + auto shards = writer->output_paths(); + if (co_await fileio::parallel::merge_shards(input.output_path, + shards) != 0) { + co_return false; } - - co_return true; - } catch (const std::exception& e) { - DFTRACER_UTILS_LOG_ERROR("Failed to write output: %s", e.what()); - co_return false; } + + co_return true; } } // namespace dftracer::utils::utilities::composites::dft::aggregators diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.cpp new file mode 100644 index 00000000..6829db00 --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.cpp @@ -0,0 +1,54 @@ +#include +#include + +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +bool SystemMetricsMergeOperator::FullMergeV2( + const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + SystemAggregationMetrics result; + + if (merge_in.existing_value) { + try { + result = deserialize_system_value( + 
std::string_view(merge_in.existing_value->data(), + merge_in.existing_value->size())); + } catch (...) { + return false; + } + } + + for (const auto& operand : merge_in.operand_list) { + try { + auto other = deserialize_system_value( + std::string_view(operand.data(), operand.size())); + result.merge_from(other); + } catch (...) { + return false; + } + } + + merge_out->new_value = serialize_system_value(result); + return true; +} + +bool SystemMetricsMergeOperator::PartialMerge( + const ::rocksdb::Slice& /*key*/, const ::rocksdb::Slice& left_operand, + const ::rocksdb::Slice& right_operand, std::string* new_value, + ::rocksdb::Logger* /*logger*/) const { + try { + auto left = deserialize_system_value( + std::string_view(left_operand.data(), left_operand.size())); + auto right = deserialize_system_value( + std::string_view(right_operand.data(), right_operand.size())); + left.merge_from(right); + *new_value = serialize_system_value(left); + return true; + } catch (...) { + return false; + } +} + +} // namespace dftracer::utils::utilities::composites::dft::aggregators diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.cpp new file mode 100644 index 00000000..d2db20c7 --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.cpp @@ -0,0 +1,126 @@ +#include +#include + +#include + +namespace dftracer::utils::utilities::composites::dft::aggregators { + +namespace { + +using common::serialization::BinaryReader; +using common::serialization::put_blob; +using common::serialization::put_double; +using common::serialization::put_str; +using common::serialization::put_varint; + +void serialize_float_metric_stats(std::string& out, + const FloatMetricStats& ms) { + put_varint(out, ms.count); + put_double(out, ms.total); + put_double(out, ms.min); + put_double(out, ms.max); + put_double(out, 
ms.mean); + put_double(out, ms.m2); + // m3/m4 not persisted yet -- skewness/kurtosis are recomputed + // in-memory only. + // put_double(out, ms.m3); + // put_double(out, ms.m4); + if (ms.sketch) { + out.push_back(1); + auto blob = ms.sketch->serialize(); + put_blob(out, blob); + } else { + out.push_back(0); + } +} + +FloatMetricStats deserialize_float_metric_stats(BinaryReader& r, + double accuracy) { + FloatMetricStats ms(accuracy); + ms.count = r.varint(); + ms.total = r.f64(); + ms.min = r.f64(); + ms.max = r.f64(); + ms.mean = r.f64(); + ms.m2 = r.f64(); + // ms.m3 = r.f64(); + // ms.m4 = r.f64(); + if (r.u8()) { + auto blob = r.blob(); + ms.sketch = std::make_unique(DDSketch::deserialize( + reinterpret_cast(blob.data()), blob.size())); + } + return ms; +} + +} // namespace + +void serialize_system_key_into(std::string& out, std::string_view hhash, + std::uint64_t time_bucket) { + out.clear(); + out.reserve(2 + hhash.size() + 10); + put_str(out, hhash); + put_varint(out, time_bucket); +} + +std::string serialize_system_key(std::string_view hhash, + std::uint64_t time_bucket) { + std::string out; + serialize_system_key_into(out, hhash, time_bucket); + return out; +} + +DeserializedSystemKey deserialize_system_key(std::string_view data) { + BinaryReader r(data); + auto hhash = r.str(); + auto time_bucket = r.varint(); + return {{std::string(hhash), time_bucket}}; +} + +void serialize_system_value_into(std::string& out, + const SystemAggregationMetrics& m) { + out.clear(); + out.reserve(128); + + put_varint(out, m.count); + put_varint(out, m.ts); + put_varint(out, m.te); + + std::uint32_t num_metrics = + m.metrics ? 
static_cast(m.metrics->size()) : 0; + put_varint(out, num_metrics); + + if (m.metrics) { + for (const auto& [name, stats] : *m.metrics) { + put_str(out, name); + serialize_float_metric_stats(out, stats); + } + } +} + +std::string serialize_system_value(const SystemAggregationMetrics& m) { + std::string out; + serialize_system_value_into(out, m); + return out; +} + +SystemAggregationMetrics deserialize_system_value(std::string_view data) { + BinaryReader r(data); + SystemAggregationMetrics m; + m.count = r.varint(); + m.ts = r.varint(); + m.te = r.varint(); + + auto num_metrics = r.varint(); + if (num_metrics > 0) { + m.metrics = std::make_unique(); + for (std::uint32_t i = 0; i < num_metrics; ++i) { + auto name = r.str(); + auto stats = deserialize_float_metric_stats(r, m.sketch_accuracy); + m.metrics->emplace(std::string(name), std::move(stats)); + } + } + return m; +} + +} // namespace dftracer::utils::utilities::composites::dft::aggregators diff --git a/src/dftracer/utils/utilities/composites/dft/comparator/comparison_config.cpp b/src/dftracer/utils/utilities/composites/dft/comparator/comparison_config.cpp index cc757d53..64743716 100644 --- a/src/dftracer/utils/utilities/composites/dft/comparator/comparison_config.cpp +++ b/src/dftracer/utils/utilities/composites/dft/comparator/comparison_config.cpp @@ -1,6 +1,7 @@ #include -#include +#include +#include #include #include #include @@ -27,78 +28,74 @@ std::vector split_csv(const std::string& s) { } // namespace // static -bool ComparisonConfig::parse_node(void* yyjson_val_ptr, ComparisonNode& node, - std::string& error) { - auto* val = static_cast(yyjson_val_ptr); - if (!val || !yyjson_is_obj(val)) { +bool ComparisonConfig::parse_node(simdjson::dom::element val, + ComparisonNode& node, std::string& error) { + if (!val.is_object()) { error = "node must be a JSON object"; return false; } - yyjson_val* name_val = yyjson_obj_get(val, "name"); - if (!name_val || !yyjson_is_str(name_val)) { + auto name_result = 
val["name"]; + if (name_result.error() || !name_result.value_unsafe().is_string()) { error = "node missing required string field 'name'"; return false; } - node.name = yyjson_get_str(name_val); + node.name = std::string(name_result.value_unsafe().get_string().value()); - yyjson_val* query_val = yyjson_obj_get(val, "query"); - if (query_val && yyjson_is_str(query_val)) { - node.query = yyjson_get_str(query_val); + auto query_result = val["query"]; + if (!query_result.error() && query_result.value_unsafe().is_string()) { + node.query = + std::string(query_result.value_unsafe().get_string().value()); } - yyjson_val* gb_val = yyjson_obj_get(val, "group_by"); - if (gb_val && yyjson_is_arr(gb_val)) { - std::size_t idx, max; - yyjson_val* elem; - yyjson_arr_foreach(gb_val, idx, max, elem) { - if (yyjson_is_str(elem)) { - node.group_by.push_back(yyjson_get_str(elem)); + auto gb_result = val["group_by"]; + if (!gb_result.error() && gb_result.value_unsafe().is_array()) { + for (auto elem : gb_result.value_unsafe().get_array()) { + if (elem.is_string()) { + node.group_by.push_back(std::string(elem.get_string().value())); } } } - yyjson_val* metrics_val = yyjson_obj_get(val, "metrics"); - if (metrics_val && yyjson_is_arr(metrics_val)) { + auto metrics_result = val["metrics"]; + if (!metrics_result.error() && metrics_result.value_unsafe().is_array()) { std::vector metrics; - std::size_t idx, max; - yyjson_val* elem; - yyjson_arr_foreach(metrics_val, idx, max, elem) { - if (yyjson_is_str(elem)) { - metrics.push_back(yyjson_get_str(elem)); + for (auto elem : metrics_result.value_unsafe().get_array()) { + if (elem.is_string()) { + metrics.push_back(std::string(elem.get_string().value())); } } node.metrics = std::move(metrics); } - yyjson_val* pct_val = yyjson_obj_get(val, "percentiles"); - if (pct_val && yyjson_is_arr(pct_val)) { + auto pct_result = val["percentiles"]; + if (!pct_result.error() && pct_result.value_unsafe().is_array()) { std::vector percentiles; - std::size_t idx, 
max; - yyjson_val* elem; - yyjson_arr_foreach(pct_val, idx, max, elem) { - if (yyjson_is_num(elem)) { - percentiles.push_back(yyjson_get_num(elem)); + for (auto elem : pct_result.value_unsafe().get_array()) { + if (elem.is_double() || elem.is_int64() || elem.is_uint64()) { + percentiles.push_back(elem.get_double().value()); } } node.percentiles = std::move(percentiles); } - yyjson_val* thr_val = yyjson_obj_get(val, "threshold_pct"); - if (thr_val && yyjson_is_num(thr_val)) { - node.threshold_pct = yyjson_get_num(thr_val); + auto thr_result = val["threshold_pct"]; + if (!thr_result.error()) { + auto thr_val = thr_result.value_unsafe(); + if (thr_val.is_double() || thr_val.is_int64() || thr_val.is_uint64()) { + node.threshold_pct = thr_val.get_double().value(); + } } - yyjson_val* sort_val = yyjson_obj_get(val, "sort_by"); - if (sort_val && yyjson_is_str(sort_val)) { - node.sort_by = yyjson_get_str(sort_val); + auto sort_result = val["sort_by"]; + if (!sort_result.error() && sort_result.value_unsafe().is_string()) { + node.sort_by = + std::string(sort_result.value_unsafe().get_string().value()); } - yyjson_val* children_val = yyjson_obj_get(val, "children"); - if (children_val && yyjson_is_arr(children_val)) { - std::size_t idx, max; - yyjson_val* child_elem; - yyjson_arr_foreach(children_val, idx, max, child_elem) { + auto children_result = val["children"]; + if (!children_result.error() && children_result.value_unsafe().is_array()) { + for (auto child_elem : children_result.value_unsafe().get_array()) { ComparisonNode child; if (!parse_node(child_elem, child, error)) return false; node.children.push_back(std::move(child)); @@ -111,94 +108,107 @@ bool ComparisonConfig::parse_node(void* yyjson_val_ptr, ComparisonNode& node, // static std::optional ComparisonConfig::from_json_file( const std::string& path, std::string& error) { - yyjson_doc* doc = yyjson_read_file(path.c_str(), 0, nullptr, nullptr); - if (!doc) { + std::ifstream file(path); + if (!file) { error = 
"failed to read or parse JSON file: " + path; return std::nullopt; } + std::string content((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); + + simdjson::dom::parser parser; + auto result = parser.parse(content); + if (result.error()) { + error = "failed to parse JSON file: " + path; + return std::nullopt; + } - yyjson_val* root = yyjson_doc_get_root(doc); - if (!root || !yyjson_is_obj(root)) { - yyjson_doc_free(doc); + auto root = result.value_unsafe(); + if (!root.is_object()) { error = "JSON root must be an object"; return std::nullopt; } ComparisonConfig cfg; - yyjson_val* baseline_val = yyjson_obj_get(root, "baseline"); - if (!baseline_val || !yyjson_is_str(baseline_val)) { - yyjson_doc_free(doc); + auto baseline_result = root["baseline"]; + if (baseline_result.error() || + !baseline_result.value_unsafe().is_string()) { error = "missing required string field 'baseline'"; return std::nullopt; } - cfg.baseline = yyjson_get_str(baseline_val); + cfg.baseline = + std::string(baseline_result.value_unsafe().get_string().value()); - yyjson_val* variant_val = yyjson_obj_get(root, "variant"); - if (!variant_val || !yyjson_is_str(variant_val)) { - yyjson_doc_free(doc); + auto variant_result = root["variant"]; + if (variant_result.error() || !variant_result.value_unsafe().is_string()) { error = "missing required string field 'variant'"; return std::nullopt; } - cfg.variant = yyjson_get_str(variant_val); + cfg.variant = + std::string(variant_result.value_unsafe().get_string().value()); - yyjson_val* defaults_val = yyjson_obj_get(root, "defaults"); - if (defaults_val && yyjson_is_obj(defaults_val)) { - yyjson_val* dm = yyjson_obj_get(defaults_val, "metrics"); - if (dm && yyjson_is_arr(dm)) { + auto defaults_result = root["defaults"]; + if (!defaults_result.error() && + defaults_result.value_unsafe().is_object()) { + auto defaults_val = defaults_result.value_unsafe(); + + auto dm_result = defaults_val["metrics"]; + if (!dm_result.error() && 
dm_result.value_unsafe().is_array()) { cfg.defaults.metrics.clear(); - std::size_t idx, max; - yyjson_val* elem; - yyjson_arr_foreach(dm, idx, max, elem) { - if (yyjson_is_str(elem)) { - cfg.defaults.metrics.push_back(yyjson_get_str(elem)); + for (auto elem : dm_result.value_unsafe().get_array()) { + if (elem.is_string()) { + cfg.defaults.metrics.push_back( + std::string(elem.get_string().value())); } } } - yyjson_val* dp = yyjson_obj_get(defaults_val, "percentiles"); - if (dp && yyjson_is_arr(dp)) { + auto dp_result = defaults_val["percentiles"]; + if (!dp_result.error() && dp_result.value_unsafe().is_array()) { cfg.defaults.percentiles.clear(); - std::size_t idx, max; - yyjson_val* elem; - yyjson_arr_foreach(dp, idx, max, elem) { - if (yyjson_is_num(elem)) { - cfg.defaults.percentiles.push_back(yyjson_get_num(elem)); + for (auto elem : dp_result.value_unsafe().get_array()) { + if (elem.is_double() || elem.is_int64() || elem.is_uint64()) { + cfg.defaults.percentiles.push_back( + elem.get_double().value()); } } } - yyjson_val* dt = yyjson_obj_get(defaults_val, "threshold_pct"); - if (dt && yyjson_is_num(dt)) { - cfg.defaults.threshold_pct = yyjson_get_num(dt); + auto dt_result = defaults_val["threshold_pct"]; + if (!dt_result.error()) { + auto dt_val = dt_result.value_unsafe(); + if (dt_val.is_double() || dt_val.is_int64() || dt_val.is_uint64()) { + cfg.defaults.threshold_pct = dt_val.get_double().value(); + } } - yyjson_val* ti = yyjson_obj_get(defaults_val, "time_interval_ms"); - if (ti && yyjson_is_num(ti)) { - cfg.defaults.time_interval_ms = yyjson_get_num(ti); + auto ti_result = defaults_val["time_interval_ms"]; + if (!ti_result.error()) { + auto ti_val = ti_result.value_unsafe(); + if (ti_val.is_double() || ti_val.is_int64() || ti_val.is_uint64()) { + cfg.defaults.time_interval_ms = ti_val.get_double().value(); + } } - yyjson_val* ds = yyjson_obj_get(defaults_val, "sort_by"); - if (ds && yyjson_is_str(ds)) { - cfg.defaults.sort_by = yyjson_get_str(ds); + auto 
ds_result = defaults_val["sort_by"]; + if (!ds_result.error() && ds_result.value_unsafe().is_string()) { + cfg.defaults.sort_by = + std::string(ds_result.value_unsafe().get_string().value()); } } - yyjson_val* nodes_val = yyjson_obj_get(root, "nodes"); - if (nodes_val && yyjson_is_arr(nodes_val)) { - std::size_t idx, max; - yyjson_val* node_elem; - yyjson_arr_foreach(nodes_val, idx, max, node_elem) { + auto nodes_result = root["nodes"]; + if (!nodes_result.error() && nodes_result.value_unsafe().is_array()) { + for (auto node_elem : nodes_result.value_unsafe().get_array()) { ComparisonNode node; if (!parse_node(node_elem, node, error)) { - yyjson_doc_free(doc); return std::nullopt; } cfg.nodes.push_back(std::move(node)); } } - yyjson_doc_free(doc); return cfg; } diff --git a/src/dftracer/utils/utilities/composites/dft/comparator/comparison_result.cpp b/src/dftracer/utils/utilities/composites/dft/comparator/comparison_result.cpp index 06f5c829..f1e88fa5 100644 --- a/src/dftracer/utils/utilities/composites/dft/comparator/comparison_result.cpp +++ b/src/dftracer/utils/utilities/composites/dft/comparator/comparison_result.cpp @@ -87,8 +87,16 @@ std::vector build_metadata_metrics( double compute_cohens_d(const MetricStats& base, std::uint64_t n_base, const MetricStats& var, std::uint64_t n_var) { if (n_base < 2 || n_var < 2) return 0.0; - double var_base = base.m2 / static_cast(n_base); - double var_var = var.m2 / static_cast(n_var); + // `m2` now holds the raw power sum sum_x^2 (not Welford central M2). + // Convert to population variance: Var = (sum_x^2 - (sum_x)^2 / n) / n. + auto pop_var = [](const MetricStats& ms, std::uint64_t n) { + const double nd = static_cast(n); + const double sx = static_cast(ms.total); + const double central = ms.m2 - sx * sx / nd; + return (central > 0.0 ? 
central : 0.0) / nd; + }; + double var_base = pop_var(base, n_base); + double var_var = pop_var(var, n_var); double pooled = std::sqrt((var_base + var_var) / 2.0); if (pooled < 1e-15) return 0.0; return (var.mean - base.mean) / pooled; diff --git a/src/dftracer/utils/utilities/composites/dft/comparator/tree_table_formatter.cpp b/src/dftracer/utils/utilities/composites/dft/comparator/tree_table_formatter.cpp index 6be146b4..8fb0c65e 100644 --- a/src/dftracer/utils/utilities/composites/dft/comparator/tree_table_formatter.cpp +++ b/src/dftracer/utils/utilities/composites/dft/comparator/tree_table_formatter.cpp @@ -1,5 +1,5 @@ +#include #include -#include #include #include @@ -625,81 +625,119 @@ const char* sig_str(Significance s) { return "NEGLIGIBLE"; } -yyjson_mut_val* build_metric_json(yyjson_mut_doc* doc, - const MetricComparison& mc) { +std::string escape_json_string(const std::string& s) { + std::string result; + result.reserve(s.size()); + for (char c : s) { + switch (c) { + case '"': + result += "\\\""; + break; + case '\\': + result += "\\\\"; + break; + case '\b': + result += "\\b"; + break; + case '\f': + result += "\\f"; + break; + case '\n': + result += "\\n"; + break; + case '\r': + result += "\\r"; + break; + case '\t': + result += "\\t"; + break; + default: + result += c; + break; + } + } + return result; +} + +std::string double_to_json(double v) { + if (!std::isfinite(v)) return "0"; + char buf[32]; + std::snprintf(buf, sizeof(buf), "%.15g", v); + return buf; +} + +void build_metric_json(std::ostringstream& out, const MetricComparison& mc) { auto safe = [](double v) { return std::isfinite(v) ? 
v : 0.0; }; - yyjson_mut_val* obj = yyjson_mut_obj(doc); - yyjson_mut_obj_add_str(doc, obj, "name", mc.metric_name.c_str()); - yyjson_mut_obj_add_real(doc, obj, "baseline", safe(mc.baseline_value)); - yyjson_mut_obj_add_real(doc, obj, "variant", safe(mc.variant_value)); - yyjson_mut_obj_add_real(doc, obj, "delta", safe(mc.delta)); - yyjson_mut_obj_add_real(doc, obj, "pct_change", safe(mc.pct_change)); - yyjson_mut_obj_add_real(doc, obj, "cohens_d", safe(mc.cohens_d)); - yyjson_mut_obj_add_str(doc, obj, "significance", sig_str(mc.significance)); - yyjson_mut_obj_add_bool(doc, obj, "is_regression", mc.is_regression); - return obj; -} - -yyjson_mut_val* build_metrics_arr(yyjson_mut_doc* doc, - const std::vector& ms) { - yyjson_mut_val* arr = yyjson_mut_arr(doc); - for (const auto& mc : ms) { - yyjson_mut_arr_append(arr, build_metric_json(doc, mc)); + out << "{"; + out << "\"name\":\"" << escape_json_string(mc.metric_name) << "\","; + out << "\"baseline\":" << double_to_json(safe(mc.baseline_value)) << ","; + out << "\"variant\":" << double_to_json(safe(mc.variant_value)) << ","; + out << "\"delta\":" << double_to_json(safe(mc.delta)) << ","; + out << "\"pct_change\":" << double_to_json(safe(mc.pct_change)) << ","; + out << "\"cohens_d\":" << double_to_json(safe(mc.cohens_d)) << ","; + out << "\"significance\":\"" << sig_str(mc.significance) << "\","; + out << "\"is_regression\":" << (mc.is_regression ? 
"true" : "false"); + out << "}"; +} + +void build_metrics_arr(std::ostringstream& out, + const std::vector& ms) { + out << "["; + for (std::size_t i = 0; i < ms.size(); ++i) { + if (i > 0) out << ","; + build_metric_json(out, ms[i]); } - return arr; + out << "]"; } -yyjson_mut_val* build_group_json(yyjson_mut_doc* doc, - const GroupComparison& g) { - yyjson_mut_val* obj = yyjson_mut_obj(doc); - yyjson_mut_obj_add_str(doc, obj, "label", g.label.c_str()); - yyjson_mut_obj_add_val(doc, obj, "metrics", - build_metrics_arr(doc, g.metrics)); - return obj; +void build_group_json(std::ostringstream& out, const GroupComparison& g) { + out << "{"; + out << "\"label\":\"" << escape_json_string(g.label) << "\","; + out << "\"metrics\":"; + build_metrics_arr(out, g.metrics); + out << "}"; } -yyjson_mut_val* build_node_json(yyjson_mut_doc* doc, const NodeResult& node); +void build_node_json(std::ostringstream& out, const NodeResult& node); -yyjson_mut_val* build_node_json(yyjson_mut_doc* doc, const NodeResult& node) { - yyjson_mut_val* obj = yyjson_mut_obj(doc); - yyjson_mut_obj_add_str(doc, obj, "name", node.name.c_str()); - yyjson_mut_obj_add_str(doc, obj, "query", node.composed_query.c_str()); +void build_node_json(std::ostringstream& out, const NodeResult& node) { + out << "{"; + out << "\"name\":\"" << escape_json_string(node.name) << "\","; + out << "\"query\":\"" << escape_json_string(node.composed_query) << "\","; // summary - yyjson_mut_val* summary = yyjson_mut_obj(doc); - yyjson_mut_obj_add_val(doc, summary, "metrics", - build_metrics_arr(doc, node.summary.metrics)); - yyjson_mut_obj_add_val(doc, obj, "summary", summary); + out << "\"summary\":{\"metrics\":"; + build_metrics_arr(out, node.summary.metrics); + out << "},"; // groups - yyjson_mut_val* groups_arr = yyjson_mut_arr(doc); - for (const auto& g : node.groups) { - yyjson_mut_arr_append(groups_arr, build_group_json(doc, g)); + out << "\"groups\":["; + for (std::size_t i = 0; i < node.groups.size(); ++i) { + if 
(i > 0) out << ","; + build_group_json(out, node.groups[i]); } - yyjson_mut_obj_add_val(doc, obj, "groups", groups_arr); + out << "],"; // children - yyjson_mut_val* children_arr = yyjson_mut_arr(doc); - for (const auto& child : node.children) { - yyjson_mut_arr_append(children_arr, build_node_json(doc, child)); + out << "\"children\":["; + for (std::size_t i = 0; i < node.children.size(); ++i) { + if (i > 0) out << ","; + build_node_json(out, node.children[i]); } - yyjson_mut_obj_add_val(doc, obj, "children", children_arr); + out << "]"; - return obj; + out << "}"; } -yyjson_mut_val* build_meta_json(yyjson_mut_doc* doc, const TraceMetadata& m) { - yyjson_mut_val* obj = yyjson_mut_obj(doc); - yyjson_mut_obj_add_int(doc, obj, "files", - static_cast(m.file_count)); - yyjson_mut_obj_add_int(doc, obj, "processes", - static_cast(m.process_count)); - yyjson_mut_obj_add_int(doc, obj, "threads", - static_cast(m.thread_count)); - yyjson_mut_obj_add_real(doc, obj, "total_bytes", m.total_bytes); - yyjson_mut_obj_add_real(doc, obj, "total_io_time_us", m.total_io_time_us); - yyjson_mut_obj_add_real(doc, obj, "makespan_us", m.makespan_us); - return obj; +void build_meta_json(std::ostringstream& out, const TraceMetadata& m) { + out << "{"; + out << "\"files\":" << m.file_count << ","; + out << "\"processes\":" << m.process_count << ","; + out << "\"threads\":" << m.thread_count << ","; + out << "\"total_bytes\":" << double_to_json(m.total_bytes) << ","; + out << "\"total_io_time_us\":" << double_to_json(m.total_io_time_us) << ","; + out << "\"makespan_us\":" << double_to_json(m.makespan_us); + out << "}"; } } // namespace @@ -710,40 +748,31 @@ yyjson_mut_val* build_meta_json(yyjson_mut_doc* doc, const TraceMetadata& m) { std::string TreeTableFormatter::render_json( const ComparisonOutput& output) const { - yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr); - yyjson_mut_val* root = yyjson_mut_obj(doc); - yyjson_mut_doc_set_root(doc, root); - - yyjson_mut_obj_add_str(doc, root, 
"baseline", output.baseline_path.c_str()); - yyjson_mut_obj_add_str(doc, root, "variant", output.variant_path.c_str()); - yyjson_mut_obj_add_val(doc, root, "baseline_meta", - build_meta_json(doc, output.baseline_meta)); - yyjson_mut_obj_add_val(doc, root, "variant_meta", - build_meta_json(doc, output.variant_meta)); - yyjson_mut_obj_add_real(doc, root, "execution_time_ms", - output.execution_time_ms); - - yyjson_mut_val* nodes_arr = yyjson_mut_arr(doc); - for (const auto& node : output.nodes) { - yyjson_mut_arr_append(nodes_arr, build_node_json(doc, node)); - } - yyjson_mut_obj_add_val(doc, root, "nodes", nodes_arr); - - yyjson_write_err write_err = {}; - std::size_t json_len = 0; - char* json = yyjson_mut_write_opts(doc, YYJSON_WRITE_PRETTY, nullptr, - &json_len, &write_err); - if (!json) { - yyjson_mut_doc_free(doc); - throw std::runtime_error( - std::string("JSON serialization failed: ") + - (write_err.msg ? write_err.msg : "unknown error")); + std::ostringstream out; + + out << "{"; + out << "\"baseline\":\"" << escape_json_string(output.baseline_path) + << "\","; + out << "\"variant\":\"" << escape_json_string(output.variant_path) << "\","; + out << "\"baseline_meta\":"; + build_meta_json(out, output.baseline_meta); + out << ","; + out << "\"variant_meta\":"; + build_meta_json(out, output.variant_meta); + out << ","; + out << "\"execution_time_ms\":" << double_to_json(output.execution_time_ms) + << ","; + + out << "\"nodes\":["; + for (std::size_t i = 0; i < output.nodes.size(); ++i) { + if (i > 0) out << ","; + build_node_json(out, output.nodes[i]); } - std::string result(json, json_len); - free(json); // NOLINT(cppcoreguidelines-no-malloc) - yyjson_mut_doc_free(doc); + out << "]"; - return result; + out << "}"; + + return out.str(); } } // namespace dftracer::utils::utilities::composites::dft::comparator diff --git a/src/dftracer/utils/utilities/composites/dft/event_collector_utility.cpp 
b/src/dftracer/utils/utilities/composites/dft/event_collector_utility.cpp index 5a6ebee1..c586bac1 100644 --- a/src/dftracer/utils/utilities/composites/dft/event_collector_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/event_collector_utility.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include @@ -19,6 +19,7 @@ class EventIdCollector : public reader::internal::LineProcessor { public: std::vector& events; bool trim_commas; + simdjson::dom::parser parser; explicit EventIdCollector(std::vector& event_list, bool should_trim_commas = false) @@ -29,7 +30,6 @@ class EventIdCollector : public reader::internal::LineProcessor { const char* trimmed; std::size_t trimmed_length; - // Use comma-trimming variant if requested (for JSON array format) bool valid = trim_commas ? json_trim_and_validate_with_comma( data, length, trimmed, trimmed_length) : json_trim_and_validate(data, length, trimmed, @@ -39,36 +39,32 @@ class EventIdCollector : public reader::internal::LineProcessor { co_return true; } - yyjson_doc* doc = yyjson_read(trimmed, trimmed_length, 0); - if (!doc) co_return true; + auto result = parser.parse(trimmed, trimmed_length); + if (result.error()) co_return true; - yyjson_val* root = yyjson_doc_get_root(doc); - if (!yyjson_is_obj(root)) { - yyjson_doc_free(doc); - co_return true; - } + auto root = result.value_unsafe(); + if (!root.is_object()) co_return true; EventId event; - yyjson_val* id_val = yyjson_obj_get(root, "id"); - if (id_val && yyjson_is_int(id_val)) { - event.id = yyjson_get_int(id_val); + auto id_result = root["id"].get_int64(); + if (!id_result.error()) { + event.id = id_result.value_unsafe(); } - yyjson_val* pid_val = yyjson_obj_get(root, "pid"); - if (pid_val && yyjson_is_int(pid_val)) { - event.pid = yyjson_get_int(pid_val); + auto pid_result = root["pid"].get_int64(); + if (!pid_result.error()) { + event.pid = pid_result.value_unsafe(); } - yyjson_val* tid_val = yyjson_obj_get(root, "tid"); - if (tid_val && 
yyjson_is_int(tid_val)) { - event.tid = yyjson_get_int(tid_val); + auto tid_result = root["tid"].get_int64(); + if (!tid_result.error()) { + event.tid = tid_result.value_unsafe(); } if (event.is_valid()) { events.push_back(event); } - yyjson_doc_free(doc); co_return true; } }; diff --git a/src/dftracer/utils/utilities/composites/dft/event_id_extractor_utility.cpp b/src/dftracer/utils/utilities/composites/dft/event_id_extractor_utility.cpp index 095022e6..b3edb434 100644 --- a/src/dftracer/utils/utilities/composites/dft/event_id_extractor_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/event_id_extractor_utility.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include namespace dftracer::utils::utilities::composites::dft { @@ -8,37 +8,32 @@ coro::CoroTask EventIdExtractor::process( const EventIdExtractionInput& input) { EventId event; - yyjson_doc* doc = - yyjson_read(input.json_data.data(), input.json_data.size(), 0); - if (!doc) { - co_return event; // Invalid JSON + simdjson::dom::parser parser; + auto result = parser.parse(input.json_data.data(), input.json_data.size()); + if (result.error()) { + co_return event; } - yyjson_val* root = yyjson_doc_get_root(doc); - if (!yyjson_is_obj(root)) { - yyjson_doc_free(doc); - co_return event; // Not a JSON object + auto root = result.value_unsafe(); + if (!root.is_object()) { + co_return event; } - // Extract id - yyjson_val* id_val = yyjson_obj_get(root, "id"); - if (id_val && yyjson_is_int(id_val)) { - event.id = yyjson_get_int(id_val); + auto id_result = root["id"].get_int64(); + if (!id_result.error()) { + event.id = id_result.value_unsafe(); } - // Extract pid - yyjson_val* pid_val = yyjson_obj_get(root, "pid"); - if (pid_val && yyjson_is_int(pid_val)) { - event.pid = yyjson_get_int(pid_val); + auto pid_result = root["pid"].get_int64(); + if (!pid_result.error()) { + event.pid = pid_result.value_unsafe(); } - // Extract tid - yyjson_val* tid_val = yyjson_obj_get(root, "tid"); - if (tid_val && 
yyjson_is_int(tid_val)) { - event.tid = yyjson_get_int(tid_val); + auto tid_result = root["tid"].get_int64(); + if (!tid_result.error()) { + event.tid = tid_result.value_unsafe(); } - yyjson_doc_free(doc); co_return event; } diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.cpp index a9bdc02c..edc9c96e 100644 --- a/src/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.cpp +++ b/src/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.cpp @@ -8,11 +8,23 @@ namespace dftracer::utils::utilities::composites::dft::indexing { namespace { -constexpr std::size_t HEADER_SIZE = - 12; // 4 bytes num_hashes + 4 bytes num_entries + 4 bytes num_bits +constexpr std::size_t HEADER_SIZE = 12; +constexpr std::size_t BLOCK_BYTES = 32; // 8 x u32 = 256 bits +constexpr std::size_t BLOCK_BITS = BLOCK_BYTES * 8; +constexpr std::size_t BLOCK_WORDS = BLOCK_BYTES / 4; + +// Split block Bloom filter SALT array, taken verbatim from the Apache +// Parquet spec (parquet-format/BloomFilter.md). Eight odd 32-bit +// constants; each (h2 * SALT[i]) >> 27 picks one of 32 bits in word i +// of the 256-bit block, with the 8 bit-selectors empirically +// uncorrelated. See Apple, "Split block Bloom filters", arXiv:2101.01719. 
+constexpr std::uint32_t SALT[BLOCK_WORDS] = { + 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU, + 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U, +}; void write_u32_le(unsigned char* buf, std::uint32_t val) { - if (!buf) return; // Defensive check to silence compiler warning + if (!buf) return; buf[0] = static_cast(val & 0xFF); buf[1] = static_cast((val >> 8) & 0xFF); buf[2] = static_cast((val >> 16) & 0xFF); @@ -25,6 +37,20 @@ std::uint32_t read_u32_le(const unsigned char* buf) { (static_cast(buf[2]) << 16) | (static_cast(buf[3]) << 24); } + +inline std::size_t block_index(std::uint64_t h1, std::size_t num_blocks) { + return static_cast( + (static_cast<__uint128_t>(h1) * num_blocks) >> 64); +} + +inline void compute_block_mask(std::uint64_t h2, + std::uint32_t (&out)[BLOCK_WORDS]) { + auto h32 = static_cast(h2 ^ (h2 >> 32)); + for (std::size_t i = 0; i < BLOCK_WORDS; ++i) { + std::uint32_t y = h32 * SALT[i]; + out[i] = 1U << (y >> 27); + } +} } // namespace std::size_t BloomFilter::optimal_num_bits(std::size_t n, double p) { @@ -34,7 +60,12 @@ std::size_t BloomFilter::optimal_num_bits(std::size_t n, double p) { auto m = static_cast( std::ceil(-static_cast(n) * std::log(p) / (std::log(2.0) * std::log(2.0)))); - return std::max(m, static_cast(64)); + // Round up to a whole number of 512-bit blocks. Blocked bloom filters + // pay ~10-15% extra memory for the same FPR vs classical; bump the + // requested bit count to compensate before rounding. 
+ m = static_cast(static_cast(m) * 1.15); + m = std::max(m, BLOCK_BITS); + return ((m + BLOCK_BITS - 1) / BLOCK_BITS) * BLOCK_BITS; } std::size_t BloomFilter::optimal_num_hashes(std::size_t m, std::size_t n) { @@ -49,8 +80,7 @@ BloomFilter::BloomFilter(std::size_t expected_entries, : num_bits_(optimal_num_bits(expected_entries, false_positive_rate)), num_hashes_(optimal_num_hashes(num_bits_, expected_entries)), num_entries_(0) { - std::size_t num_bytes = (num_bits_ + 7) / 8; - bits_.resize(num_bytes, 0); + bits_.assign(num_bits_ / 8, 0); } BloomFilter::BloomFilter(std::vector bits, std::size_t num_bits, @@ -85,41 +115,67 @@ void BloomFilter::compute_hashes(std::string_view value, std::uint64_t& h1, std::uint64_t& h2) const { hasher_.reset(); hasher_.update(value); - h1 = hasher_.get_hash().value; - // Second hash: mix with a different seed using FNV-like mixing - std::uint64_t seed = 0x517cc1b727220a95ULL; - h2 = h1 * seed + 0x9e3779b97f4a7c15ULL; - h2 ^= (h2 >> 33); - h2 *= 0xff51afd7ed558ccdULL; - h2 ^= (h2 >> 33); + std::uint64_t raw = hasher_.get_hash().value; + // FNV-1a leaves correlated high bits for similar short keys, which + // breaks Lemire reduction in the blocked path. Run a SplitMix64-style + // finisher to fully avalanche, then derive a second hash for masking. 
+ h1 = raw; + h1 = (h1 ^ (h1 >> 30)) * 0xbf58476d1ce4e5b9ULL; + h1 = (h1 ^ (h1 >> 27)) * 0x94d049bb133111ebULL; + h1 ^= (h1 >> 31); + h2 = raw + 0x9e3779b97f4a7c15ULL; + h2 = (h2 ^ (h2 >> 30)) * 0xbf58476d1ce4e5b9ULL; + h2 = (h2 ^ (h2 >> 27)) * 0x94d049bb133111ebULL; + h2 ^= (h2 >> 31); } std::size_t BloomFilter::nth_hash(std::uint64_t h1, std::uint64_t h2, std::size_t n) const { - // Kirsch-Mitzenmacher: g_i(x) = h1(x) + i * h2(x) return static_cast((h1 + n * h2) % num_bits_); } void BloomFilter::add(std::string_view value) { + if (last_value_valid_ && value.size() == last_value_size_ && + std::memcmp(last_value_buf_.data(), value.data(), value.size()) == 0) { + ++num_entries_; + return; + } + std::uint64_t h1, h2; compute_hashes(value, h1, h2); - for (std::size_t i = 0; i < num_hashes_; ++i) { - std::size_t bit_pos = nth_hash(h1, h2, i); - bits_[bit_pos / 8] |= static_cast(1u << (bit_pos % 8)); - } + std::size_t num_blocks = num_bits_ / BLOCK_BITS; + std::size_t blk = block_index(h1, num_blocks); + auto* block = + reinterpret_cast(bits_.data() + blk * BLOCK_BYTES); + + std::uint32_t mask[BLOCK_WORDS]; + compute_block_mask(h2, mask); + for (std::size_t i = 0; i < BLOCK_WORDS; ++i) block[i] |= mask[i]; ++num_entries_; + + if (value.size() <= LAST_VALUE_CAP) { + std::memcpy(last_value_buf_.data(), value.data(), value.size()); + last_value_size_ = value.size(); + last_value_valid_ = true; + } else { + last_value_valid_ = false; + } } bool BloomFilter::possibly_contains(std::string_view value) const { std::uint64_t h1, h2; compute_hashes(value, h1, h2); - for (std::size_t i = 0; i < num_hashes_; ++i) { - std::size_t bit_pos = nth_hash(h1, h2, i); - if (!(bits_[bit_pos / 8] & (1u << (bit_pos % 8)))) { - return false; - } + std::size_t num_blocks = num_bits_ / BLOCK_BITS; + std::size_t blk = block_index(h1, num_blocks); + const auto* block = reinterpret_cast( + bits_.data() + blk * BLOCK_BYTES); + + std::uint32_t mask[BLOCK_WORDS]; + compute_block_mask(h2, mask); + for 
(std::size_t i = 0; i < BLOCK_WORDS; ++i) { + if ((block[i] & mask[i]) != mask[i]) return false; } return true; } @@ -131,9 +187,11 @@ void BloomFilter::merge_from(const BloomFilter& other) { "BloomFilter::merge_from: incompatible filter parameters"); } - for (std::size_t i = 0; i < bits_.size(); ++i) { - bits_[i] |= other.bits_[i]; - } + auto* dst = reinterpret_cast(bits_.data()); + const auto* src = + reinterpret_cast(other.bits_.data()); + std::size_t n = bits_.size() / 8; + for (std::size_t i = 0; i < n; ++i) dst[i] |= src[i]; num_entries_ += other.num_entries_; } diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.cpp index 8866a804..2f436be5 100644 --- a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.cpp +++ b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.cpp @@ -8,48 +8,52 @@ namespace dftracer::utils::utilities::composites::dft::indexing { void ChunkDimensionStats::observe(std::string_view value) { + if (last_key_ != nullptr && *last_key_ == value) { + ++*last_counter_; + return; + } + if (!value_counts) { value_counts.emplace(); } - // NOTE(perf): transparent lookup: find with string_view, only construct - // string on insert auto it = value_counts->find(value); - bool inserted = false; - if (it == value_counts->end()) { - auto [new_it, _] = value_counts->emplace(std::string(value), 0); - it = new_it; - inserted = true; + if (it != value_counts->end()) { + it->second++; + last_key_ = &it->first; + last_counter_ = &it->second; + return; } - it->second++; - if (inserted) { - distinct_count = value_counts->size(); - } + auto [new_it, _] = value_counts->emplace(std::string(value), 1); + it = new_it; + distinct_count = value_counts->size(); + last_key_ = &it->first; + last_counter_ = &it->second; + + const std::string& val_ref = it->first; - // NOTE(perf): min/max: fast-path for uint 
dimensions compare as integers if (value_type == "uint") { std::uint64_t val = 0; auto [ptr, ec] = std::from_chars(value.data(), value.data() + value.size(), val); if (ec == std::errc()) { if (min_value.empty()) { - min_value = it->first; - max_value = it->first; + min_value = val_ref; + max_value = val_ref; } else { std::uint64_t cur_min = 0, cur_max = 0; std::from_chars(min_value.data(), min_value.data() + min_value.size(), cur_min); std::from_chars(max_value.data(), max_value.data() + max_value.size(), cur_max); - if (val < cur_min) min_value = it->first; - if (val > cur_max) max_value = it->first; + if (val < cur_min) min_value = val_ref; + if (val > cur_max) max_value = val_ref; } return; } } - const std::string& val_ref = it->first; if (min_value.empty() || val_ref < min_value) { min_value = val_ref; } @@ -58,6 +62,23 @@ void ChunkDimensionStats::observe(std::string_view value) { } } +void ChunkDimensionStats::observe_range_only(std::uint64_t value) { + distinct_count++; + auto str = std::to_string(value); + if (min_value.empty()) { + min_value = str; + max_value = str; + } else { + std::uint64_t cur_min = 0, cur_max = 0; + std::from_chars(min_value.data(), min_value.data() + min_value.size(), + cur_min); + std::from_chars(max_value.data(), max_value.data() + max_value.size(), + cur_max); + if (value < cur_min) min_value = str; + if (value > cur_max) max_value = str; + } +} + std::vector ChunkDimensionStats::serialize_value_counts() const { if (!value_counts || value_counts->empty()) return {}; @@ -99,23 +120,10 @@ ChunkDimensionStats::compress_value_counts(std::size_t cap_bytes) const { auto raw = serialize_value_counts(); if (raw.empty()) return std::nullopt; - // NOTE(perf): Reuse zlib stream across calls, deflateReset resets state - // without reallocating internal buffers. 
- struct ZlibDeflater { - z_stream strm{}; - bool init = false; - ~ZlibDeflater() { - if (init) deflateEnd(&strm); - } - }; - static thread_local ZlibDeflater zd; - if (!zd.init) { - deflateInit(&zd.strm, Z_DEFAULT_COMPRESSION); - zd.init = true; - } else { - deflateReset(&zd.strm); + z_stream strm{}; + if (deflateInit(&strm, Z_DEFAULT_COMPRESSION) != Z_OK) { + return std::nullopt; } - auto& strm = zd.strm; uLongf compressed_len = compressBound(static_cast(raw.size())); std::vector compressed(compressed_len); @@ -126,9 +134,13 @@ ChunkDimensionStats::compress_value_counts(std::size_t cap_bytes) const { strm.avail_out = static_cast(compressed_len); int rc = deflate(&strm, Z_FINISH); - if (rc != Z_STREAM_END) return std::nullopt; + if (rc != Z_STREAM_END) { + deflateEnd(&strm); + return std::nullopt; + } compressed.resize(strm.total_out); + deflateEnd(&strm); if (compressed.size() > cap_bytes) return std::nullopt; return compressed; @@ -153,10 +165,10 @@ std::uint64_t read_u64_le(const std::uint8_t* p) { } } // namespace -std::unordered_map +dftracer::utils::StringViewMap ChunkDimensionStats::deserialize_value_counts(const std::uint8_t* data, std::size_t len) { - std::unordered_map result; + dftracer::utils::StringViewMap result; if (!data || len < 4) return result; std::size_t pos = 0; @@ -182,7 +194,7 @@ ChunkDimensionStats::deserialize_value_counts(const std::uint8_t* data, return result; } -std::unordered_map +dftracer::utils::StringViewMap ChunkDimensionStats::decompress_value_counts(const std::uint8_t* data, std::size_t len) { if (!data || len == 0) return {}; diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.cpp index 8f9ce32a..40a09c3e 100644 --- a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.cpp @@ -1,19 +1,17 @@ #include 
-#include -#include +#include +#include #include #include #include #include -#include #include #include #include -// Import JsonValue from common json namespace -using dftracer::utils::utilities::common::json::JsonValue; -using dftracer::utils::utilities::composites::dft::DFTracerEvent; +using dftracer::utils::utilities::common::json::JsonParser; +using dftracer::utils::utilities::common::json::ondemand_value_to_string; namespace dftracer::utils::utilities::composites::dft::indexing { @@ -30,23 +28,6 @@ static const std::string DIM_CAT = "cat"; static const std::string DIM_PID = "pid"; static const std::string DIM_TID = "tid"; -// Convert a JsonValue to string for bloom filter insertion. -// Handles strings, integers, floats, bools. -std::string json_value_to_string(const JsonValue& val) { - if (val.is_string()) { - return val.get(); - } else if (val.is_uint()) { - return std::to_string(val.get()); - } else if (val.is_int()) { - return std::to_string(val.get()); - } else if (val.is_number()) { - return std::to_string(val.get()); - } else if (val.is_bool()) { - return val.get() ? 
"true" : "false"; - } - return {}; -} - // Build set of dimensions to index based on config std::vector get_target_dimensions( const ChunkIndexerConfig& config) { @@ -206,6 +187,19 @@ coro::CoroTask ChunkIndexerUtility::process( event_lines; std::map> metadata_lines; + // On-Demand parser for lazy field access - only parses what we use + JsonParser parser; + + // Pre-check which bloom filters we need + const bool need_name = output.bloom_filters.count(DIM_NAME) > 0; + const bool need_cat = output.bloom_filters.count(DIM_CAT) > 0; + const bool need_pid = output.bloom_filters.count(DIM_PID) > 0; + const bool need_tid = output.bloom_filters.count(DIM_TID) > 0; + const bool need_hhash = output.bloom_filters.count(DIM_HHASH) > 0; + const bool need_fhash = output.bloom_filters.count(DIM_FHASH) > 0; + const bool need_shash = output.bloom_filters.count(DIM_SHASH) > 0; + const bool has_extra_dims = !input.config.extra_dimensions.empty(); + while (!stream->done()) { auto chunk = co_await stream->read_async(); @@ -229,153 +223,209 @@ coro::CoroTask ChunkIndexerUtility::process( std::size_t line_len = newline - line_start; if (line_len > 0) { - yyjson_read_flag flg = YYJSON_READ_NOFLAG; - yyjson_doc* doc = - yyjson_read_opts(const_cast(line_start), line_len, - flg, nullptr, nullptr); - - if (doc) { - yyjson_val* root = yyjson_doc_get_root(doc); - if (root && yyjson_is_obj(root)) { - JsonValue json(root); - DFTracerEvent ev; - if (!DFTracerEvent::parse(json, ev)) { - yyjson_doc_free(doc); - pos = (newline - data) + 1; - line_number++; - continue; - } + std::string_view line_sv(line_start, line_len); + if (!parser.parse(line_sv)) { + pos = (newline - data) + 1; + line_number++; + continue; + } + + // Extract ph first to determine event type + auto ph = parser.get_string("ph"); + if (!ph) { + pos = (newline - data) + 1; + line_number++; + continue; + } - if (ev.is_metadata()) { - // Metadata event: collect hash resolutions - if (ev.args.exists()) { - std::string hash_val = - 
ev.args["value"].get(); - std::string resolved = - ev.args["name"].get(); - - if (!hash_val.empty() && !resolved.empty()) { - if (ev.name == "HH") { - output.hash_resolutions[DIM_HHASH] - [hash_val] = - resolved; - } else if (ev.name == "FH") { - output.hash_resolutions[DIM_FHASH] - [hash_val] = - resolved; - } else if (ev.name == "SH") { - output.hash_resolutions[DIM_SHASH] - [hash_val] = - resolved; + bool is_metadata = (*ph == "M"); + + if (is_metadata) { + // Metadata event: extract name and args in single pass + // Re-parse to get fresh document state + parser.parse(line_sv); + + std::string event_name; + std::string hash_val; + std::string resolved; + + parser.for_each_field([&](std::string_view key, + simdjson::ondemand::value val) { + if (key == "name") { + auto s = val.get_string(); + if (!s.error()) event_name = std::string(s.value()); + } else if (key == "args") { + auto obj = val.get_object(); + if (!obj.error()) { + for (auto field : obj.value()) { + if (field.error()) continue; + auto fkey = field.unescaped_key(); + if (fkey.error()) continue; + auto fval = field.value(); + if (fval.error()) continue; + + if (fkey.value() == "value") { + auto s = fval.value().get_string(); + if (!s.error()) + hash_val = std::string(s.value()); + } else if (fkey.value() == "name") { + auto s = fval.value().get_string(); + if (!s.error()) + resolved = std::string(s.value()); } } } - if (collect_manifest) { - std::string meta_type(ev.name); - metadata_lines[meta_type].push_back( - line_number); - } - } else { - // Regular event: index into bloom filters + stats - - // Update statistics (always update for accuracy) - output.statistics.update_from_event( - ev.name, ev.cat, ev.pid, ev.tid, ev.ts, ev.dur); - - // Add to bloom filters for missing dimensions only - auto it = output.bloom_filters.find(DIM_NAME); - if (it != output.bloom_filters.end() && - !ev.name.empty()) { - it->second.add(ev.name); - } + } + }); + + if (!hash_val.empty() && !resolved.empty()) { + if 
(event_name == "HH") { + output.hash_resolutions[DIM_HHASH][hash_val] = + resolved; + } else if (event_name == "FH") { + output.hash_resolutions[DIM_FHASH][hash_val] = + resolved; + } else if (event_name == "SH") { + output.hash_resolutions[DIM_SHASH][hash_val] = + resolved; + } + } - it = output.bloom_filters.find(DIM_CAT); - if (it != output.bloom_filters.end() && - !ev.cat.empty()) { - it->second.add(ev.cat); - } + if (collect_manifest) { + metadata_lines[event_name].push_back(line_number); + } + } else { + // Regular event: re-parse for fresh state and extract + // fields + parser.parse(line_sv); + auto name_opt = parser.get_string("name"); + std::string_view name = name_opt.value_or(""); + auto cat_opt = parser.get_string("cat"); + std::string_view cat = cat_opt.value_or(""); + + auto pid = parser.get_uint64("pid").value_or(0); + auto tid = parser.get_uint64("tid").value_or(0); + auto ts = parser.get_uint64("ts").value_or(0); + auto dur = parser.get_uint64("dur").value_or(0); + + // Update statistics + output.statistics.update_from_event(name, cat, pid, tid, ts, + dur); + + // Add to bloom filters + if (need_name && !name.empty()) { + output.bloom_filters[DIM_NAME].add(name); + } - it = output.bloom_filters.find(DIM_PID); - if (it != output.bloom_filters.end()) { - char pid_buf[32]; - int n = std::snprintf( - pid_buf, sizeof(pid_buf), "%llu", - static_cast(ev.pid)); - it->second.add(std::string_view(pid_buf, n)); - } + if (need_cat && !cat.empty()) { + output.bloom_filters[DIM_CAT].add(cat); + } - it = output.bloom_filters.find(DIM_TID); - if (it != output.bloom_filters.end()) { - char tid_buf[32]; - int n = std::snprintf( - tid_buf, sizeof(tid_buf), "%llu", - static_cast(ev.tid)); - it->second.add(std::string_view(tid_buf, n)); - } + if (need_pid) { + char pid_buf[32]; + int n = + std::snprintf(pid_buf, sizeof(pid_buf), "%llu", + static_cast(pid)); + output.bloom_filters[DIM_PID].add( + std::string_view(pid_buf, n)); + } - if (ev.args.exists()) { - // Hash 
dimensions: add hash to bloom - it = output.bloom_filters.find(DIM_HHASH); - if (it != output.bloom_filters.end()) { - std::string_view hhash = - ev.args["hhash"] - .get(); - if (!hhash.empty()) { - it->second.add(hhash); - } - } + if (need_tid) { + char tid_buf[32]; + int n = + std::snprintf(tid_buf, sizeof(tid_buf), "%llu", + static_cast(tid)); + output.bloom_filters[DIM_TID].add( + std::string_view(tid_buf, n)); + } - it = output.bloom_filters.find(DIM_FHASH); - if (it != output.bloom_filters.end()) { - std::string_view fhash = - ev.args["fhash"] - .get(); - if (!fhash.empty()) { - it->second.add(fhash); - } + // Process args for hash dimensions and extra dimensions + if (need_hhash || need_fhash || need_shash || + has_extra_dims) { + parser.for_each_field("args", [&](std::string_view key, + simdjson::ondemand:: + value val) { + if (need_hhash && key == "hhash") { + auto s = val.get_string(); + if (!s.error() && !s.value().empty()) { + output.bloom_filters[DIM_HHASH].add( + s.value()); } - - it = output.bloom_filters.find(DIM_SHASH); - if (it != output.bloom_filters.end()) { - // shash can be under cmd_hash or exec_hash - std::string_view shash = - ev.args["cmd_hash"] - .get(); - if (shash.empty()) { - shash = ev.args["exec_hash"] - .get(); - } - if (!shash.empty()) { - it->second.add(shash); - } + } else if (need_fhash && key == "fhash") { + auto s = val.get_string(); + if (!s.error() && !s.value().empty()) { + output.bloom_filters[DIM_FHASH].add( + s.value()); } - - // Extra dimensions: arbitrary nested dot-paths + } else if (need_shash && (key == "cmd_hash" || + key == "exec_hash")) { + auto s = val.get_string(); + if (!s.error() && !s.value().empty()) { + output.bloom_filters[DIM_SHASH].add( + s.value()); + } + } else if (has_extra_dims) { + // Check if this key matches any extra dimension for (const auto& dim : input.config.extra_dimensions) { - it = output.bloom_filters.find(dim); - if (it != output.bloom_filters.end()) { - JsonValue val = 
ev.args.at(dim.c_str()); - if (val.exists()) { - std::string str_val = - json_value_to_string(val); - if (!str_val.empty()) { - it->second.add(str_val); + // Check for exact match (flat key) + if (key == dim) { + std::string str_val = + ondemand_value_to_string(val); + if (!str_val.empty()) { + output.bloom_filters[dim].add( + str_val); + } + break; + } + // Check for nested key (e.g., "io.size") + auto dot_pos = dim.find('.'); + if (dot_pos != std::string::npos) { + std::string_view prefix(dim.data(), + dot_pos); + if (key == prefix) { + // Navigate into nested object + std::string_view suffix( + dim.data() + dot_pos + 1, + dim.size() - dot_pos - 1); + auto obj = val.get_object(); + if (!obj.error()) { + for (auto field : obj.value()) { + if (field.error()) continue; + auto fkey = + field.unescaped_key(); + if (fkey.error()) continue; + if (fkey.value() == + suffix) { + auto fval = + field.value(); + if (fval.error()) + continue; + std::string str_val = + ondemand_value_to_string( + fval.value()); + if (!str_val.empty()) { + output + .bloom_filters + [dim] + .add(str_val); + } + break; + } + } } } } } } + }); + } - if (collect_manifest) { - event_lines[{std::string(ev.cat), - std::string(ev.name)}] - .push_back(line_number); - } - output.events_processed++; - } + if (collect_manifest) { + event_lines[{std::string(cat), std::string(name)}] + .push_back(line_number); } - yyjson_doc_free(doc); + output.events_processed++; } } diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.cpp index b30ecbf7..98288a06 100644 --- a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.cpp @@ -1,14 +1,16 @@ #include -#include #include +#include #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -37,6 
+39,14 @@ bool looks_like_hash(const std::string& value) { return true; } +std::optional dim_to_hash_type( + const std::string& dim) { + if (dim == "fhash") return IndexDatabase::HashType::FILE; + if (dim == "hhash") return IndexDatabase::HashType::HOST; + if (dim == "shash") return IndexDatabase::HashType::STRING; + return std::nullopt; +} + struct PrunerContext { int file_info_id; std::uint64_t total_chunks; @@ -45,9 +55,13 @@ struct PrunerContext { std::unordered_map> bloom_filters; + std::unordered_map + ts_histograms; // Hash resolution: human-readable value → hash strings std::unordered_map> hash_cache; + std::unordered_map> name_postings; + std::unordered_map name_file_membership; const IndexDatabase* db = nullptr; int fid = -1; @@ -67,13 +81,47 @@ struct PrunerContext { if (it != hash_cache.end()) return it->second; if (db) { - auto hashes = db->query_hash_by_resolved(dim, val); - auto& cached = hash_cache[key]; - cached = std::move(hashes); - return cached; + auto hash_type = dim_to_hash_type(dim); + if (hash_type) { + auto hash = db->resolve_name_to_hash(*hash_type, val); + auto& cached = hash_cache[key]; + if (hash) { + cached.push_back(std::move(*hash)); + } + return cached; + } } return empty; } + + const std::set& resolve_name_chunks(const std::string& val) { + static const std::set empty; + auto it = name_postings.find(val); + if (it != name_postings.end()) return it->second; + if (!db || fid < 0) return empty; + + auto chunk_ids = db->query_name_chunk_postings(val, fid); + auto& cached = name_postings[val]; + cached.insert(chunk_ids.begin(), chunk_ids.end()); + return cached; + } + + std::optional file_contains_name(const std::string& val) { + auto it = name_file_membership.find(val); + if (it != name_file_membership.end()) return it->second; + if (!db || fid < 0) return std::nullopt; + + auto name_id = db->query_name_id(val); + if (!name_id.has_value()) { + return std::nullopt; + } + + auto file_ids = db->query_name_file_postings(val); + const bool 
present = + std::find(file_ids.begin(), file_ids.end(), fid) != file_ids.end(); + name_file_membership[val] = present; + return present; + } }; std::string literal_to_string(const query_ns::LiteralNode& lit) { @@ -103,6 +151,8 @@ std::optional dict_contains(const ChunkMeta& meta, const std::string& dim, const std::string& val) { auto it = meta.dim_stats.find(dim); if (it == meta.dim_stats.end()) return std::nullopt; + if (!it->second.has_value_counts_payload()) return std::nullopt; + it->second.ensure_value_counts_decoded(); if (!it->second.value_counts) return std::nullopt; return it->second.value_counts->count(val) > 0; } @@ -113,6 +163,8 @@ std::optional dict_excludes(const ChunkMeta& meta, const std::string& dim, const std::string& val) { auto it = meta.dim_stats.find(dim); if (it == meta.dim_stats.end()) return std::nullopt; + if (!it->second.has_value_counts_payload()) return std::nullopt; + it->second.ensure_value_counts_decoded(); if (!it->second.value_counts) return std::nullopt; auto& vc = *it->second.value_counts; // If the only value in the chunk IS val, all events match val @@ -129,13 +181,26 @@ int compare_values(const std::string& a, const std::string& b, const std::string& vtype) { if (is_numeric_type(vtype)) { try { + if (vtype == "uint") { + auto ua = std::stoull(a); + auto ub = std::stoull(b); + if (ua < ub) return -1; + if (ua > ub) return 1; + return 0; + } + if (vtype == "int") { + auto ia = std::stoll(a); + auto ib = std::stoll(b); + if (ia < ib) return -1; + if (ia > ib) return 1; + return 0; + } double da = std::stod(a); double db = std::stod(b); if (da < db) return -1; if (da > db) return 1; return 0; } catch (...) 
{ - // Fall through to string comparison } } if (a < b) return -1; @@ -187,6 +252,28 @@ bool bloom_may_contain(PrunerContext& ctx, const std::string& dim, return bloom_probe(ctx, dim, ckpt, val); } +// Tier 4: Timestamp histogram check -- zero events in range means skip +bool histogram_has_events(PrunerContext& ctx, std::uint64_t ckpt, + query_ns::CompareOp op, std::uint64_t ts_val) { + auto it = ctx.ts_histograms.find(ckpt); + if (it == ctx.ts_histograms.end()) return true; + const auto& hist = it->second; + if (hist.empty() || ts_val == UINT64_MAX) return true; + + switch (op) { + case query_ns::CompareOp::GT: + return hist.count_in_range(ts_val + 1, UINT64_MAX) > 0; + case query_ns::CompareOp::GE: + return hist.count_in_range(ts_val, UINT64_MAX) > 0; + case query_ns::CompareOp::LT: + return hist.count_in_range(0, ts_val) > 0; + case query_ns::CompareOp::LE: + return hist.count_in_range(0, ts_val + 1) > 0; + default: + return true; + } +} + // Recursive AST evaluation: returns candidate chunk set std::set evaluate_node(const query_ns::QueryNode& node, PrunerContext& ctx); @@ -196,6 +283,19 @@ std::set eval_compare(const query_ns::CompareNode& n, std::set result; auto val_str = literal_to_string(n.value); + if (n.field.path == "name" && n.op == query_ns::CompareOp::EQ) { + auto contains = ctx.file_contains_name(val_str); + if (contains.has_value()) { + if (!*contains) { + return result; + } + auto exact_chunks = ctx.resolve_name_chunks(val_str); + if (!exact_chunks.empty()) { + return exact_chunks; + } + } + } + for (auto ckpt : ctx.all_chunks) { auto chunk_it = ctx.chunks.find(ckpt); ChunkMeta empty_meta; @@ -220,8 +320,17 @@ std::set eval_compare(const query_ns::CompareNode& n, result.insert(ckpt); } else { // Range operators: Tier 2 - if (range_may_match(meta, n.field.path, n.op, val_str)) - result.insert(ckpt); + if (!range_may_match(meta, n.field.path, n.op, val_str)) continue; + // Tier 4: histogram for ts queries + if (n.field.path == "ts") { + try { + auto ts_val = 
std::stoull(val_str); + if (!histogram_has_events(ctx, ckpt, n.op, ts_val)) + continue; + } catch (...) { + } + } + result.insert(ckpt); } } return result; @@ -229,6 +338,26 @@ std::set eval_compare(const query_ns::CompareNode& n, std::set eval_in(const query_ns::InNode& n, PrunerContext& ctx) { std::set result; + if (n.field.path == "name") { + // Take the postings shortcut only when every IN value resolved. + bool all_exact = true; + for (const auto& elem : n.values.elements) { + auto val_str = literal_to_string(elem); + auto contains = ctx.file_contains_name(val_str); + if (contains.has_value() && !*contains) continue; + if (contains.has_value()) { + const auto& exact_chunks = ctx.resolve_name_chunks(val_str); + result.insert(exact_chunks.begin(), exact_chunks.end()); + if (!exact_chunks.empty()) continue; + } + // Unknown membership or no postings: fall back to the scan. + all_exact = false; + break; + } + if (all_exact && !result.empty()) return result; + result.clear(); + } + for (auto ckpt : ctx.all_chunks) { auto chunk_it = ctx.chunks.find(ckpt); ChunkMeta empty_meta; @@ -272,11 +401,17 @@ std::set eval_not_in(const query_ns::NotInNode& n, auto& meta = chunk_it->second; auto dim_it = meta.dim_stats.find(n.field.path); - if (dim_it == meta.dim_stats.end() || !dim_it->second.value_counts) { + if (dim_it == meta.dim_stats.end() || + !dim_it->second.has_value_counts_payload()) { // No dictionary — cannot safely skip result.insert(ckpt); continue; } + dim_it->second.ensure_value_counts_decoded(); + if (!dim_it->second.value_counts) { + result.insert(ckpt); + continue; + } auto& vc = *dim_it->second.value_counts; bool all_excluded = true; @@ -347,7 +482,15 @@ coro::CoroTask ChunkPrunerUtility::process( out.file_may_match = false; try { - IndexDatabase idx_db(input.index_path); + std::optional owned_db; + IndexDatabase* db_ptr = input.external_db; + if (!db_ptr) { + owned_db.emplace(input.index_path, + dftracer::utils::rocksdb::RocksDatabase:: + OpenMode::ReadOnly); + db_ptr = &*owned_db; + } + IndexDatabase& idx_db = *db_ptr; int fid = idx_db.get_file_info_id(get_logical_path(input.file_path)); if (fid < 0) { @@ -371,6 +514,15 @@ coro::CoroTask ChunkPrunerUtility::process( 
ctx.chunks[ds.checkpoint_idx].dim_stats[ds.dimension] = ds; } + // Load timestamp histograms for Tier 4 pruning + auto chunk_stats = idx_db.query_chunk_statistics(fid); + for (auto& row : chunk_stats) { + if (!row.stats.timestamp_histogram.empty()) { + ctx.ts_histograms[row.checkpoint_idx] = + std::move(row.stats.timestamp_histogram); + } + } + // Load bloom filters for all dimensions auto indexed_dims = idx_db.query_index_dimensions(fid); auto all_chunk_blooms = @@ -426,7 +578,132 @@ coro::CoroTask ChunkPrunerUtility::process( return out; }; - co_return co_await rocksdb::run(do_query); + co_return do_query(); +} + +ChunkPrunerBatchOutput ChunkPrunerUtility::process_batch( + const ChunkPrunerBatchInput& input) { + ChunkPrunerBatchOutput batch_out; + batch_out.outputs.resize(input.items.size()); + batch_out.success = false; + + try { + std::optional owned_db; + IndexDatabase* db_ptr = input.external_db; + if (!db_ptr) { + owned_db.emplace( + input.index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + db_ptr = &*owned_db; + } + IndexDatabase& idx_db = *db_ptr; + + // Collect all (item_idx -> fid) mappings up front. + std::vector fids; + fids.reserve(input.items.size()); + std::vector item_to_fid(input.items.size(), -1); + for (std::size_t i = 0; i < input.items.size(); ++i) { + int fid = idx_db.get_file_info_id( + get_logical_path(input.items[i].file_path)); + item_to_fid[i] = fid; + if (fid >= 0) fids.push_back(fid); + } + + // Batch-load per-fid dim_stats and chunk_statistics with one + // RocksDB column-family scan each instead of N scans. + auto all_dim_stats = idx_db.query_chunk_dimension_stats_batch(fids); + auto all_chunk_stats = idx_db.query_chunk_statistics_batch(fids); + + // Per-file eval: blooms / index_dimensions queries still happen + // per file but against the shared DB handle. 
+ for (std::size_t i = 0; i < input.items.size(); ++i) { + const auto& item = input.items[i]; + auto& out = batch_out.outputs[i]; + out.success = false; + out.file_may_match = false; + + int fid = item_to_fid[i]; + if (fid < 0) { + out.success = true; + out.file_may_match = true; + continue; + } + + try { + PrunerContext ctx; + ctx.file_info_id = fid; + ctx.cache = input.cache; + ctx.index_path = input.index_path; + ctx.db = &idx_db; + ctx.fid = fid; + + auto dim_it = all_dim_stats.find(fid); + if (dim_it != all_dim_stats.end()) { + for (const auto& ds : dim_it->second) { + ctx.all_chunks.insert(ds.checkpoint_idx); + ctx.chunks[ds.checkpoint_idx].dim_stats[ds.dimension] = + ds; + } + } + + auto cs_it = all_chunk_stats.find(fid); + if (cs_it != all_chunk_stats.end()) { + for (auto& row : cs_it->second) { + if (!row.stats.timestamp_histogram.empty()) { + ctx.ts_histograms[row.checkpoint_idx] = + std::move(row.stats.timestamp_histogram); + } + } + } + + auto indexed_dims = idx_db.query_index_dimensions(fid); + auto all_chunk_blooms = + idx_db.query_chunk_bloom_filters_batch(fid, indexed_dims); + for (const auto& [dim, chunk_blooms] : all_chunk_blooms) { + for (const auto& cb : chunk_blooms) { + ctx.all_chunks.insert(cb.checkpoint_idx); + BloomFilter bf = BloomFilter::from_blob( + cb.bloom_data.data(), cb.bloom_data.size()); + if (input.cache) { + input.cache->put(input.index_path, dim, + cb.checkpoint_idx, bf); + } + ctx.bloom_filters[dim][cb.checkpoint_idx] = + std::move(bf); + } + } + + ctx.total_chunks = + ctx.all_chunks.empty() ? 
0 : *ctx.all_chunks.rbegin() + 1; + out.total_checkpoints = ctx.total_chunks; + + if (ctx.all_chunks.empty()) { + out.file_may_match = true; + out.success = true; + continue; + } + + auto candidates = evaluate_node(item.query.root(), ctx); + out.candidate_checkpoints.assign(candidates.begin(), + candidates.end()); + out.file_may_match = !out.candidate_checkpoints.empty(); + out.success = true; + } catch (const std::exception& e) { + DFTRACER_UTILS_LOG_WARN( + "ChunkPruner: error for %s: %s, assuming match", + item.file_path.c_str(), e.what()); + out.file_may_match = true; + out.success = true; + } + } + + batch_out.success = true; + } catch (const std::exception& e) { + DFTRACER_UTILS_LOG_WARN("ChunkPruner: batch error for index %s: %s", + input.index_path.c_str(), e.what()); + } + + return batch_out; } } // namespace dftracer::utils::utilities::composites::dft::indexing diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.cpp index 6aa594a9..2c9ad274 100644 --- a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.cpp +++ b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.cpp @@ -1,11 +1,13 @@ #include -#include +#include #include #include #include #include +#include #include +#include #include namespace dftracer::utils::utilities::composites::dft::indexing { @@ -30,9 +32,19 @@ void ChunkStatistics::update_from_event(std::string_view name, } std::string_view pt_sv(pt_buf, tp - pt_buf); - category_counts[std::string(cat)]++; - name_counts[std::string(name)]++; - pid_tid_counts[std::string(pt_sv)]++; + // Increment counts with a single lookup — allocate a string only on + // first observation. 
+ auto bump = [](auto& map, std::string_view key) { + auto it = map.find(key); + if (it == map.end()) { + map.emplace(std::string(key), 1); + } else { + it->second++; + } + }; + bump(category_counts, cat); + bump(name_counts, name); + bump(pid_tid_counts, pt_sv); if (ts < min_timestamp_us) min_timestamp_us = ts; std::uint64_t end_ts = ts + dur; @@ -58,9 +70,16 @@ void ChunkStatistics::update_from_event(std::string_view name, double dur_d = static_cast(dur); duration_sketch.add(dur_d); duration_histogram.add(dur); - - auto [sketch_it, _3] = - name_duration_sketches.try_emplace(std::string(name)); + timestamp_histogram.add(ts); + + // name_duration_sketches: transparent find, allocate only on first + // observation, then reuse the interned key for the other name_*_ maps. + auto sketch_it = name_duration_sketches.find(name); + if (sketch_it == name_duration_sketches.end()) { + auto [new_it, _] = name_duration_sketches.emplace( + std::string(name), common::statistics::DDSketch{}); + sketch_it = new_it; + } sketch_it->second.add(dur_d); const std::string& name_key = sketch_it->first; @@ -105,6 +124,7 @@ void ChunkStatistics::merge_from(const ChunkStatistics& other) { duration_sketch.merge(other.duration_sketch); duration_histogram.merge(other.duration_histogram); + timestamp_histogram.merge(other.timestamp_histogram); for (const auto& [k, v] : other.name_duration_sketches) { name_duration_sketches[k].merge(v); @@ -135,82 +155,92 @@ double ChunkStatistics::duration_variance() const { } +namespace { + +// Writes `s` as a quoted JSON string. Quotes, backslashes and control +// characters are escaped so arbitrary event names cannot break the JSON. +void write_json_string(std::ostream& os, std::string_view s) { + static constexpr char kHex[] = "0123456789abcdef"; + os << '"'; + for (const char ch : s) { + const auto c = static_cast<unsigned char>(ch); + if (c == '"' || c == '\\') { + os << '\\' << ch; + } else if (c < 0x20) { + os << "\\u00" << kHex[c >> 4] << kHex[c & 0x0F]; + } else { + os << ch; + } + } + os << '"'; +} + +} // namespace + std::string ChunkStatistics::name_category_json() const { - yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr); - yyjson_mut_val* root = yyjson_mut_obj(doc); - yyjson_mut_doc_set_root(doc, root); - + std::ostringstream ss; + ss << '{'; + bool first = true; for (const auto& [key, value] : name_category) { - yyjson_mut_obj_add_str(doc, root, key.c_str(), value.c_str()); + if (!first) ss << ','; + first = false; + write_json_string(ss, key); + ss << ':'; + write_json_string(ss, value); } - - char* json_str = 
yyjson_mut_write(doc, YYJSON_WRITE_NOFLAG, nullptr); - std::string result(json_str ? json_str : "{}"); - if (json_str) free(json_str); - yyjson_mut_doc_free(doc); - return result; + ss << '}'; + return ss.str(); } -std::unordered_map -ChunkStatistics::parse_string_map_json(const std::string& json) { - std::unordered_map result; +StringViewMap ChunkStatistics::parse_string_map_json( + const std::string& json) { + StringViewMap result; - yyjson_doc* doc = - yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG); - if (!doc) return result; + simdjson::dom::parser parser; + auto parse_result = parser.parse(json.data(), json.size()); + if (parse_result.error()) return result; - yyjson_val* root = yyjson_doc_get_root(doc); - if (!root || !yyjson_is_obj(root)) { - yyjson_doc_free(doc); - return result; - } + auto root = parse_result.value_unsafe(); + if (!root.is_object()) return result; - yyjson_obj_iter iter; - yyjson_obj_iter_init(root, &iter); - yyjson_val* key; - while ((key = yyjson_obj_iter_next(&iter))) { - yyjson_val* val = yyjson_obj_iter_get_val(key); - if (yyjson_is_str(val)) { - result[yyjson_get_str(key)] = yyjson_get_str(val); + auto obj = root.get_object().value_unsafe(); + for (auto field : obj) { + auto val_result = field.value.get_string(); + if (!val_result.error()) { + result[std::string(field.key)] = + std::string(val_result.value_unsafe()); } } - - yyjson_doc_free(doc); return result; } std::string ChunkStatistics::name_duration_histograms_json() const { - yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr); - yyjson_mut_val* root = yyjson_mut_obj(doc); - yyjson_mut_doc_set_root(doc, root); - + std::ostringstream ss; + ss << '{'; + bool first = true; for (const auto& [key, hist] : name_duration_histograms) { - yyjson_mut_val* arr = hist.to_yyjson(doc); - yyjson_mut_obj_add_val(doc, root, key.c_str(), arr); + if (!first) ss << ','; + first = false; + write_json_string(ss, key); + ss << ':' << hist.to_json(); } - - char* json_str = yyjson_mut_write(doc, 
YYJSON_WRITE_NOFLAG, nullptr); - std::string result(json_str ? json_str : "{}"); - if (json_str) free(json_str); - yyjson_mut_doc_free(doc); - return result; + ss << '}'; + return ss.str(); } namespace { -std::string double_map_to_json( - const std::unordered_map& map) { - yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr); - yyjson_mut_val* root = yyjson_mut_obj(doc); - yyjson_mut_doc_set_root(doc, root); - +template +std::string double_map_to_json(const Map& map) { + std::ostringstream ss; + ss << std::setprecision(17) << '{'; + bool first = true; for (const auto& [key, value] : map) { - yyjson_mut_obj_add_real(doc, root, key.c_str(), value); + if (!first) ss << ','; + first = false; + write_json_string(ss, key); + ss << ':' << value; } - - char* json_str = yyjson_mut_write(doc, YYJSON_WRITE_NOFLAG, nullptr); - std::string result(json_str ? json_str : "{}"); - if (json_str) free(json_str); - yyjson_mut_doc_free(doc); - return result; + ss << '}'; + return ss.str(); } } // namespace @@ -258,89 +288,82 @@ std::vector ChunkStatistics::serialize_name_duration_sketches() return buf; } -std::unordered_map ChunkStatistics::parse_double_map_json( +StringViewMap ChunkStatistics::parse_double_map_json( const std::string& json) { - std::unordered_map result; - - yyjson_doc* doc = - yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG); - if (!doc) return result; - - yyjson_val* root = yyjson_doc_get_root(doc); - if (!root || !yyjson_is_obj(root)) { - yyjson_doc_free(doc); - return result; - } - - yyjson_obj_iter iter; - yyjson_obj_iter_init(root, &iter); - yyjson_val* key; - while ((key = yyjson_obj_iter_next(&iter))) { - yyjson_val* val = yyjson_obj_iter_get_val(key); - if (yyjson_is_real(val)) { - result[yyjson_get_str(key)] = yyjson_get_real(val); - } else if (yyjson_is_int(val)) { - result[yyjson_get_str(key)] = - static_cast(yyjson_get_int(val)); - } else if (yyjson_is_uint(val)) { - result[yyjson_get_str(key)] = - static_cast(yyjson_get_uint(val)); + StringViewMap result; + + 
simdjson::dom::parser parser; + auto parse_result = parser.parse(json.data(), json.size()); + if (parse_result.error()) return result; + + auto root = parse_result.value_unsafe(); + if (!root.is_object()) return result; + + auto obj = root.get_object().value_unsafe(); + for (auto field : obj) { + auto double_result = field.value.get_double(); + if (!double_result.error()) { + result[std::string(field.key)] = double_result.value_unsafe(); + } else { + auto int_result = field.value.get_int64(); + if (!int_result.error()) { + result[std::string(field.key)] = + static_cast(int_result.value_unsafe()); + } else { + auto uint_result = field.value.get_uint64(); + if (!uint_result.error()) { + result[std::string(field.key)] = + static_cast(uint_result.value_unsafe()); + } + } } } - - yyjson_doc_free(doc); return result; } -std::unordered_map +StringViewMap ChunkStatistics::parse_histogram_map_json(const std::string& json) { - std::unordered_map result; + StringViewMap result; - yyjson_doc* doc = - yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG); - if (!doc) return result; + simdjson::dom::parser parser; + auto parse_result = parser.parse(json.data(), json.size()); + if (parse_result.error()) return result; - yyjson_val* root = yyjson_doc_get_root(doc); - if (!root || !yyjson_is_obj(root)) { - yyjson_doc_free(doc); - return result; - } + auto root = parse_result.value_unsafe(); + if (!root.is_object()) return result; - yyjson_obj_iter iter; - yyjson_obj_iter_init(root, &iter); - yyjson_val* key; - while ((key = yyjson_obj_iter_next(&iter))) { - yyjson_val* val = yyjson_obj_iter_get_val(key); - if (!yyjson_is_arr(val)) continue; + auto obj = root.get_object().value_unsafe(); + for (auto field : obj) { + if (!field.value.is_array()) continue; common::statistics::Log2Histogram hist; - std::size_t idx, max; - yyjson_val* pair; - yyjson_arr_foreach(val, idx, max, pair) { - if (!yyjson_is_arr(pair) || yyjson_arr_size(pair) != 2) continue; - yyjson_val* bin_idx_val = 
yyjson_arr_get(pair, 0); - yyjson_val* count_val = yyjson_arr_get(pair, 1); - if (!yyjson_is_uint(bin_idx_val) || !yyjson_is_uint(count_val)) - continue; + auto arr = field.value.get_array().value_unsafe(); + for (auto pair : arr) { + if (!pair.is_array()) continue; + auto pair_arr = pair.get_array().value_unsafe(); + if (pair_arr.size() != 2) continue; + + auto bin_idx_result = pair_arr.at(0).get_uint64(); + auto count_result = pair_arr.at(1).get_uint64(); + if (bin_idx_result.error() || count_result.error()) continue; + auto bin_idx = - static_cast(yyjson_get_uint(bin_idx_val)); - auto count = yyjson_get_uint(count_val); + static_cast(bin_idx_result.value_unsafe()); + auto count = count_result.value_unsafe(); if (bin_idx < common::statistics::Log2Histogram::NUM_BINS) { hist.add(common::statistics::Log2Histogram::bin_lower(bin_idx), count); } } - result[yyjson_get_str(key)] = std::move(hist); + result[std::string(field.key)] = std::move(hist); } - - yyjson_doc_free(doc); return result; } -std::unordered_map +StringViewMap ChunkStatistics::deserialize_name_duration_sketches(const std::uint8_t* data, std::size_t len) { - std::unordered_map result; + StringViewMap result; if (!data || len < sizeof(std::uint32_t)) return result; const std::uint8_t* p = data; diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.cpp new file mode 100644 index 00000000..2cad9e6b --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.cpp @@ -0,0 +1,324 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace dftracer::utils::utilities::composites::dft::indexing { + +namespace { + +namespace rcf = dftracer::utils::rocksdb::cf; + +using aggregators::AGG_FILE_KEY_LEN; +using aggregators::AGG_FILE_KEY_PREFIX; +using aggregators::AGG_GLOBAL_CONFIG_KEY; 
+using aggregators::deserialize_agg_global_config; +using indexer::has_capability; +using indexer::IndexDatabase; +using indexer::IndexFileEntryCapability; + +struct PendingFile { + std::size_t file_index; + std::string file_path; + std::string logical_path; +}; + +struct ResolveGroupInput { + std::string index_path; + std::vector files; + bool require_checkpoints; + bool require_bloom; + bool require_manifest; + bool require_aggregation; + std::optional aggregation_config; +}; + +struct ResolveGroupOutput { + std::vector needs_checkpoint; + std::vector needs_bloom; + std::vector needs_manifest; + std::vector needs_aggregation; + std::vector cached; + bool success = true; + std::string error_message; + + // Aggregation augmentation info + bool needs_augmentation = false; + std::uint64_t stored_time_interval_us = 0; +}; + +ResolveGroupOutput resolve_group_sync(ResolveGroupInput input) { + ResolveGroupOutput result; + + if (input.index_path.empty() || !fs::exists(input.index_path)) { + for (auto& f : input.files) { + result.needs_checkpoint.push_back( + FileWorkItem{f.file_index, std::move(f.file_path), -1}); + } + return result; + } + + try { + IndexDatabase db( + input.index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + auto registry = db.query_all_file_registry(); + + // Check global aggregation config first + bool agg_config_compatible = false; + if (input.require_aggregation && input.aggregation_config) { + std::string global_config_val; + auto status = + db.db()->get(std::string_view(AGG_GLOBAL_CONFIG_KEY, 2), + &global_config_val, rcf::AGGREGATION); + if (status.ok() && !global_config_val.empty()) { + auto global_cfg = + deserialize_agg_global_config(global_config_val); + // config_hash == 0 means "any config is compatible" + // Otherwise recompute hash with stored time_interval to check + bool hashes_match = global_cfg.config_hash == 0; + if (!hashes_match) { + auto check_config = *input.aggregation_config; + 
check_config.time_interval_us = global_cfg.time_interval_us; + hashes_match = + check_config.compute_hash() == global_cfg.config_hash; + } + if (hashes_match) { + agg_config_compatible = true; + result.stored_time_interval_us = + global_cfg.time_interval_us; + if (global_cfg.time_interval_us != + input.aggregation_config->time_interval_us) { + result.needs_augmentation = true; + } + } + } + } + + // Build set of file_ids with aggregation data (key existence = cached) + std::unordered_set agg_cached_file_ids; + if (input.require_aggregation && agg_config_compatible) { + auto iter = db.db()->new_iterator(rcf::AGGREGATION); + if (iter) { + iter->Seek(AGG_FILE_KEY_PREFIX); + while (iter->Valid()) { + auto key = iter->key(); + if (key.size() < AGG_FILE_KEY_LEN || + key[0] != AGG_FILE_KEY_PREFIX[0] || + key[1] != AGG_FILE_KEY_PREFIX[1]) { + break; + } + std::int32_t file_id = + (static_cast( + static_cast(key[2])) + << 24) | + (static_cast( + static_cast(key[3])) + << 16) | + (static_cast( + static_cast(key[4])) + << 8) | + static_cast( + static_cast(key[5])); + agg_cached_file_ids.insert(file_id); + iter->Next(); + } + } + } + + for (auto& f : input.files) { + auto reg_it = registry.find(f.logical_path); + if (reg_it == registry.end()) { + result.needs_checkpoint.push_back( + FileWorkItem{f.file_index, std::move(f.file_path), -1}); + continue; + } + + const auto& reg = reg_it->second; + auto caps = reg.capabilities; + bool has_checkpoints = + has_capability(caps, IndexFileEntryCapability::CHECKPOINTS) || + has_capability(caps, IndexFileEntryCapability::FILE_SUMMARY); + bool has_bloom = + has_capability(caps, IndexFileEntryCapability::BLOOM); + bool has_manifest = + has_capability(caps, IndexFileEntryCapability::MANIFEST); + + if (input.require_checkpoints && !has_checkpoints) { + result.needs_checkpoint.push_back(FileWorkItem{ + f.file_index, std::move(f.file_path), reg.file_id}); + continue; + } + + if (input.require_bloom && !has_bloom) { + 
result.needs_bloom.push_back(FileWorkItem{ + f.file_index, std::move(f.file_path), reg.file_id}); + continue; + } + + if (input.require_manifest && !has_manifest) { + result.needs_manifest.push_back(FileWorkItem{ + f.file_index, std::move(f.file_path), reg.file_id}); + continue; + } + + if (input.require_aggregation && + agg_cached_file_ids.find(reg.file_id) == + agg_cached_file_ids.end()) { + result.needs_aggregation.push_back(FileWorkItem{ + f.file_index, std::move(f.file_path), reg.file_id}); + continue; + } + + result.cached.push_back(ResolvedFile{ + f.file_index, std::move(f.file_path), reg.file_id, caps}); + } + + result.success = true; + } catch (const std::exception& e) { + result.success = false; + result.error_message = e.what(); + for (auto& f : input.files) { + result.needs_checkpoint.push_back( + FileWorkItem{f.file_index, std::move(f.file_path), -1}); + } + } + + return result; +} + +} // namespace + +coro::CoroTask IndexResolverUtility::process( + const ResolverInput& input) { + ResolverResult result; + + if (!input.directory.empty()) { + filesystem::PatternDirectoryScannerUtilityInput scan_input{ + input.directory, {".pfw", ".pfw.gz"}, false}; + std::vector matched; + if (this->has_context()) { + matched = co_await this->context().spawn(scanner_, scan_input); + } else { + matched = co_await scanner_.process(scan_input); + } + result.all_files.reserve(matched.size()); + result.all_file_sizes.reserve(matched.size()); + for (const auto& entry : matched) { + result.all_files.push_back(entry.path.string()); + result.all_file_sizes.push_back(entry.size); + } + } else { + result.all_files = input.files; + result.all_file_sizes.assign(input.files.size(), 0); + for (std::size_t i = 0; i < input.files.size(); ++i) { + std::error_code ec; + auto sz = fs::file_size(input.files[i], ec); + if (!ec) result.all_file_sizes[i] = static_cast(sz); + } + } + + if (result.all_files.empty()) { + co_return result; + } + + result.index_path = 
internal::determine_index_path(result.all_files.front(), + input.index_dir); + + // Group files by index path and prepare for resolution + std::unordered_map> groups; + for (std::size_t i = 0; i < result.all_files.size(); ++i) { + const auto& file_path = result.all_files[i]; + auto idx_path = + internal::determine_index_path(file_path, input.index_dir); + auto logical = indexer::internal::get_logical_path(file_path); + groups[idx_path].push_back( + PendingFile{i, file_path, std::move(logical)}); + } + + std::vector outputs; + outputs.reserve(groups.size()); + + if (this->has_context() && groups.size() > 1) { + std::vector> futures; + futures.reserve(groups.size()); + + for (auto& [idx_path, files] : groups) { + ResolveGroupInput group_input; + group_input.index_path = idx_path; + group_input.files = std::move(files); + group_input.require_checkpoints = input.require_checkpoints; + group_input.require_bloom = input.require_bloom; + group_input.require_manifest = input.require_manifest; + group_input.require_aggregation = input.require_aggregation; + group_input.aggregation_config = input.aggregation_config; + + futures.push_back(this->context().spawn( + [gi = std::move(group_input)]( + CoroScope&) mutable -> coro::CoroTask { + co_return resolve_group_sync(std::move(gi)); + })); + } + + for (auto& f : futures) { + outputs.push_back(co_await f); + } + } else { + for (auto& [idx_path, files] : groups) { + ResolveGroupInput group_input; + group_input.index_path = idx_path; + group_input.files = std::move(files); + group_input.require_checkpoints = input.require_checkpoints; + group_input.require_bloom = input.require_bloom; + group_input.require_manifest = input.require_manifest; + group_input.require_aggregation = input.require_aggregation; + group_input.aggregation_config = input.aggregation_config; + + outputs.push_back(resolve_group_sync(std::move(group_input))); + } + } + + // Merge results + for (auto& out : outputs) { + for (auto& item : out.needs_checkpoint) { + 
result.needs_checkpoint.push_back(std::move(item)); + } + for (auto& item : out.needs_bloom) { + result.needs_bloom.push_back(std::move(item)); + } + for (auto& item : out.needs_manifest) { + result.needs_manifest.push_back(std::move(item)); + } + for (auto& item : out.needs_aggregation) { + result.needs_aggregation.push_back(std::move(item)); + } + for (auto& item : out.cached) { + result.cached.push_back(std::move(item)); + } + // Merge augmentation info (all groups should have same global config) + if (out.needs_augmentation) { + result.needs_augmentation = true; + result.stored_time_interval_us = out.stored_time_interval_us; + } + } + + DFTRACER_UTILS_LOG_INFO( + "Resolver: %zu total, %zu cached, %zu need checkpoint, %zu need bloom, " + "%zu need manifest, %zu need aggregation", + result.all_files.size(), result.cached.size(), + result.needs_checkpoint.size(), result.needs_bloom.size(), + result.needs_manifest.size(), result.needs_aggregation.size()); + + co_return result; +} + +} // namespace dftracer::utils::utilities::composites::dft::indexing diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.cpp new file mode 100644 index 00000000..fb8e65a3 --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.cpp @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace dftracer::utils::utilities::composites::dft::indexing { + +using aggregators::AGG_GLOBAL_CONFIG_KEY; +using aggregators::AggGlobalConfig; +using aggregators::AggregationVisitor; +using aggregators::EventAggregator; +using aggregators::serialize_agg_global_config; +using indexer::internal::get_logical_path; + +coro::CoroTask resolve_and_build_index( + CoroScope* scope, ResolveAndBuildInput input) { + // Determine parallelism + std::size_t parallelism = 
input.parallelism; + if (parallelism == 0) { + parallelism = dftracer_utils_hardware_concurrency(); + } + + // Initial resolve + IndexResolverUtility resolver; + ResolverInput resolve_input; + resolve_input.directory = std::move(input.directory); + resolve_input.files = std::move(input.files); + resolve_input.index_dir = input.index_dir; + resolve_input.require_checkpoints = input.require_checkpoints; + resolve_input.require_bloom = input.require_bloom; + resolve_input.require_manifest = input.require_manifest; + resolve_input.require_aggregation = input.require_aggregation; + resolve_input.aggregation_config = input.aggregation_config; + + auto result = co_await resolver.process(resolve_input); + + if (result.all_files.empty()) { + co_return result; + } + + // Collect files that need work (checkpoint or aggregation) + // When force_rebuild is set, process all files + std::vector files_needing_work; + if (input.force_rebuild) { + // all_files is already a vector of strings + files_needing_work = result.all_files; + } else { + std::set files_needing_work_set; + for (const auto& item : result.needs_checkpoint) { + files_needing_work_set.insert(item.file_path); + } + for (const auto& item : result.needs_aggregation) { + files_needing_work_set.insert(item.file_path); + } + files_needing_work.assign(files_needing_work_set.begin(), + files_needing_work_set.end()); + } + + if (!files_needing_work.empty()) { + DFTRACER_UTILS_LOG_INFO( + "Building index for %zu files (checkpoint: %zu, aggregation: %zu)", + files_needing_work.size(), result.needs_checkpoint.size(), + result.needs_aggregation.size()); + + // Set up aggregation components if needed + std::shared_ptr agg_db; + std::unique_ptr merger; + std::shared_ptr agg_config_ptr; + + if (input.require_aggregation && input.aggregation_config) { + agg_db = + EventAggregator::open_with_merge_operator(result.index_path); + merger = std::make_unique(agg_db, 0); + agg_config_ptr = std::make_shared( + *input.aggregation_config); + 
} + + auto batch_config = std::make_shared(); + batch_config->file_paths = std::move(files_needing_work); + batch_config->index_dir = input.index_dir; + batch_config->checkpoint_size = input.checkpoint_size; + batch_config->parallelism = parallelism; + batch_config->force_rebuild = input.force_rebuild; + batch_config->build_manifest = input.require_manifest; + batch_config->use_batch_write = true; + batch_config->rebuild_root_summaries = true; + + // Attach AggregationVisitor if aggregation is required + if (agg_db && agg_config_ptr) { + batch_config->dft_visitor_factory = + [agg_db, agg_config_ptr](const std::string& file_path) + -> std::vector< + std::unique_ptr> { + std::vector> + visitors; + visitors.push_back(std::make_unique( + agg_db, 0, *agg_config_ptr, file_path)); + return visitors; + }; + } + + auto batch_result = co_await indexer::IndexBatchBuilderUtility::process( + scope, std::move(batch_config)); + + // Drain visitors and merge aggregation results + std::vector processed_files; + if (merger) { + for (auto& file_visitors : batch_result.extra_visitors) { + for (auto& visitor : file_visitors) { + auto* agg_visitor = + dynamic_cast(visitor.get()); + if (agg_visitor) { + for (const auto& k : agg_visitor->observed_extra_keys()) + merger->add_observed_extra_key(k); + for (const auto& m : + agg_visitor->observed_custom_metrics()) + merger->add_observed_custom_metric(m); + auto output = agg_visitor->take_output(); + processed_files.push_back(output.file_path); + merger->merge_chunk(std::move(output)); + } + } + file_visitors.clear(); + } + + // Write global config and per-file markers + if (!processed_files.empty()) { + namespace rcf = dftracer::utils::rocksdb::cf; + indexer::IndexDatabase idx_db( + result.index_path, dftracer::utils::rocksdb::RocksDatabase:: + OpenMode::ReadOnly); + + auto batch = agg_db->begin_batch(); + + // Write global config (0xFFFE key) + AggGlobalConfig global_cfg; + global_cfg.time_interval_us = + 
input.aggregation_config->time_interval_us; + global_cfg.config_hash = 0; + agg_db->put(batch, rcf::AGGREGATION, + std::string_view(AGG_GLOBAL_CONFIG_KEY, 2), + serialize_agg_global_config(global_cfg)); + + // Write per-file markers + for (const auto& file_path : processed_files) { + int file_id = + idx_db.get_file_info_id(get_logical_path(file_path)); + if (file_id >= 0) { + char marker_key[6]; + marker_key[0] = '\xFF'; + marker_key[1] = '\xFF'; + auto fid_u32 = static_cast(file_id); + std::uint32_t fid_be = __builtin_bswap32(fid_u32); + std::memcpy(&marker_key[2], &fid_be, 4); + agg_db->put(batch, rcf::AGGREGATION, + std::string_view(marker_key, 6), + std::string_view()); + } + } + + agg_db->commit_batch(batch); + + // Compact aggregation CFs so all Merge entries become Puts. + // This allows concurrent ReadOnly access without merge + // operators. + agg_db->compact(rcf::AGGREGATION); + agg_db->compact(rcf::SYSTEM_METRICS); + } + } + + // Re-resolve newly built files to get file_ids + ResolverInput refresh_input; + refresh_input.files.reserve(result.needs_checkpoint.size()); + for (const auto& item : result.needs_checkpoint) { + refresh_input.files.push_back(item.file_path); + } + refresh_input.index_dir = input.index_dir; + refresh_input.require_checkpoints = true; + + if (!refresh_input.files.empty()) { + auto refresh_result = co_await resolver.process(refresh_input); + + // Merge newly indexed into cached + for (auto& entry : refresh_result.cached) { + result.cached.push_back(std::move(entry)); + } + + // Update needs_checkpoint with any that still failed + result.needs_checkpoint = + std::move(refresh_result.needs_checkpoint); + } + + // Clear needs_aggregation since we processed them + result.needs_aggregation.clear(); + } + + DFTRACER_UTILS_LOG_INFO( + "Resolve complete: %zu total, %zu cached, %zu failed checkpoint", + result.all_files.size(), result.cached.size(), + result.needs_checkpoint.size()); + + co_return result; +} + +} // namespace 
dftracer::utils::utilities::composites::dft::indexing diff --git a/src/dftracer/utils/utilities/composites/dft/internal/utils.cpp b/src/dftracer/utils/utilities/composites/dft/internal/utils.cpp index 150a61d8..c5865387 100644 --- a/src/dftracer/utils/utilities/composites/dft/internal/utils.cpp +++ b/src/dftracer/utils/utilities/composites/dft/internal/utils.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -8,12 +9,14 @@ namespace dftracer::utils::utilities::composites::dft::internal { -std::string determine_index_path(const std::string& file_path, +std::string determine_index_path(const std::string& path, const std::string& index_dir) { - fs::path data_path(file_path); - fs::path root = - index_dir.empty() ? data_path.parent_path() : fs::path(index_dir); - return (root / ".dftindex").string(); + fs::path data_path(path); + fs::path root = index_dir.empty() ? (fs::is_directory(data_path) + ? data_path + : data_path.parent_path()) + : fs::path(index_dir); + return indexer::internal::normalize_index_root(root.string()); } std::string determine_provenance_index_path(const std::string& data_path, diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.cpp new file mode 100644 index 00000000..6a2a44f1 --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.cpp @@ -0,0 +1,852 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +// #include // re-enable with the DFT_MOCK_PADDED_STRIPE_BYTES block +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::reorganize { + +namespace { + +constexpr std::size_t DEFAULT_FLUSH_BYTES = 32 * 1024 * 1024; +constexpr std::size_t BUFFER_HEADROOM_BYTES = 
1 * 1024 * 1024; + +coro::CoroTask compress_to_gzip_member(int level, ByteView data, + std::vector& out) { + out.clear(); + compression::zlib::ManualStreamingCompressorUtility comp( + level, compression::zlib::CompressionFormat::GZIP); + if (data.size() > 0) { + auto gen = comp.compress(data); + while (auto view = co_await gen.next()) { + const auto* p = + reinterpret_cast(view->data()); + out.insert(out.end(), p, p + view->size()); + } + } + auto fin = comp.finalize_stream(); + while (auto view = co_await fin.next()) { + const auto* p = reinterpret_cast(view->data()); + out.insert(out.end(), p, p + view->size()); + } + co_return; +} + +std::string make_chunk_path(const std::string& dir, int index, bool compress) { + return dir + "/chunk_chunk" + std::to_string(index) + ".pfw" + + (compress ? ".gz" : ""); +} + +struct WorkerBuf { + std::vector payload; + std::size_t lines_in_flush = 0; + std::vector flush_line_counts; +}; + +struct PendingSegment { + int source_file_idx; + int checkpoint_idx; + std::size_t worker_idx; + std::size_t flush_idx; + std::size_t offset_in_flush; + std::size_t count; +}; + +struct FlushTask { + std::vector payload; + std::uint64_t uc_offset = 0; + std::uint64_t line_count = 0; + std::uint64_t first_line_num = 0; + std::size_t dispatch_idx = 0; +}; + +struct IndexBatch { + std::shared_ptr payload; + std::size_t dispatch_idx = 0; + std::size_t worker_idx = 0; + std::uint64_t c_offset = 0; + std::uint64_t c_size = 0; + std::uint64_t uc_offset = 0; + std::uint64_t uc_size = 0; + std::uint64_t line_count = 0; + std::uint64_t first_line_num = 0; +}; + +constexpr std::size_t FLUSH_CHANNEL_CAPACITY = 3; +constexpr std::size_t INDEX_CHANNEL_CAPACITY = 8; + +struct InlineIndexState { + std::unique_ptr bloom; + std::unique_ptr hash_table; + std::unique_ptr manifest; + std::unique_ptr aggregation; + std::unique_ptr dispatcher; + std::shared_ptr> index_channel; + std::vector finalized_batches; + int file_id = -1; + std::unique_ptr sink; + 
std::uint64_t slice_uc_bytes = 0; + std::uint64_t sink_uc_bytes = 0; +}; + +constexpr std::uint64_t SLICE_UC_THRESHOLD = 64ULL * 1024 * 1024; +constexpr std::uint64_t SINK_UC_THRESHOLD = 1ULL * 1024 * 1024 * 1024; + +struct ChunkState { + std::unique_ptr writer; + std::string output_path; + fileio::parallel::LayoutInfo layout_info; + std::size_t num_workers = 1; + bool compress = false; + int compression_level = Z_DEFAULT_COMPRESSION; + std::size_t flush_threshold = 0; + std::size_t buffer_capacity = 0; + std::size_t bytes_uncompressed = 0; + std::size_t events_written = 0; + bool has_any_events = false; + std::vector workers; + std::vector segments; + std::vector compressed_scratch; + std::vector>> flush_channels; + + bool inline_index_enabled = false; + std::uint64_t inline_uc_dispatched = 0; + std::uint64_t inline_lines_dispatched = 0; + std::size_t inline_dispatch_counter = 0; + InlineIndexState inline_index; +}; + +coro::CoroTask run_flusher( + std::size_t worker_idx, std::shared_ptr> channel, + ChunkState* st) { + std::vector scratch; + while (auto task_opt = co_await channel->receive()) { + FlushTask& task = *task_opt; + if (task.payload.empty()) continue; + ByteView view(task.payload.data(), task.payload.size()); + int rc; + if (st->compress) { + scratch.clear(); + co_await compress_to_gzip_member(st->compression_level, view, + scratch); + rc = co_await st->writer->write_chunk( + worker_idx, + ByteView(reinterpret_cast(scratch.data()), + scratch.size())); + } else { + rc = co_await st->writer->write_chunk(worker_idx, view); + } + if (rc != 0) co_return false; + + if (st->inline_index_enabled) { + auto member = st->writer->last_member(worker_idx); + if (!member) co_return false; + IndexBatch batch; + batch.payload = std::make_shared(task.payload.data(), + task.payload.size()); + batch.dispatch_idx = task.dispatch_idx; + batch.worker_idx = worker_idx; + batch.c_offset = member->offset; + batch.c_size = member->length; + batch.uc_offset = task.uc_offset; + 
batch.uc_size = task.payload.size(); + batch.line_count = task.line_count; + batch.first_line_num = task.first_line_num; + if (!co_await st->inline_index.index_channel->send( + std::move(batch))) { + co_return false; + } + } + } + co_return true; +} + +void rotate_inline_sink(InlineIndexState& idx, + const GroupWriterConfig& config) { + auto a = idx.sink->commit(); + if (!a.empty()) config.artifacts_queue->enqueue(std::move(a)); + const auto next_idx = + config.batch_counter->fetch_add(1, std::memory_order_relaxed); + idx.sink = std::make_unique( + config.staging_root, "inline_" + std::to_string(next_idx)); + idx.sink_uc_bytes = 0; +} + +void flush_slice_visitors(InlineIndexState& idx, + const GroupWriterConfig& config) { + if (!idx.sink || idx.file_id < 0) return; + if (idx.bloom) { + idx.bloom->flush_per_checkpoint_to_sink(*idx.sink, idx.file_id); + } + if (idx.manifest) { + idx.manifest->flush_per_checkpoint_to_sink(*idx.sink, idx.file_id); + } + idx.sink_uc_bytes += idx.slice_uc_bytes; + idx.slice_uc_bytes = 0; + if (idx.sink_uc_bytes >= SINK_UC_THRESHOLD) { + rotate_inline_sink(idx, config); + } +} + +coro::CoroTask run_index_feeder(ChunkState* st, + const GroupWriterConfig* config) { + auto& idx = st->inline_index; + while (auto batch_opt = co_await idx.index_channel->receive()) { + IndexBatch batch = std::move(*batch_opt); + if (batch.payload && !batch.payload->empty()) { + co_await idx.dispatcher->on_chunk(batch.payload->data(), + batch.payload->size(), + batch.dispatch_idx); + co_await idx.dispatcher->on_checkpoint(batch.dispatch_idx); + } + idx.slice_uc_bytes += batch.uc_size; + batch.payload.reset(); + idx.finalized_batches.push_back(std::move(batch)); + if (idx.slice_uc_bytes >= SLICE_UC_THRESHOLD) { + co_await idx.dispatcher->flush(); + flush_slice_visitors(idx, *config); + } + } + co_return true; +} + +coro::CoroTask dispatch_flush(ChunkState& st, std::size_t w) { + auto& ww = st.workers[w]; + if (ww.payload.empty()) co_return true; + 
ww.flush_line_counts.push_back(ww.lines_in_flush); + FlushTask task; + task.payload = std::move(ww.payload); + task.line_count = ww.lines_in_flush; + task.uc_offset = st.inline_uc_dispatched; + task.first_line_num = st.inline_lines_dispatched; + task.dispatch_idx = st.inline_dispatch_counter++; + st.inline_uc_dispatched += task.payload.size(); + st.inline_lines_dispatched += task.line_count; + ww.payload = std::vector(); + ww.payload.reserve(st.buffer_capacity); + ww.lines_in_flush = 0; + bool ok = co_await st.flush_channels[w]->send(std::move(task)); + co_return ok; +} + +coro::CoroTask write_section(ChunkState& st, ByteView data, + bool is_footer) { + ByteView payload = data; + if (st.compress) { + co_await compress_to_gzip_member(st.compression_level, data, + st.compressed_scratch); + payload = ByteView( + reinterpret_cast(st.compressed_scratch.data()), + st.compressed_scratch.size()); + } + int rc = is_footer ? co_await st.writer->write_footer(payload) + : co_await st.writer->write_header(payload); + co_return rc == 0; +} + +coro::CoroTask open_chunk(ChunkState& st, const std::string& path, + bool compress, int compression_level, + std::size_t chunk_size_bytes, + std::size_t baseline_workers, CoroScope* scope, + const GroupWriterConfig& config) { + st.output_path = path; + st.compress = compress; + st.compression_level = compression_level; + st.bytes_uncompressed = 0; + st.events_written = 0; + st.has_any_events = false; + st.segments.clear(); + + st.layout_info = fileio::parallel::detect_layout(path); + + // Local validation aid: force padded-striped layout when no Lustre is + // available. Uncomment to exercise the padded-striped writer path. 
+ // if (const char* mock = std::getenv("DFT_MOCK_PADDED_STRIPE_BYTES")) { + // const auto sz = + // static_cast(std::strtoull(mock, nullptr, 10)); + // if (sz >= fileio::parallel::MIN_PADDED_STRIPE_BYTES) { + // st.layout_info.layout = fileio::parallel::FileLayout::STRIPED; + // st.layout_info.stripe_size = sz; + // } + // } + + if (st.layout_info.layout == fileio::parallel::FileLayout::STRIPED && + st.layout_info.stripe_size == 0) { + st.layout_info.layout = fileio::parallel::FileLayout::SHARDED; + } + + const bool uses_padded = + st.layout_info.layout == fileio::parallel::FileLayout::STRIPED && + compress && + st.layout_info.stripe_size >= fileio::parallel::MIN_PADDED_STRIPE_BYTES; + const bool uses_sharded = + st.layout_info.layout == fileio::parallel::FileLayout::SHARDED; + // Plain striped writes at an atomic offset so cross-worker order is + // non-deterministic; keep one worker so we can resolve absolute line + // numbers. Padded-striped and sharded layouts expose deterministic + // worker slotting so we can fan out. + const std::size_t effective_baseline = + (uses_padded || uses_sharded) + ? 
std::max(baseline_workers, 1) + : 1; + const auto sizing = fileio::parallel::compute_writer_sizing( + st.layout_info, effective_baseline, DEFAULT_FLUSH_BYTES, + BUFFER_HEADROOM_BYTES, uses_padded); + st.num_workers = sizing.num_workers; + st.flush_threshold = sizing.flush_threshold; + st.buffer_capacity = sizing.buffer_capacity; + if (chunk_size_bytes > 0 && st.flush_threshold > chunk_size_bytes) { + st.flush_threshold = chunk_size_bytes; + } + + st.workers.clear(); + st.workers.resize(st.num_workers); + for (auto& w : st.workers) { + w.payload.reserve(st.buffer_capacity); + } + + fileio::parallel::WriterConfig wcfg; + wcfg.layout = st.layout_info.layout; + wcfg.stripe_size = st.layout_info.stripe_size; + wcfg.gzip = compress; + st.writer = fileio::parallel::make_writer(wcfg); + + if (co_await st.writer->open(path, st.num_workers, compress, scope) != 0) { + co_return false; + } + + st.inline_index_enabled = + !config.index_dir.empty() && config.artifacts_queue && + config.batch_counter && + (st.layout_info.layout == fileio::parallel::FileLayout::STRIPED || + st.layout_info.layout == fileio::parallel::FileLayout::SHARDED); + st.inline_uc_dispatched = 0; + st.inline_lines_dispatched = 0; + st.inline_dispatch_counter = 0; + if (st.inline_index_enabled) { + st.inline_index = InlineIndexState{}; + st.inline_index.bloom = std::make_unique( + config.bloom_config, config.bloom_dimensions); + st.inline_index.hash_table = + std::make_unique(); + st.inline_index.manifest = + std::make_unique(); + if (config.with_aggregation) { + aggregators::AggregationConfig agg_cfg; + agg_cfg.time_interval_us = + static_cast(config.agg_time_interval_us); + agg_cfg.compute_statistics = true; + agg_cfg.track_process_parents = true; + agg_cfg.track_default_args = true; + const std::size_t batch_idx = + config.batch_counter->fetch_add(1, std::memory_order_relaxed); + st.inline_index.aggregation = + std::make_unique( + config.staging_root, "agg_" + std::to_string(batch_idx), + 
/*config_hash=*/0u, agg_cfg, path); + } + DftEventDispatcher::VisitorList visitors; + visitors.emplace_back(*st.inline_index.bloom); + visitors.emplace_back(*st.inline_index.hash_table); + visitors.emplace_back(*st.inline_index.manifest); + if (st.inline_index.aggregation) { + visitors.emplace_back(*st.inline_index.aggregation); + } + st.inline_index.dispatcher = std::make_unique( + std::move(visitors), /*force_serial=*/true); + st.inline_index.dispatcher->begin(0); + st.inline_index.index_channel = + coro::make_channel(INDEX_CHANNEL_CAPACITY); + } + + const char header[] = "[\n"; + if (!co_await write_section( + st, ByteView(reinterpret_cast(header), 2), false)) { + co_return false; + } + co_return true; +} + +void emit_segments(const ChunkState& st, int chunk_idx, + ProvenanceTracker& prov) { + std::vector> abs_base(st.num_workers); + for (std::size_t w = 0; w < st.num_workers; ++w) { + abs_base[w].assign(st.workers[w].flush_line_counts.size(), 0); + } + std::size_t cum = 0; + const bool striped_parallel = + st.layout_info.layout == fileio::parallel::FileLayout::STRIPED && + st.num_workers > 1; + if (striped_parallel) { + std::size_t max_flushes = 0; + for (std::size_t w = 0; w < st.num_workers; ++w) { + max_flushes = + std::max(max_flushes, st.workers[w].flush_line_counts.size()); + } + for (std::size_t k = 0; k < max_flushes; ++k) { + for (std::size_t w = 0; w < st.num_workers; ++w) { + if (k >= st.workers[w].flush_line_counts.size()) continue; + abs_base[w][k] = cum; + cum += st.workers[w].flush_line_counts[k]; + } + } + } else { + for (std::size_t w = 0; w < st.num_workers; ++w) { + for (std::size_t k = 0; k < st.workers[w].flush_line_counts.size(); + ++k) { + abs_base[w][k] = cum; + cum += st.workers[w].flush_line_counts[k]; + } + } + } + for (const auto& seg : st.segments) { + std::size_t abs_start = + abs_base[seg.worker_idx][seg.flush_idx] + seg.offset_in_flush; + std::size_t abs_end = abs_start + seg.count - 1; + prov.record(seg.source_file_idx, 
seg.checkpoint_idx, chunk_idx, + static_cast(abs_start), static_cast(abs_end), + static_cast(seg.count)); + } +} + +coro::CoroTask dispatch_flush_all(ChunkState& st) { + for (std::size_t w = 0; w < st.num_workers; ++w) { + if (!co_await dispatch_flush(st, w)) co_return false; + } + co_return true; +} + +// Append a single line (plus trailing '\n') to worker w's payload. Does NOT +// flush; caller decides when to flush. +void append_line(ChunkState& st, std::size_t w, ByteView line) { + auto& ww = st.workers[w]; + const char* p = line.as(); + ww.payload.insert(ww.payload.end(), p, p + line.size()); + ww.payload.push_back('\n'); + ww.lines_in_flush += 1; + st.bytes_uncompressed += line.size() + 1; + st.events_written += 1; + st.has_any_events = true; +} + +} // namespace + +coro::CoroTask run_group_writer(CoroScope* scope, + GroupWriterConfig config) { + auto result = std::make_unique(); + result->group_name = config.group_name; + + try { + std::string group_output_dir = + config.output_dir + "/" + config.group_name; + if (!fs::exists(group_output_dir)) { + fs::create_directories(group_output_dir); + } + + auto provenance = std::make_unique(); + + int current_chunk_idx = 0; + std::vector chunks_info; + std::unique_ptr coord_db; + bool any_chunk_inline_indexed = false; + const bool inline_index_active = !config.index_dir.empty() && + config.artifacts_queue && + config.batch_counter; + if (inline_index_active) { + coord_db = + std::make_unique(config.index_dir); + coord_db->init_schema(); + } + + const std::size_t baseline_workers = + (scope && scope->get_executor()) + ? scope->get_executor()->get_num_threads() + : 1; + + auto open_inline_sink = [&](ChunkState& cs, const std::string& path) { + if (!cs.inline_index_enabled) return; + // Sharded writers don't materialize the merged path until + // finalize_chunk runs `merge_shards`. Touch it so register_files + // can stat/hash it now. 
+ if (!fs::exists(path)) { + std::ofstream(path).close(); + } + std::vector ids = + coord_db->register_files({path}, /*build_manifest=*/true); + cs.inline_index.file_id = ids.empty() ? -1 : ids.front(); + const auto idx = + config.batch_counter->fetch_add(1, std::memory_order_relaxed); + cs.inline_index.sink = + std::make_unique( + config.staging_root, "inline_" + std::to_string(idx)); + cs.inline_index.slice_uc_bytes = 0; + cs.inline_index.sink_uc_bytes = 0; + }; + + constexpr std::size_t MAX_IN_FLIGHT_CHUNKS = 4; + auto sync_mutex = std::make_shared(); + auto inline_indexed_flag = std::make_shared>(false); + auto sem = coro::make_channel(MAX_IN_FLIGHT_CHUNKS); + + co_await scope->scope([&](CoroScope& group_scope) + -> coro::CoroTask { + for (std::size_t i = 0; i < MAX_IN_FLIGHT_CHUNKS; ++i) { + co_await sem->send(0); + } + + auto cs = std::make_shared(); + { + const auto path = make_chunk_path( + group_output_dir, current_chunk_idx, config.compress); + if (!co_await open_chunk( + *cs, path, config.compress, config.compression_level, + config.chunk_size_bytes, baseline_workers, &group_scope, + config)) { + throw std::runtime_error("Failed to open initial chunk"); + } + open_inline_sink(*cs, path); + } + + bool input_eof = false; + while (!input_eof) { + cs->flush_channels.clear(); + cs->flush_channels.reserve(cs->num_workers); + for (std::size_t i = 0; i < cs->num_workers; ++i) { + cs->flush_channels.push_back( + coro::make_channel(FLUSH_CHANNEL_CAPACITY)); + } + + co_await sem->receive(); + + const int captured_chunk_idx = current_chunk_idx; + const GroupWriterConfig* cfg_ptr = &config; + ProvenanceTracker* prov_ptr = provenance.get(); + std::vector* chunks_info_ptr = &chunks_info; + GroupWriterResult* result_raw = result.get(); + auto sync_mtx = sync_mutex; + auto indexed_flag = inline_indexed_flag; + auto sem_release = sem; + + group_scope.spawn([cs, captured_chunk_idx, cfg_ptr, prov_ptr, + chunks_info_ptr, result_raw, sync_mtx, + indexed_flag, + 
sem_release](CoroScope& orch_scope) + -> coro::CoroTask { + ChunkState* cs_p = cs.get(); + co_await orch_scope.scope([cs_p, + cfg_ptr](CoroScope& work_scope) + -> coro::CoroTask { + if (cs_p->inline_index_enabled) { + work_scope.spawn( + [cs_p, + cfg_ptr](CoroScope&) -> coro::CoroTask { + co_await run_index_feeder(cs_p, cfg_ptr); + }); + } + co_await work_scope.scope([cs_p](CoroScope& flush_scope) + -> coro::CoroTask { + for (std::size_t i = 0; i < cs_p->num_workers; + ++i) { + auto ch = cs_p->flush_channels[i]; + flush_scope.spawn( + [i, ch, + cs_p](CoroScope&) -> coro::CoroTask { + co_await run_flusher(i, ch, cs_p); + }); + } + co_return; + }); + if (cs_p->inline_index_enabled && + cs_p->inline_index.index_channel) { + cs_p->inline_index.index_channel->close(); + } + co_return; + }); + + if (cs_p->inline_index_enabled) { + co_await cs_p->inline_index.dispatcher->flush(); + } + + { + std::lock_guard lk(*sync_mtx); + emit_segments(*cs_p, captured_chunk_idx, *prov_ptr); + } + cs_p->segments.clear(); + + const char footer[] = "]\n"; + if (!co_await write_section( + *cs_p, + ByteView(reinterpret_cast(footer), 2), + true)) { + throw std::runtime_error("Failed to write footer"); + } + if (co_await cs_p->writer->close() != 0) { + throw std::runtime_error("Failed to close writer"); + } + if (cs_p->inline_index_enabled) { + auto bases = cs_p->writer->shard_base_offsets(); + if (!bases.empty()) { + for (auto& b : + cs_p->inline_index.finalized_batches) { + if (b.worker_idx < bases.size()) { + b.c_offset += bases[b.worker_idx]; + } + } + } + } + if (cs_p->layout_info.layout == + fileio::parallel::FileLayout::SHARDED) { + auto shards = cs_p->writer->output_paths(); + if (co_await fileio::parallel::merge_shards( + cs_p->output_path, shards) != 0) { + throw std::runtime_error("merge_shards failed"); + } + } + + { + std::lock_guard lk(*sync_mtx); + chunks_info_ptr->push_back(fileio::ChunkInfo{ + .path = cs_p->output_path, + .bytes_written = cs_p->bytes_uncompressed, + 
.events_written = cs_p->events_written, + .chunk_index = captured_chunk_idx, + }); + result_raw->output_files.push_back(cs_p->output_path); + auto span = cs_p->writer->member_layout(); + if (!span.empty()) { + ChunkMemberLayout layout; + layout.path = cs_p->output_path; + layout.members.assign(span.begin(), span.end()); + result_raw->chunk_layouts.push_back( + std::move(layout)); + } + } + + if (cs_p->inline_index_enabled && cs_p->inline_index.sink && + cs_p->inline_index.file_id >= 0) { + auto& idx = cs_p->inline_index; + + std::vector ordered; + ordered.reserve(idx.finalized_batches.size()); + for (auto& b : idx.finalized_batches) + ordered.push_back(&b); + std::sort(ordered.begin(), ordered.end(), + [](const IndexBatch* a, const IndexBatch* b) { + return a->c_offset < b->c_offset; + }); + std::uint64_t running = 0; + for (auto* b : ordered) { + b->first_line_num = running; + running += b->line_count; + } + for (const auto& b : idx.finalized_batches) { + indexer::internal::IndexerCheckpoint cp; + cp.checkpoint_idx = b.dispatch_idx; + cp.uc_offset = b.uc_offset; + cp.uc_size = b.uc_size; + cp.c_offset = b.c_offset; + cp.c_size = b.c_size; + cp.bits = 0; + cp.num_lines = b.line_count; + cp.first_line_num = b.first_line_num; + cp.last_line_num = + b.line_count > 0 + ? 
b.first_line_num + b.line_count - 1 + : b.first_line_num; + idx.sink->insert_checkpoint(idx.file_id, cp); + } + idx.sink->insert_file_metadata( + idx.file_id, /*checkpoint_size=*/0, + cs_p->inline_lines_dispatched, + cs_p->inline_uc_dispatched); + + if (idx.bloom) { + idx.bloom->finalize_file_to_sink(*idx.sink, + idx.file_id); + } + if (idx.manifest) { + idx.manifest->finalize_file_to_sink(*idx.sink, + idx.file_id); + } + if (idx.hash_table) { + idx.hash_table->finalize(*idx.sink, idx.file_id); + } + if (idx.aggregation) { + co_await idx.aggregation->on_file_complete(); + + aggregators::AggGlobalConfig agg_global; + agg_global.time_interval_us = + static_cast( + cfg_ptr->agg_time_interval_us); + agg_global.config_hash = 0; + idx.sink->insert_aggregation_put( + std::string_view( + aggregators::AGG_GLOBAL_CONFIG_KEY, 2), + aggregators::serialize_agg_global_config( + agg_global)); + idx.sink->insert_aggregation_put( + aggregators::make_agg_file_key(idx.file_id), + ""); + + for (auto& a : + idx.aggregation->aggregation_artifacts()) { + if (!a.empty()) { + cfg_ptr->artifacts_queue->enqueue( + std::move(a)); + } + } + } + auto a = idx.sink->commit(); + if (!a.empty()) { + cfg_ptr->artifacts_queue->enqueue(std::move(a)); + } + indexed_flag->store(true, std::memory_order_release); + } + + co_await sem_release->send(0); + }); + + bool rotated = false; + ChunkState* cs_ptr = cs.get(); + std::size_t batch_counter = 0; + + while (auto batch_opt = + co_await config.input_channel->receive()) { + LineBatch* batch_ptr = batch_opt->get(); + const std::size_t line_count = batch_ptr->lines.size(); + if (line_count == 0) continue; + + const std::size_t worker = + batch_counter++ % + std::max(cs_ptr->num_workers, 1); + const int src_file = + static_cast(batch_ptr->lines[0].source_file_idx); + const int ckpt = + static_cast(batch_ptr->lines[0].checkpoint_idx); + + std::size_t line_idx = 0; + while (line_idx < line_count) { + auto& ww = cs_ptr->workers[worker]; + PendingSegment seg{ + 
src_file, + ckpt, + worker, + ww.flush_line_counts.size(), + ww.lines_in_flush, + 0, + }; + bool inner_rotated = false; + + while (line_idx < line_count) { + auto view = batch_ptr->line_view(line_idx); + ByteView line(view.data(), view.size()); + append_line(*cs_ptr, worker, line); + seg.count++; + line_idx++; + if (ww.payload.size() >= cs_ptr->flush_threshold) { + if (!co_await dispatch_flush(*cs_ptr, worker)) { + throw std::runtime_error( + "Failed to dispatch flush"); + } + break; + } + if (config.chunk_size_bytes > 0 && + cs_ptr->bytes_uncompressed >= + config.chunk_size_bytes) { + inner_rotated = true; + break; + } + } + + if (seg.count > 0) cs_ptr->segments.push_back(seg); + if (inner_rotated) { + rotated = true; + break; + } + } + + result->events_written += line_count; + if (rotated) break; + } + + if (!co_await dispatch_flush_all(*cs_ptr)) { + throw std::runtime_error( + "Failed to dispatch trailing flush"); + } + for (auto& ch : cs_ptr->flush_channels) ch->close(); + + if (!rotated) { + input_eof = true; + break; + } + + current_chunk_idx++; + cs = std::make_shared(); + const auto next_path = make_chunk_path( + group_output_dir, current_chunk_idx, config.compress); + if (!co_await open_chunk( + *cs, next_path, config.compress, + config.compression_level, config.chunk_size_bytes, + baseline_workers, &group_scope, config)) { + throw std::runtime_error("Failed to open next chunk"); + } + open_inline_sink(*cs, next_path); + } + co_return; + }); + + if (inline_indexed_flag->load(std::memory_order_acquire)) { + any_chunk_inline_indexed = true; + } + + result->bytes_written = 0; + for (const auto& c : chunks_info) { + result->bytes_written += c.bytes_written; + } + result->chunks_created = chunks_info.size(); + + if (config.source_files) { + ExtractionPlan plan; + plan.source_files = *config.source_files; + co_await provenance->flush_to_db(plan, config.group_name, + config.group_query, chunks_info, + config.output_dir); + } + + if (any_chunk_inline_indexed) 
result->indexed_inline = true; + + result->success = true; + + } catch (const std::exception& e) { + result->error_message = e.what(); + DFTRACER_UTILS_LOG_ERROR("GroupWriter failed for %s: %s", + config.group_name.c_str(), e.what()); + } + + co_return std::move(*result); +} + +} // namespace dftracer::utils::utilities::composites::dft::reorganize diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.cpp new file mode 100644 index 00000000..2cd8a036 --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.cpp @@ -0,0 +1,176 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace dftracer::utils::utilities::composites::dft::reorganize { + +namespace { + +struct LineGroupMapping { + std::unordered_map line_to_group; +}; + +bool query_matches_cat_name(const common::query::QueryNode& root, + const std::string& cat, const std::string& name) { + if (std::holds_alternative(root.data)) { + const auto& comp = std::get(root.data); + if (comp.op != common::query::CompareOp::EQ) return false; + + if (comp.field.path == "cat") { + if (std::holds_alternative(comp.value.value)) { + return std::get(comp.value.value) == cat; + } + } else if (comp.field.path == "name") { + if (std::holds_alternative(comp.value.value)) { + return std::get(comp.value.value) == name; + } + } + } + return false; +} + +LineGroupMapping build_line_group_mapping( + const std::vector& event_ranges, + const std::vector& groups, + const std::vector>& parsed_queries) { + LineGroupMapping mapping; + + for (const auto& range : event_ranges) { + std::size_t target_group = SIZE_MAX; + + for (std::size_t g = 0; g < groups.size(); ++g) { + const auto& query_opt = parsed_queries[g]; + if (!query_opt) { + target_group = g; + break; + } + + if (query_matches_cat_name(query_opt->root(), range.cat, + range.name)) { + target_group = 
g; + break; + } + } + + if (target_group != SIZE_MAX) { + for (std::uint32_t line_num : range.line_numbers) { + mapping.line_to_group[static_cast(line_num)] = + target_group; + } + } + } + + return mapping; +} + +} // namespace + +coro::CoroTask extract_from_manifest( + ManifestExtractorConfig config) { + ManifestExtractorResult result; + + try { + indexer::IndexDatabase db( + config.index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + + int file_id = db.get_file_info_id(config.file_path); + if (file_id < 0) { + result.error_message = + "File not found in index: " + config.file_path; + co_return result; + } + + if (!db.has_manifest_data(file_id)) { + result.error_message = + "No manifest data for file: " + config.file_path; + co_return result; + } + + auto event_ranges = db.query_event_ranges(file_id); + + std::vector> parsed_queries; + parsed_queries.reserve(config.groups.size()); + for (const auto& group : config.groups) { + if (group.query.empty()) { + parsed_queries.push_back(std::nullopt); + } else { + auto q = common::query::Query::from_string(group.query); + if (q) { + parsed_queries.push_back(std::move(*q)); + } else { + parsed_queries.push_back(std::nullopt); + } + } + } + + auto mapping = build_line_group_mapping(event_ranges, config.groups, + parsed_queries); + + std::vector pending_batches(config.groups.size()); + for (auto& batch : pending_batches) { + batch.reserve(config.batch_size); + } + + using fileio::lines::sources::async_streaming_gz_lines; + std::size_t line_num = 0; + auto gen = async_streaming_gz_lines(config.file_path); + + while (auto line_opt = co_await gen.next()) { + const auto& line = *line_opt; + + auto it = mapping.line_to_group.find(line_num); + if (it != mapping.line_to_group.end()) { + std::size_t group_idx = it->second; + auto& batch = pending_batches[group_idx]; + + batch.append_line(line.content, config.source_file_idx, + /*checkpoint_idx=*/0, line_num); + + result.events_extracted++; + + if 
(batch.size() >= config.batch_size) { + auto& channel = config.group_channels[group_idx]; + if (channel) { + co_await channel->send( + std::make_shared(std::move(batch))); + } + batch.clear(); + batch.reserve(config.batch_size); + } + } else { + result.events_unmatched++; + } + + line_num++; + } + + for (std::size_t i = 0; i < pending_batches.size(); ++i) { + auto& batch = pending_batches[i]; + if (!batch.empty()) { + auto& channel = config.group_channels[i]; + if (channel) { + co_await channel->send( + std::make_shared(std::move(batch))); + } + } + } + + result.success = true; + + } catch (const std::exception& e) { + result.error_message = e.what(); + DFTRACER_UTILS_LOG_ERROR("ManifestExtractor failed for %s: %s", + config.file_path.c_str(), e.what()); + } + + co_return result; +} + +} // namespace dftracer::utils::utilities::composites::dft::reorganize diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.cpp new file mode 100644 index 00000000..fe315112 --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.cpp @@ -0,0 +1,145 @@ +#include + +namespace dftracer::utils::utilities::composites::dft::reorganize { + +OrganizeVisitor::OrganizeVisitor(OrganizeVisitorConfig config) + : config_(std::move(config)) { + parsed_queries_.reserve(config_.groups.size()); + for (const auto& group : config_.groups) { + if (group.query.empty()) { + parsed_queries_.push_back(std::nullopt); + } else { + auto result = common::query::Query::from_string(group.query); + if (result) { + parsed_queries_.push_back(std::move(*result)); + } else { + parsed_queries_.push_back(std::nullopt); + } + } + } + + pending_batches_.resize(config_.groups.size()); + for (auto& batch : pending_batches_) { + batch.reserve(config_.batch_size); + } + drain_queue_.resize(config_.groups.size()); +} + +void OrganizeVisitor::begin(std::size_t /*num_checkpoints*/) { + 
for (auto& batch : pending_batches_) { + batch.clear(); + } + for (auto& q : drain_queue_) { + q.clear(); + } + events_routed_ = 0; + events_unmatched_ = 0; +} + +void OrganizeVisitor::on_checkpoint(std::size_t checkpoint_idx) { + current_checkpoint_ = checkpoint_idx; +} + +std::size_t OrganizeVisitor::evaluate_event( + const DFTracerEvent& /*ev*/, const common::json::JsonValue& json) { + for (std::size_t i = 0; i < parsed_queries_.size(); ++i) { + const auto& query_opt = parsed_queries_[i]; + if (!query_opt) { + return i; + } + if (query_opt->evaluate(json)) { + return i; + } + } + return SIZE_MAX; +} + +void OrganizeVisitor::on_event(const EventRecord& record) { + if (record.ev.is_metadata()) { + return; + } + + std::size_t group_idx = evaluate_event(record.ev, record.json); + if (group_idx == SIZE_MAX) { + events_unmatched_++; + return; + } + + auto& batch = pending_batches_[group_idx]; + batch.append_line(record.line, config_.source_file_idx, + record.checkpoint_idx, record.line_number); + + events_routed_++; +} + +bool OrganizeVisitor::wants_drain() const noexcept { + for (std::size_t i = 0; i < pending_batches_.size(); ++i) { + if (!drain_queue_[i].empty()) return true; + if (pending_batches_[i].size() >= config_.batch_size) return true; + } + return false; +} + +coro::CoroTask OrganizeVisitor::drain_pending() { + for (std::size_t i = 0; i < pending_batches_.size(); ++i) { + auto& channel = config_.group_channels[i]; + // Send queued slice batches first. + for (auto& shared_batch : drain_queue_[i]) { + if (channel) { + co_await channel->send(std::move(shared_batch)); + } + } + drain_queue_[i].clear(); + // Then drain the threshold-triggered current batch. 
+ auto& batch = pending_batches_[i]; + if (batch.size() < config_.batch_size) continue; + if (channel) { + co_await channel->send( + std::make_shared(std::move(batch))); + } + batch.clear(); + batch.reserve(config_.batch_size); + } +} + +coro::CoroTask OrganizeVisitor::on_file_complete() { + for (std::size_t i = 0; i < pending_batches_.size(); ++i) { + auto& channel = config_.group_channels[i]; + for (auto& shared_batch : drain_queue_[i]) { + if (channel) { + co_await channel->send(std::move(shared_batch)); + } + } + drain_queue_[i].clear(); + auto& batch = pending_batches_[i]; + if (batch.empty()) continue; + if (channel) { + co_await channel->send( + std::make_shared(std::move(batch))); + } + batch.clear(); + batch.reserve(config_.batch_size); + } +} + +std::unique_ptr OrganizeVisitor::create_parallel_slice() + const { + return std::make_unique(config_); +} + +void OrganizeVisitor::merge_parallel_slice(DftEventVisitor& slice_base) { + auto* slice = dynamic_cast(&slice_base); + if (!slice) return; + for (std::size_t i = 0; + i < drain_queue_.size() && i < slice->pending_batches_.size(); ++i) { + auto& src = slice->pending_batches_[i]; + if (src.empty()) continue; + drain_queue_[i].push_back(std::make_shared(std::move(src))); + src.clear(); + src.reserve(config_.batch_size); + } + events_routed_ += slice->events_routed_; + events_unmatched_ += slice->events_unmatched_; +} + +} // namespace dftracer::utils::utilities::composites::dft::reorganize diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.cpp index 31a97cce..e50257b6 100644 --- a/src/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.cpp +++ b/src/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.cpp @@ -1,10 +1,12 @@ #include #include -#include #include #include #include +#include +#include + namespace 
dftracer::utils::utilities::composites::dft::reorganize { void ProvenanceTracker::record(int source_file_idx, int checkpoint_idx, @@ -23,51 +25,39 @@ coro::CoroTask ProvenanceTracker::flush_to_db( using indexer::ProvenanceDatabase; for (const auto& chunk : chunks) { - auto provenance_path = std::make_shared( - indexer::determine_provenance_index_path(chunk.path)); - const auto* plan_ptr = &plan; - const auto* group_name_ptr = &group_name; - const auto* group_query_ptr = &group_query; - const auto* chunk_ptr = &chunk; - const auto* records_ptr = &records_; - try { - co_await rocksdb::run([plan_ptr, group_name_ptr, group_query_ptr, - chunk_ptr, records_ptr, provenance_path] { - ProvenanceDatabase pdb(*provenance_path); - pdb.init_schema(); - - std::uint64_t out_hash = 0; - if (fs::exists(chunk_ptr->path)) { - out_hash = static_cast( - fs::file_size(chunk_ptr->path)); - } - int fid = - pdb.get_or_create_file_info(chunk_ptr->path, out_hash); - - indexer::internal::TransactionScope txn(pdb); - pdb.insert_info(fid, "version", "2.0"); - pdb.insert_info(fid, "tool", "dftracer_organize"); - pdb.insert_group(fid, *group_name_ptr, *group_query_ptr); + std::string provenance_path = + indexer::determine_provenance_index_path(chunk.path); + ProvenanceDatabase pdb(provenance_path); + pdb.init_schema(); - for (std::size_t si = 0; si < plan_ptr->source_files.size(); - ++si) { - const auto& src = plan_ptr->source_files[si]; - pdb.insert_source(fid, static_cast(si), src.file_path, - static_cast(src.num_checkpoints)); - } + std::uint64_t out_hash = 0; + if (fs::exists(chunk.path)) { + out_hash = + static_cast(fs::file_size(chunk.path)); + } + int fid = pdb.get_or_create_file_info(chunk.path, out_hash); + pdb.insert_info(fid, "version", "2.0"); + pdb.insert_info(fid, "tool", "dftracer_organize"); + pdb.insert_group(fid, group_name, group_query); - for (const auto& rec : *records_ptr) { - if (rec.output_chunk_idx != chunk_ptr->chunk_index) - continue; - pdb.insert_segment(fid, 
rec.source_file_idx, - rec.checkpoint_idx, - rec.output_line_start, - rec.output_line_end, rec.event_count); - } + for (std::size_t si = 0; si < plan.source_files.size(); ++si) { + const auto& src = plan.source_files[si]; + pdb.insert_source(fid, static_cast(si), src.file_path, + static_cast(src.num_checkpoints)); + } - txn.commit(); - }); + std::unordered_map seq_counter; + for (const auto& rec : records_) { + if (rec.output_chunk_idx != chunk.chunk_index) continue; + std::uint64_t k = + (static_cast(rec.source_file_idx) << 32) | + static_cast(rec.checkpoint_idx); + int seq = seq_counter[k]++; + pdb.insert_segment(fid, rec.source_file_idx, rec.checkpoint_idx, + seq, rec.output_line_start, + rec.output_line_end, rec.event_count); + } } catch (const std::exception& e) { DFTRACER_UTILS_LOG_ERROR("Provenance write failed for %s: %s", chunk.path.c_str(), e.what()); diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.cpp new file mode 100644 index 00000000..42c83158 --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.cpp @@ -0,0 +1,410 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::reorganize { + +using fileio::ChunkWriter; +using fileio::ChunkWriterConfig; +using indexer::SharedLineBuffer; + +ReconstructorInput& ReconstructorInput::with_input_dir(std::string dir) { + input_dir = std::move(dir); + return *this; +} + +ReconstructorInput& ReconstructorInput::with_output_dir(std::string dir) { + output_dir = std::move(dir); + return *this; +} + +ReconstructorInput& ReconstructorInput::with_checkpoint_size(std::size_t sz) { + checkpoint_size = sz; + return *this; +} + +ReconstructorInput& 
ReconstructorInput::with_parallelism(std::size_t n) { + parallelism = n; + return *this; +} + +ReconstructorInput& ReconstructorInput::with_compress(bool c) { + compress = c; + return *this; +} + +namespace { + +struct SegmentInterval { + int line_start; + int line_end; + std::size_t original_idx; // Index into original files vector +}; + +const SegmentInterval* find_segment( + const std::vector& intervals, int line_number) { + auto it = std::upper_bound( + intervals.begin(), intervals.end(), line_number, + [](int ln, const SegmentInterval& seg) { return ln < seg.line_start; }); + if (it != intervals.begin()) { + --it; + if (line_number >= it->line_start && line_number <= it->line_end) { + return &(*it); + } + } + return nullptr; +} + +std::string output_filename(const std::string& original_path) { + auto p = fs::path(original_path).filename().string(); + if (p.size() > 3 && p.substr(p.size() - 3) == ".gz") { + p = p.substr(0, p.size() - 3); + } + return p; +} + +struct ReconstructLineRecord { + SharedLineBuffer buffer; + std::string_view line; +}; + +struct ReconstructLineBatch { + std::vector lines; + + void reserve(std::size_t n) { lines.reserve(n); } + std::size_t size() const { return lines.size(); } + bool empty() const { return lines.empty(); } + void clear() { lines.clear(); } +}; + +struct WriterContext { + std::string output_dir; + bool compress; + std::atomic* total_events; + std::atomic* total_bytes; + std::vector* file_results; + std::mutex* results_mutex; +}; + +struct ReaderContext { + std::size_t checkpoint_size; + const std::vector* original_paths; + const std::vector>>* + channels; +}; + +// Named coroutine for writer task (CP.51 - no capturing lambda coroutines) +static coro::CoroTask run_writer( + std::shared_ptr> channel, + std::string orig_path, WriterContext wctx) { + std::string fname = output_filename(orig_path); + std::string base = fname; + if (base.size() > 4 && base.substr(base.size() - 4) == ".pfw") { + base = base.substr(0, 
base.size() - 4); + } + + auto config = ChunkWriterConfig() + .with_output_dir(wctx.output_dir) + .with_base_name(base) + .with_chunk_size(std::numeric_limits::max()) + .with_compression(wctx.compress); + + ChunkWriter writer(config); + co_await writer.open(); + + while (auto batch_opt = co_await channel->receive()) { + auto& batch = *batch_opt; + for (const auto& record : batch.lines) { + co_await writer.write_line( + ByteView(record.line.data(), record.line.size())); + } + } + + co_await writer.close(); + + ReconstructedFileInfo info; + info.original_path = std::move(orig_path); + info.events_written = writer.total_events_written(); + info.bytes_written = writer.total_bytes_written(); + if (!writer.chunks().empty()) { + info.output_path = writer.chunks().front().path; + } + + wctx.total_events->fetch_add(info.events_written); + wctx.total_bytes->fetch_add(info.bytes_written); + + { + std::lock_guard lock(*wctx.results_mutex); + wctx.file_results->push_back(std::move(info)); + } +} + +// Named coroutine for reader/producer task (CP.51) +static coro::CoroTask run_reader( + CoroScope& scope, std::string reorg_file, + const std::vector* intervals, ReaderContext rctx) { + std::string index_path = internal::determine_index_path(reorg_file, ""); + + MetadataCollectorUtility meta_collector; + auto meta_input = MetadataCollectorUtilityInput::from_file(reorg_file) + .with_index(index_path) + .with_checkpoint_size(rctx.checkpoint_size); + auto meta = co_await meta_collector.process(meta_input); + + auto reader_input = IndexedReadInput::from_file(reorg_file) + .with_index(index_path) + .with_checkpoint_size(rctx.checkpoint_size); + IndexedFileReaderUtility reader_utility; + auto reader = co_await reader_utility.process(reader_input); + + auto stream = reader->stream( + reader::internal::StreamConfig() + .stream_type(reader::internal::StreamType::MULTI_LINES_BYTES) + .range_type(reader::internal::RangeType::BYTE_RANGE) + .buffer_size(4 * 1024 * 1024) + .from(0) + 
.to(meta.uncompressed_size)); + + constexpr std::size_t BATCH_SIZE = 1024; + std::unordered_map pending_batches; + + int event_number = 0; + + while (!stream->done()) { + std::span chunk = co_await stream->read_async(); + if (chunk.empty()) break; + + auto buffer = std::make_shared( + chunk.data(), static_cast(chunk.size())); + + const char* data = buffer->data(); + std::size_t bytes_read = buffer->size(); + std::size_t pos = 0; + + while (pos < bytes_read) { + const char* line_start = data + pos; + const char* newline = static_cast( + std::memchr(line_start, '\n', bytes_read - pos)); + if (!newline) break; + + std::size_t line_len = + static_cast(newline - line_start); + + if (line_len > 0 && line_start[0] == '{') { + const auto* seg = find_segment(*intervals, event_number); + if (seg) { + auto& batch = pending_batches[seg->original_idx]; + ReconstructLineRecord record; + record.buffer = buffer; + record.line = std::string_view(line_start, line_len); + batch.lines.push_back(std::move(record)); + + if (batch.size() >= BATCH_SIZE) { + auto& channel = (*rctx.channels)[seg->original_idx]; + co_await channel->send(std::move(batch)); + batch.clear(); + } + } + event_number++; + } + + pos = static_cast(newline - data) + 1; + } + } + + for (auto& [idx, batch] : pending_batches) { + if (!batch.empty()) { + auto& channel = (*rctx.channels)[idx]; + co_await channel->send(std::move(batch)); + } + } + + (void)scope; +} + +} // namespace + +coro::CoroTask ReconstructorUtility::process( + const ReconstructorInput& input) { + ReconstructorResult result; + + if (!has_context()) { + result.error_message = "No context bound"; + co_return result; + } + CoroScope& ctx = context(); + + std::vector reorg_files; + if (fs::exists(input.input_dir)) { + filesystem::PatternDirectoryScannerUtility scanner; + filesystem::PatternDirectoryScannerUtilityInput scan_input{ + input.input_dir, {".pfw", ".pfw.gz"}, true}; + auto matched = co_await scanner.process(scan_input); + for (const auto& entry 
: matched) { + reorg_files.push_back(entry.path.string()); + } + } + + if (reorg_files.empty()) { + result.error_message = "No reorganized files found"; + co_return result; + } + + ReconstructionPlannerUtility planner; + ReconstructionPlannerInput planner_input; + planner_input.reorganized_files = reorg_files; + planner_input.index_dir = ""; + + ReconstructionPlan plan; + try { + plan = co_await planner.process(planner_input); + } catch (const std::exception& e) { + result.error_message = std::string("Planning failed: ") + e.what(); + co_return result; + } + + if (plan.files.empty()) { + result.success = true; + co_return result; + } + + result.total_segments = plan.total_segments; + + fs::create_directories(input.output_dir); + + // Build original paths vector and index map + std::vector original_paths; + std::unordered_map path_to_idx; + for (const auto& [orig_path, recon] : plan.files) { + path_to_idx[orig_path] = original_paths.size(); + original_paths.push_back(orig_path); + } + + // Build segment intervals using indices instead of strings + std::unordered_map> + per_reorg_segments; + for (const auto& [orig_path, recon] : plan.files) { + std::size_t orig_idx = path_to_idx[orig_path]; + for (const auto& [ckpt, segs] : recon.checkpoint_segments) { + for (const auto& seg : segs) { + SegmentInterval si; + si.line_start = seg.output_line_start; + si.line_end = seg.output_line_end; + si.original_idx = orig_idx; + per_reorg_segments[seg.reorg_file].push_back(si); + } + } + } + + for (auto& [file, segs] : per_reorg_segments) { + std::sort(segs.begin(), segs.end(), + [](const SegmentInterval& a, const SegmentInterval& b) { + return a.line_start < b.line_start; + }); + } + + // Create channels indexed by original file index + std::vector>> channels; + channels.reserve(original_paths.size()); + for (std::size_t i = 0; i < original_paths.size(); ++i) { + channels.push_back( + std::make_shared>(16)); + } + + std::atomic total_events{0}; + std::atomic total_bytes{0}; + 
std::vector file_results; + std::mutex results_mutex; + + WriterContext wctx; + wctx.output_dir = input.output_dir; + wctx.compress = input.compress; + wctx.total_events = &total_events; + wctx.total_bytes = &total_bytes; + wctx.file_results = &file_results; + wctx.results_mutex = &results_mutex; + + // Spawn writers (consumers) + for (std::size_t i = 0; i < original_paths.size(); ++i) { + ctx.spawn([channel = channels[i], orig_path = original_paths[i], + wctx](CoroScope&) -> coro::CoroTask { + co_await run_writer(channel, std::move(orig_path), wctx); + }); + } + + auto parallelism = input.parallelism > 0 + ? input.parallelism + : dftracer_utils_hardware_concurrency(); + + ReaderContext rctx; + rctx.checkpoint_size = input.checkpoint_size; + rctx.original_paths = &original_paths; + rctx.channels = &channels; + + auto* per_reorg_ptr = &per_reorg_segments; + auto* rctx_ptr = &rctx; + + co_await ctx.scope([per_reorg_ptr, rctx_ptr, parallelism]( + CoroScope& producer_scope) -> coro::CoroTask { + auto permits = coro::make_channel(parallelism * 2); + for (std::size_t i = 0; i < parallelism * 2; ++i) { + permits->try_send(true); + } + + for (auto& [reorg_file, intervals] : *per_reorg_ptr) { + const auto* intervals_ptr = &intervals; + auto reorg_file_copy = reorg_file; + + producer_scope.spawn( + [reorg_file_copy, intervals_ptr, rctx_ptr, + permits](CoroScope& s) -> coro::CoroTask { + co_await s.receive(permits); + try { + co_await run_reader(s, std::move(reorg_file_copy), + intervals_ptr, *rctx_ptr); + permits->try_send(true); + } catch (...) 
{ + permits->try_send(true); + throw; + } + }); + } + + co_return; + }); + + // Producers done: close channels + for (auto& channel : channels) { + channel->close(); + } + + // Wait for writers + co_await ctx.join_all(); + + result.files = std::move(file_results); + result.total_events = total_events.load(); + result.total_bytes = total_bytes.load(); + result.success = true; + + co_return result; +} + +} // namespace dftracer::utils::utilities::composites::dft::reorganize diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.cpp index b64904be..30b7431d 100644 --- a/src/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.cpp +++ b/src/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.cpp @@ -1,8 +1,11 @@ #include #include +#include +#include #include #include #include +#include #include #include #include @@ -10,7 +13,7 @@ #include #include #include -#include +#include #include #include @@ -23,10 +26,12 @@ namespace dftracer::utils::utilities::composites::dft::reorganize { namespace { using common::query::Query; -using dftracer::utils::utilities::indexer::IndexBuildConfig; -using dftracer::utils::utilities::indexer::IndexBuilderUtility; +using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility; +using dftracer::utils::utilities::indexer::IndexBuildBatchConfig; using dftracer::utils::utilities::indexer::IndexDatabase; using fileio::lines::sources::async_streaming_gz_lines; +using indexing::IndexResolverUtility; +using indexing::ResolverInput; } // namespace @@ -49,6 +54,12 @@ std::vector parse_group_specs( coro::CoroTask ReorganizationPlannerUtility::process( const ReorganizationPlannerInput& input) { + if (!has_context()) { + throw std::runtime_error( + "ReorganizationPlannerUtility requires CoroScope context"); + } + CoroScope& scope = context(); + ExtractionPlan plan; plan.groups = 
input.groups; @@ -67,8 +78,6 @@ coro::CoroTask ReorganizationPlannerUtility::process( } } - // Ensure "remainder" group exists if not already - // specified bool has_remainder = false; for (const auto& g : input.groups) { if (g.query.empty()) { @@ -90,41 +99,94 @@ coro::CoroTask ReorganizationPlannerUtility::process( } } - // Process each source file - for (std::size_t fi = 0; fi < input.source_files.size(); ++fi) { - const auto& file_path = input.source_files[fi]; + if (input.source_files.empty()) { + co_return plan; + } - // Build the shared `.dftindex` store if needed. - IndexBuilderUtility idx_builder; - auto idx_input = IndexBuildConfig::for_file(file_path).with_index_dir( - input.index_dir); + // Use IndexResolverUtility to scan files and check capabilities + IndexResolverUtility resolver; + ResolverInput resolver_input; + resolver_input.files = input.source_files; + resolver_input.index_dir = input.index_dir; + resolver_input.require_checkpoints = true; + resolver_input.require_manifest = true; + auto scan_result = co_await scope.spawn(resolver, resolver_input); + + DFTRACER_UTILS_LOG_INFO( + "ReorganizationPlanner: %zu files, %zu cached, %zu need checkpoint, " + "%zu need manifest", + scan_result.all_files.size(), scan_result.cached.size(), + scan_result.needs_checkpoint.size(), scan_result.needs_manifest.size()); + + // Build indices in parallel for files needing work + std::vector files_needing_index; + for (const auto& item : scan_result.needs_checkpoint) { + files_needing_index.push_back(item.file_path); + } + for (const auto& item : scan_result.needs_manifest) { + files_needing_index.push_back(item.file_path); + } + + if (!files_needing_index.empty()) { + auto batch_config = std::make_shared(); + batch_config->file_paths = std::move(files_needing_index); + batch_config->index_dir = input.index_dir; if (input.checkpoint_size > 0) { - idx_input.with_checkpoint_size(input.checkpoint_size); + batch_config->checkpoint_size = input.checkpoint_size; } - auto 
idx_result = co_await idx_builder.process(idx_input); - if (!idx_result.success) { - throw std::runtime_error("Failed to build index for: " + file_path); + batch_config->build_manifest = true; + batch_config->use_batch_write = true; + + auto batch_result = + co_await IndexBatchBuilderUtility::process(&scope, batch_config); + + for (const auto& result : batch_result.results) { + if (!result.success) { + throw std::runtime_error( + "Failed to build index for: " + result.file_path + ": " + + result.error_message); + } } + } - // Collect metadata - MetadataCollectorUtility metadata_collector; + // Collect metadata in parallel using when_all + // Store inputs in vector to ensure lifetime across co_await + std::vector meta_inputs; + meta_inputs.reserve(input.source_files.size()); + for (const auto& file_path : input.source_files) { + auto index_path = + internal::determine_index_path(file_path, input.index_dir); auto meta_input = MetadataCollectorUtilityInput::from_file(file_path).with_index( - idx_result.index_path); + index_path); if (input.checkpoint_size > 0) { meta_input.with_checkpoint_size(input.checkpoint_size); } - auto meta = co_await metadata_collector.process(meta_input); + meta_inputs.push_back(std::move(meta_input)); + } + + std::vector> metadata_tasks; + metadata_tasks.reserve(meta_inputs.size()); + for (const auto& meta_input : meta_inputs) { + MetadataCollectorUtility collector; + metadata_tasks.push_back(collector.process(meta_input)); + } + + auto metadata_results = co_await coro::when_all(std::move(metadata_tasks)); + + // Build source file info from metadata results (same order as input) + plan.source_files.reserve(input.source_files.size()); + for (std::size_t fi = 0; fi < input.source_files.size(); ++fi) { + const auto& file_path = input.source_files[fi]; + const auto& meta = metadata_results[fi]; + if (!meta.success) { throw std::runtime_error("Failed to collect metadata for: " + file_path); } - // Determine the root-local `.dftindex` store path. 
std::string index_path = internal::determine_index_path(file_path, input.index_dir); - - // Effective checkpoint count: treat 0 as 1 std::size_t eff_ckpts = meta.num_checkpoints > 0 ? meta.num_checkpoints : 1; @@ -135,12 +197,16 @@ coro::CoroTask ReorganizationPlannerUtility::process( sfi.uncompressed_size = meta.uncompressed_size; sfi.checkpoint_size = meta.checkpoint_size; plan.source_files.push_back(std::move(sfi)); + } + + // Plan extraction tasks for each file + for (std::size_t fi = 0; fi < input.source_files.size(); ++fi) { + const auto& file_path = input.source_files[fi]; + const auto& meta = metadata_results[fi]; + const auto& sfi = plan.source_files[fi]; - // Open the shared index store and try manifest-based planning. Fall - // back to whole-file streaming when manifest tables are absent (file - // was below index_threshold). IndexDatabase idx_db( - index_path, + sfi.index_path, dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); int file_info_id = idx_db.get_file_info_id( indexer::internal::get_logical_path(file_path)); @@ -150,6 +216,7 @@ coro::CoroTask ReorganizationPlannerUtility::process( } const bool has_manifest = idx_db.has_manifest_data(file_info_id); + const std::size_t eff_ckpts = sfi.num_checkpoints; if (has_manifest) { // Manifest-based planning: per-checkpoint extraction tasks. @@ -216,23 +283,12 @@ coro::CoroTask ReorganizationPlannerUtility::process( } else { // Whole-file fallback: stream line-by-line, route each // event to a group, emit one task per group covering - // the entire file. Only enabled for files at or below - // the default index threshold to avoid pathological - // memory usage on large traces with corrupt/partial - // indexes. 
- auto file_size = fs::file_size(file_path); - if (file_size > constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD) { - throw std::runtime_error( - "Manifest tables missing for large file (>" + - std::to_string( - constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD) + - " bytes): " + file_path + - ". Re-index with index_threshold=0."); - } - + // the entire file. std::map> group_lines; std::vector meta_line_numbers; + simdjson::dom::parser parser; + auto gen = async_streaming_gz_lines(file_path); while (auto line_opt = co_await gen.next()) { const auto& line = *line_opt; @@ -252,42 +308,33 @@ coro::CoroTask ReorganizationPlannerUtility::process( continue; } - yyjson_doc* doc = - yyjson_read(begin, static_cast(end - begin), - YYJSON_READ_NOFLAG); - if (!doc) continue; + auto result = + parser.parse(begin, static_cast(end - begin)); + if (result.error()) continue; - yyjson_val* root = yyjson_doc_get_root(doc); - if (!root || !yyjson_is_obj(root)) { - yyjson_doc_free(doc); - continue; - } + auto root = result.value_unsafe(); + if (!root.is_object()) continue; auto line_num = static_cast(line.line_number); - yyjson_val* ph_val = yyjson_obj_get(root, "ph"); + auto ph_result = root["ph"].get_string(); const bool is_metadata = - ph_val && yyjson_is_str(ph_val) && - std::string_view(yyjson_get_str(ph_val), - yyjson_get_len(ph_val)) == "M"; + !ph_result.error() && ph_result.value_unsafe() == "M"; if (is_metadata) { meta_line_numbers.push_back(line_num); - yyjson_doc_free(doc); continue; } std::string cat_str; - if (yyjson_val* cat_val = yyjson_obj_get(root, "cat"); - cat_val && yyjson_is_str(cat_val)) { - cat_str.assign(yyjson_get_str(cat_val), - yyjson_get_len(cat_val)); + auto cat_result = root["cat"].get_string(); + if (!cat_result.error()) { + cat_str = std::string(cat_result.value_unsafe()); } std::string name_str; - if (yyjson_val* name_val = yyjson_obj_get(root, "name"); - name_val && yyjson_is_str(name_val)) { - name_str.assign(yyjson_get_str(name_val), - 
yyjson_get_len(name_val)); + auto name_result = root["name"].get_string(); + if (!name_result.error()) { + name_str = std::string(name_result.value_unsafe()); } bool matched = false; @@ -306,8 +353,6 @@ coro::CoroTask ReorganizationPlannerUtility::process( group_lines[remainder_name].push_back(line_num); } plan.total_events++; - - yyjson_doc_free(doc); } for (auto& [gname, lines] : group_lines) { diff --git a/src/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.cpp b/src/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.cpp index 8664442d..4a0611a3 100644 --- a/src/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.cpp @@ -1,11 +1,12 @@ #include #include +#include #include #include #include #include #include -#include +#include #include #include @@ -14,18 +15,13 @@ #include #include -// Import JsonValue from common json namespace using dftracer::utils::utilities::common::json::JsonValue; using dftracer::utils::utilities::composites::dft::DFTracerEvent; namespace dftracer::utils::utilities::composites::dft::statistics { -// Constant for global (non-grouped) I/O metrics inline constexpr std::string_view GLOBAL_GROUP_KEY = "__global__"; -// Event names where args.ret represents bytes transferred (actual I/O). -// Other syscalls like lseek64 (returns offset) and fork (returns PID) -// have ret with different semantics and must be excluded. static constexpr auto IO_EVENT_NAMES = std::to_array({"read", "write", "pread", "pwrite", "pread64", "pwrite64", "readv", "writev"}); @@ -35,46 +31,49 @@ static bool is_io_event(std::string_view name) { IO_EVENT_NAMES.end(); } -// Build a composite group key from the requested dimensions. 
static void build_group_key(std::string& key, const std::vector& group_by, - const JsonValue& json, const JsonValue& args) { + const DFTracerEvent& ev) { key.clear(); for (std::size_t i = 0; i < group_by.size(); ++i) { if (i > 0) key.push_back('|'); const auto& dim = group_by[i]; if (dim == "name") { - key += json["name"].get(); + key += ev.name; } else if (dim == "cat") { - key += json["cat"].get(); - } else if (dim == "pid" || dim == "tid") { - std::uint64_t val = json[dim].get(); + key += ev.cat; + } else if (dim == "pid") { char buf[32]; - auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), val); + auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), ev.pid); + if (ec == std::errc()) { + key.append(buf, ptr - buf); + } + } else if (dim == "tid") { + char buf[32]; + auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), ev.tid); if (ec == std::errc()) { key.append(buf, ptr - buf); } } else if (dim == "pid_tid") { - std::uint64_t pid = json["pid"].get(); - std::uint64_t tid = json["tid"].get(); char buf[64]; - auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), pid); + auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), ev.pid); if (ec == std::errc()) { key.append(buf, ptr - buf); key.push_back(':'); - auto [ptr2, ec2] = std::to_chars(ptr, buf + sizeof(buf), tid); + auto [ptr2, ec2] = + std::to_chars(ptr, buf + sizeof(buf), ev.tid); if (ec2 == std::errc()) { key.append(ptr, ptr2 - ptr); } } } else if (dim == "fhash") { - if (args.exists()) { - key += args["fhash"].get(); + if (ev.args.exists()) { + key += ev.args["fhash"].get(); } } else if (dim == "hhash") { - if (args.exists()) { - key += args["hhash"].get(); + if (ev.args.exists()) { + key += ev.args["hhash"].get(); } } } @@ -85,7 +84,6 @@ coro::CoroTask ChunkDetailScannerUtility::process( ChunkDetailScanOutput output; output.success = false; - // Build filter sets for O(1) lookup std::unordered_set name_filter; std::unordered_set cat_filter; if (input.filter_names) { @@ -103,7 +101,6 @@ 
coro::CoroTask ChunkDetailScannerUtility::process( bool has_cat_filter = !cat_filter.empty(); bool has_grouping = input.group_by && !input.group_by->empty(); - // Create reader (same pattern as chunk_indexer_utility.cpp) auto reader_input = composites::IndexedReadInput::from_file(input.file_path) .with_checkpoint_size(input.checkpoint_size) .with_index(input.index_path); @@ -141,9 +138,7 @@ coro::CoroTask ChunkDetailScannerUtility::process( group_key_buf.reserve(128); static const std::string global_key{GLOBAL_GROUP_KEY}; - char yy_buf[common::json::YYJSON_LINE_POOL_SIZE]; - yyjson_alc yy_alc; - yyjson_alc_pool_init(&yy_alc, yy_buf, sizeof(yy_buf)); + simdjson::dom::parser parser; while (!stream->done()) { auto chunk = co_await stream->read_async(); @@ -168,26 +163,18 @@ coro::CoroTask ChunkDetailScannerUtility::process( std::size_t line_len = newline - line_start; if (line_len > 0) { - yyjson_read_flag flg = YYJSON_READ_NOFLAG; - yyjson_doc* doc = - yyjson_read_opts(const_cast(line_start), line_len, - flg, &yy_alc, nullptr); - - if (doc) { - yyjson_val* root = yyjson_doc_get_root(doc); - if (root && yyjson_is_obj(root)) { + auto result = parser.parse(line_start, line_len); + if (!result.error()) { + auto root = result.value_unsafe(); + if (root.is_object()) { JsonValue json(root); DFTracerEvent ev; if (!DFTracerEvent::parse(json, ev)) { - yyjson_doc_free(doc); pos = (newline - data) + 1; continue; } if (!ev.is_metadata()) { - // Regular event - - // Apply filters bool passes = true; if (has_name_filter && name_filter.find(ev.name) == name_filter.end()) { @@ -201,21 +188,17 @@ coro::CoroTask ChunkDetailScannerUtility::process( if (passes) { double dur = static_cast(ev.dur); - // Global duration output.stats.duration.update(dur); - // Determine I/O key for this event bool is_io = is_io_event(ev.name); const std::string* io_key_ptr; if (has_grouping) { build_group_key(group_key_buf, - *input.group_by, json, - ev.args); + *input.group_by, ev); 
output.stats.grouped_duration[group_key_buf] .update(dur); - // Only insert category on first occurrence output.stats.group_key_category.try_emplace( group_key_buf, ev.cat); io_key_ptr = &group_key_buf; @@ -223,7 +206,6 @@ coro::CoroTask ChunkDetailScannerUtility::process( io_key_ptr = &global_key; } - // I/O metrics: only for actual I/O events if (is_io && ev.args.exists()) { auto ret_opt = ev.args["ret"] @@ -255,7 +237,6 @@ coro::CoroTask ChunkDetailScannerUtility::process( } } } - yyjson_doc_free(doc); } } diff --git a/src/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.cpp b/src/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.cpp index 57682886..856843cc 100644 --- a/src/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.cpp +++ b/src/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.cpp @@ -1,8 +1,9 @@ #include -#include #include #include +#include +#include namespace dftracer::utils::utilities::composites::dft::statistics { @@ -71,96 +72,87 @@ void DetailedStatistics::merge(const DetailedStatistics& other) { chunks_skipped += other.chunks_skipped; } -// Helper: serialize a DistributionStats into a yyjson mutable object -static yyjson_mut_val* distribution_to_json(yyjson_mut_doc* doc, - const DistributionStats& dist) { - yyjson_mut_val* obj = yyjson_mut_obj(doc); - - yyjson_mut_obj_add_uint(doc, obj, "count", dist.count()); - yyjson_mut_obj_add_real(doc, obj, "sum", dist.sum); - yyjson_mut_obj_add_real(doc, obj, "mean", dist.mean()); - yyjson_mut_obj_add_real(doc, obj, "stddev", dist.stddev()); +namespace { +void write_distribution_json(std::ostringstream& ss, + const DistributionStats& dist) { + ss << "{\"count\":" << dist.count(); + ss << ",\"sum\":" << dist.sum; + ss << ",\"mean\":" << dist.mean(); + ss << ",\"stddev\":" << dist.stddev(); if (dist.count() > 0 && !dist.sketch.empty()) { - yyjson_mut_obj_add_real(doc, obj, "min", dist.sketch.min()); - 
yyjson_mut_obj_add_real(doc, obj, "max", dist.sketch.max()); - - yyjson_mut_val* pctls = yyjson_mut_obj(doc); - yyjson_mut_obj_add_real(doc, pctls, "p10", dist.sketch.quantile(0.1)); - yyjson_mut_obj_add_real(doc, pctls, "p25", dist.sketch.quantile(0.25)); - yyjson_mut_obj_add_real(doc, pctls, "p50", dist.sketch.quantile(0.5)); - yyjson_mut_obj_add_real(doc, pctls, "p75", dist.sketch.quantile(0.75)); - yyjson_mut_obj_add_real(doc, pctls, "p90", dist.sketch.quantile(0.9)); - yyjson_mut_obj_add_real(doc, pctls, "p95", dist.sketch.quantile(0.95)); - yyjson_mut_obj_add_real(doc, pctls, "p99", dist.sketch.quantile(0.99)); - yyjson_mut_obj_add_val(doc, obj, "percentiles", pctls); + ss << ",\"min\":" << dist.sketch.min(); + ss << ",\"max\":" << dist.sketch.max(); + ss << ",\"percentiles\":{"; + ss << "\"p10\":" << dist.sketch.quantile(0.1); + ss << ",\"p25\":" << dist.sketch.quantile(0.25); + ss << ",\"p50\":" << dist.sketch.quantile(0.5); + ss << ",\"p75\":" << dist.sketch.quantile(0.75); + ss << ",\"p90\":" << dist.sketch.quantile(0.9); + ss << ",\"p95\":" << dist.sketch.quantile(0.95); + ss << ",\"p99\":" << dist.sketch.quantile(0.99); + ss << '}'; } - // Direct serialization to avoid string roundtrip - yyjson_mut_val* hist_val = dist.histogram.to_yyjson(doc); - yyjson_mut_obj_add_val(doc, obj, "histogram", hist_val); - - return obj; + ss << ",\"histogram\":" << dist.histogram.to_json(); + ss << '}'; } -// Helper: serialize IOEventMetrics into a yyjson mutable object -static yyjson_mut_val* io_metrics_to_json(yyjson_mut_doc* doc, - const IOEventMetrics& io) { - yyjson_mut_val* obj = yyjson_mut_obj(doc); - yyjson_mut_obj_add_val(doc, obj, "duration", - distribution_to_json(doc, io.duration)); - yyjson_mut_obj_add_val(doc, obj, "size", - distribution_to_json(doc, io.size)); +void write_io_metrics_json(std::ostringstream& ss, const IOEventMetrics& io) { + ss << "{\"duration\":"; + write_distribution_json(ss, io.duration); + ss << ",\"size\":"; + 
write_distribution_json(ss, io.size); if (io.bandwidth.count() > 0) { - yyjson_mut_obj_add_val(doc, obj, "bandwidth", - distribution_to_json(doc, io.bandwidth)); + ss << ",\"bandwidth\":"; + write_distribution_json(ss, io.bandwidth); } if (io.offset.count() > 0) { - yyjson_mut_obj_add_val(doc, obj, "offset", - distribution_to_json(doc, io.offset)); + ss << ",\"offset\":"; + write_distribution_json(ss, io.offset); } - return obj; + ss << '}'; } +} // namespace std::string DetailedStatistics::to_json() const { - yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr); - yyjson_mut_val* root = yyjson_mut_obj(doc); - yyjson_mut_doc_set_root(doc, root); + std::ostringstream ss; + ss << std::setprecision(17); + ss << '{'; - // Scan progress - yyjson_mut_obj_add_uint(doc, root, "events_scanned", events_scanned); - yyjson_mut_obj_add_uint(doc, root, "chunks_scanned", chunks_scanned); - yyjson_mut_obj_add_uint(doc, root, "chunks_skipped", chunks_skipped); + ss << "\"events_scanned\":" << events_scanned; + ss << ",\"chunks_scanned\":" << chunks_scanned; + ss << ",\"chunks_skipped\":" << chunks_skipped; - // Global duration - yyjson_mut_obj_add_val(doc, root, "duration", - distribution_to_json(doc, duration)); + ss << ",\"duration\":"; + write_distribution_json(ss, duration); - // Grouped duration if (!grouped_duration.empty()) { - yyjson_mut_val* gd = yyjson_mut_obj(doc); + ss << ",\"grouped_duration\":{"; + bool first = true; for (const auto& [key, dist] : grouped_duration) { - yyjson_mut_obj_add_val(doc, gd, key.c_str(), - distribution_to_json(doc, dist)); + if (!first) ss << ','; + first = false; + ss << '"' << key << "\":"; + write_distribution_json(ss, dist); } - yyjson_mut_obj_add_val(doc, root, "grouped_duration", gd); + ss << '}'; } - // Grouped I/O if (!grouped_io.empty()) { - yyjson_mut_val* gio = yyjson_mut_obj(doc); + ss << ",\"grouped_io\":{"; + bool first = true; for (const auto& [key, io] : grouped_io) { - yyjson_mut_obj_add_val(doc, gio, key.c_str(), - 
io_metrics_to_json(doc, io)); + if (!first) ss << ','; + first = false; + ss << '"' << key << "\":"; + write_io_metrics_json(ss, io); } - yyjson_mut_obj_add_val(doc, root, "grouped_io", gio); + ss << '}'; } - char* json_str = yyjson_mut_write(doc, YYJSON_WRITE_PRETTY, nullptr); - std::string result(json_str ? json_str : "{}"); - if (json_str) free(json_str); - yyjson_mut_doc_free(doc); - return result; + ss << '}'; + return ss.str(); } } // namespace dftracer::utils::utilities::composites::dft::statistics diff --git a/src/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.cpp b/src/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.cpp new file mode 100644 index 00000000..80870278 --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.cpp @@ -0,0 +1,5 @@ +#include + +namespace dftracer::utils::utilities::composites::dft::statistics { + +} // namespace dftracer::utils::utilities::composites::dft::statistics diff --git a/src/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.cpp b/src/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.cpp index 6f34cdd0..8a72329f 100644 --- a/src/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.cpp @@ -1,16 +1,16 @@ #include -#include #include #include #include #include #include #include -#include +#include namespace dftracer::utils::utilities::composites::dft::statistics { using dftracer::utils::utilities::common::json::JsonValue; +using dftracer::utils::utilities::indexer::ChunkStatisticsResult; using dftracer::utils::utilities::indexer::IndexDatabase; using dftracer::utils::utilities::indexer::internal::get_logical_path; using fileio::lines::sources::async_streaming_gz_lines; @@ -35,33 +35,29 @@ coro::CoroTask 
StatisticsAggregatorUtility::process( } bool needs_streaming_fallback = false; - auto do_query = [&input, &result, - &needs_streaming_fallback]() -> TraceStatistics { - try { - IndexDatabase idx_db(result.index_path); - - int fid = - idx_db.get_file_info_id(get_logical_path(input.file_path)); - if (fid < 0) { - result.success = false; - result.error_message = - "File not found in index: " + input.file_path; - return result; - } + try { + IndexDatabase idx_db(result.index_path); - std::vector chunks; - try { - chunks = idx_db.query_chunk_statistics(fid); - } catch (const std::exception&) { - needs_streaming_fallback = true; - return result; - } + int fid = idx_db.get_file_info_id(get_logical_path(input.file_path)); + if (fid < 0) { + result.success = false; + result.error_message = + "File not found in index: " + input.file_path; + co_return result; + } - if (chunks.empty()) { - needs_streaming_fallback = true; - return result; - } + std::vector chunks; + try { + chunks = idx_db.query_chunk_statistics(fid); + } catch (const std::exception&) { + needs_streaming_fallback = true; + } + if (!needs_streaming_fallback && chunks.empty()) { + needs_streaming_fallback = true; + } + + if (!needs_streaming_fallback) { result.num_chunks = chunks.size(); result.merged = chunks[0].stats; for (std::size_t i = 1; i < chunks.size(); ++i) { @@ -70,6 +66,8 @@ coro::CoroTask StatisticsAggregatorUtility::process( auto dim_stats = idx_db.query_chunk_dimension_stats(fid); for (const auto& ds : dim_stats) { + if (!ds.has_value_counts_payload()) continue; + ds.ensure_value_counts_decoded(); if (!ds.value_counts) continue; if (ds.dimension == "cat") { for (const auto& [k, v] : *ds.value_counts) @@ -84,14 +82,13 @@ coro::CoroTask StatisticsAggregatorUtility::process( } result.success = true; - } catch (const std::exception& e) { - result.success = false; - result.error_message = e.what(); + co_return result; } - return result; - }; - - result = co_await rocksdb::run(do_query); + } catch 
(const std::exception& e) { + result.success = false; + result.error_message = e.what(); + co_return result; + } if (!needs_streaming_fallback) { co_return result; @@ -104,24 +101,21 @@ coro::CoroTask StatisticsAggregatorUtility::process( } /// Sequential fallback: stream the file line-by-line and compute - /// statistics on-the-fly when the index has no chunk_statistics - /// (e.g. file was below the index_threshold). + /// statistics on-the-fly when the index has no chunk_statistics. try { indexing::ChunkStatistics stats; + simdjson::dom::parser parser; auto gen = async_streaming_gz_lines(input.file_path); while (auto line_opt = co_await gen.next()) { const auto& line = *line_opt; if (line.content.empty()) continue; - yyjson_doc* doc = yyjson_read( - line.content.data(), line.content.size(), YYJSON_READ_NOFLAG); - if (!doc) continue; + auto parse_result = + parser.parse(line.content.data(), line.content.size()); + if (parse_result.error()) continue; - yyjson_val* root = yyjson_doc_get_root(doc); - if (!root || !yyjson_is_obj(root)) { - yyjson_doc_free(doc); - continue; - } + auto root = parse_result.value_unsafe(); + if (!root.is_object()) continue; try { JsonValue json(root); @@ -138,11 +132,7 @@ coro::CoroTask StatisticsAggregatorUtility::process( stats.update_from_event(name, cat, pid, tid, ts, dur); } } catch (const std::exception&) { - // Skip malformed or partial events without - // aborting the entire aggregation. 
} - - yyjson_doc_free(doc); } result.merged = std::move(stats); @@ -156,4 +146,104 @@ coro::CoroTask StatisticsAggregatorUtility::process( co_return result; } +coro::CoroTask> +StatisticsAggregatorUtility::process_batch( + const StatisticsAggregatorBatchInput& input) { + if (input.file_paths.empty()) { + co_return std::vector{}; + } + + const auto& index_path = input.index_path; + if (!fs::exists(index_path)) { + std::vector results; + results.reserve(input.file_paths.size()); + for (const auto& fp : input.file_paths) { + TraceStatistics r; + r.file_path = fp; + r.index_path = index_path; + r.success = false; + r.error_message = "Index store not found: " + index_path; + results.push_back(std::move(r)); + } + co_return results; + } + + const auto& files = input.file_paths; + std::vector results; + results.resize(files.size()); + for (std::size_t i = 0; i < files.size(); ++i) { + results[i].file_path = files[i]; + results[i].index_path = index_path; + } + + try { + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + + std::vector file_ids(files.size(), -1); + for (std::size_t i = 0; i < files.size(); ++i) { + file_ids[i] = db.get_file_info_id(get_logical_path(files[i])); + if (file_ids[i] < 0) { + results[i].success = false; + results[i].error_message = + "File not found in index: " + files[i]; + } + } + + std::vector valid_ids; + valid_ids.reserve(files.size()); + for (std::size_t i = 0; i < files.size(); ++i) { + if (file_ids[i] >= 0) valid_ids.push_back(file_ids[i]); + } + + auto scalar_batch = db.query_file_scalar_stats_batch(valid_ids); + auto cat_batch = db.query_file_category_counts_batch(valid_ids); + auto pid_tid_batch = db.query_file_pid_tid_counts_batch(valid_ids); + auto name_batch = db.query_file_name_summaries_batch(valid_ids); + + for (std::size_t i = 0; i < files.size(); ++i) { + if (file_ids[i] < 0) continue; + const int fid = file_ids[i]; + + auto scalar_it = scalar_batch.find(fid); + if (scalar_it == 
scalar_batch.end()) { + results[i].success = false; + results[i].error_message = + "No file summary in index for: " + files[i]; + continue; + } + + results[i].merged = scalar_it->second.stats; + results[i].num_chunks = scalar_it->second.num_chunks; + results[i].success = true; + + auto cat_it = cat_batch.find(fid); + if (cat_it != cat_batch.end()) { + results[i].merged.category_counts = std::move(cat_it->second); + } + + auto pid_it = pid_tid_batch.find(fid); + if (pid_it != pid_tid_batch.end()) { + results[i].merged.pid_tid_counts = std::move(pid_it->second); + } + + auto name_it = name_batch.find(fid); + if (name_it != name_batch.end()) { + results[i].merged.name_counts = + std::move(name_it->second.counts); + } + } + } catch (const std::exception& e) { + for (std::size_t i = 0; i < files.size(); ++i) { + if (!results[i].success && results[i].error_message.empty()) { + results[i].success = false; + results[i].error_message = e.what(); + } + } + } + + co_return results; +} + } // namespace dftracer::utils::utilities::composites::dft::statistics diff --git a/src/dftracer/utils/utilities/composites/dft/statistics/statistics_query_utility.cpp b/src/dftracer/utils/utilities/composites/dft/statistics/statistics_query_utility.cpp index f5f8b641..f70933b6 100644 --- a/src/dftracer/utils/utilities/composites/dft/statistics/statistics_query_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/statistics/statistics_query_utility.cpp @@ -1,24 +1,29 @@ #include -#include #include #include +#include #include +#include namespace dftracer::utils::utilities::composites::dft::statistics { namespace { -std::vector> sorted_desc( - const std::unordered_map& m) { +template +std::vector> sorted_desc(const Map& m) { std::vector> v(m.begin(), m.end()); std::sort(v.begin(), v.end(), [](const auto& a, const auto& b) { return a.second > b.second; }); return v; } -std::vector> top_n( - const std::unordered_map& m, std::uint64_t n) { +template +std::vector> top_n(const Map& m, + 
std::uint64_t n) { auto v = sorted_desc(m); + if (n == 0) { + return v; + } if (v.size() > n) { v.resize(static_cast(n)); } @@ -129,48 +134,44 @@ coro::CoroTask StatisticsQueryUtility::process( } std::string StatisticsQueryOutput::to_json() const { - yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr); - yyjson_mut_val* root = yyjson_mut_obj(doc); - yyjson_mut_doc_set_root(doc, root); + std::ostringstream ss; + ss << std::setprecision(17); + ss << '{'; - yyjson_mut_obj_add_str(doc, root, "query_type", query_type_name.c_str()); - yyjson_mut_obj_add_uint(doc, root, "total_events", total_events); + ss << "\"query_type\":\"" << query_type_name << '"'; + ss << ",\"total_events\":" << total_events; if (!results.empty()) { - yyjson_mut_val* arr = yyjson_mut_arr(doc); + ss << ",\"results\":["; + bool first = true; for (const auto& [name, count] : results) { - yyjson_mut_val* item = yyjson_mut_obj(doc); - yyjson_mut_obj_add_str(doc, item, "name", name.c_str()); - yyjson_mut_obj_add_uint(doc, item, "count", count); - yyjson_mut_arr_append(arr, item); + if (!first) ss << ','; + first = false; + ss << "{\"name\":\"" << name << "\",\"count\":" << count << '}'; } - yyjson_mut_obj_add_val(doc, root, "results", arr); + ss << ']'; } if (min_timestamp_us > 0 || max_timestamp_us > 0) { - yyjson_mut_val* tr = yyjson_mut_obj(doc); - yyjson_mut_obj_add_uint(doc, tr, "min_timestamp_us", min_timestamp_us); - yyjson_mut_obj_add_uint(doc, tr, "max_timestamp_us", max_timestamp_us); - yyjson_mut_obj_add_real(doc, tr, "time_span_seconds", - time_span_seconds); - yyjson_mut_obj_add_val(doc, root, "time_range", tr); + ss << ",\"time_range\":{"; + ss << "\"min_timestamp_us\":" << min_timestamp_us; + ss << ",\"max_timestamp_us\":" << max_timestamp_us; + ss << ",\"time_span_seconds\":" << time_span_seconds; + ss << '}'; } if (duration_count > 0) { - yyjson_mut_val* dur = yyjson_mut_obj(doc); - yyjson_mut_obj_add_uint(doc, dur, "count", duration_count); - yyjson_mut_obj_add_real(doc, dur, "mean_us", 
duration_mean_us); - yyjson_mut_obj_add_real(doc, dur, "stddev_us", duration_stddev_us); - yyjson_mut_obj_add_uint(doc, dur, "min_us", duration_min_us); - yyjson_mut_obj_add_uint(doc, dur, "max_us", duration_max_us); - yyjson_mut_obj_add_val(doc, root, "duration", dur); + ss << ",\"duration\":{"; + ss << "\"count\":" << duration_count; + ss << ",\"mean_us\":" << duration_mean_us; + ss << ",\"stddev_us\":" << duration_stddev_us; + ss << ",\"min_us\":" << duration_min_us; + ss << ",\"max_us\":" << duration_max_us; + ss << '}'; } - char* json_str = yyjson_mut_write(doc, YYJSON_WRITE_PRETTY, nullptr); - std::string result(json_str ? json_str : "{}"); - if (json_str) free(json_str); - yyjson_mut_doc_free(doc); - return result; + ss << '}'; + return ss.str(); } } // namespace dftracer::utils::utilities::composites::dft::statistics diff --git a/src/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.cpp b/src/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.cpp index 584c3a4c..59778407 100644 --- a/src/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.cpp +++ b/src/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.cpp @@ -1,7 +1,8 @@ #include -#include #include +#include +#include namespace dftracer::utils::utilities::composites::dft::statistics { @@ -40,80 +41,77 @@ std::size_t TraceStatistics::num_pid_tids() const { } namespace { -void add_counts_object( - yyjson_mut_doc* doc, yyjson_mut_val* parent, const char* key, - const std::unordered_map& m) { - yyjson_mut_val* obj = yyjson_mut_obj(doc); +template +void add_counts_object(std::ostringstream& ss, const char* key, const Map& m, + bool& first_field) { + if (!first_field) ss << ','; + first_field = false; + ss << '"' << key << "\":{"; + bool first = true; for (const auto& [k, v] : m) { - yyjson_mut_obj_add_uint(doc, obj, k.c_str(), v); + if (!first) ss << ','; + first = false; + ss << '"' << k << "\":" << v; } - yyjson_mut_obj_add_val(doc, parent, 
key, obj); + ss << '}'; } } // namespace std::string TraceStatistics::to_json() const { - yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr); - yyjson_mut_val* root = yyjson_mut_obj(doc); - yyjson_mut_doc_set_root(doc, root); + std::ostringstream ss; + ss << std::setprecision(17); + ss << '{'; - yyjson_mut_obj_add_str(doc, root, "file_path", file_path.c_str()); - yyjson_mut_obj_add_str(doc, root, "index_path", index_path.c_str()); - yyjson_mut_obj_add_bool(doc, root, "success", success); + ss << "\"file_path\":\"" << file_path << '"'; + ss << ",\"index_path\":\"" << index_path << '"'; + ss << ",\"success\":" << (success ? "true" : "false"); + ss << ",\"total_events\":" << total_events(); if (!success) { - yyjson_mut_obj_add_str(doc, root, "error", error_message.c_str()); + ss << ",\"error\":\"" << error_message << '"'; } else { - yyjson_mut_obj_add_uint(doc, root, "num_chunks", num_chunks); - yyjson_mut_obj_add_uint(doc, root, "total_events", total_events()); - yyjson_mut_obj_add_uint(doc, root, "num_categories", num_categories()); - yyjson_mut_obj_add_uint(doc, root, "num_unique_names", - num_unique_names()); - yyjson_mut_obj_add_uint(doc, root, "num_pid_tids", num_pid_tids()); + ss << ",\"num_chunks\":" << num_chunks; + ss << ",\"num_categories\":" << num_categories(); + ss << ",\"num_unique_names\":" << num_unique_names(); + ss << ",\"num_pid_tids\":" << num_pid_tids(); // Time range - yyjson_mut_val* time_range = yyjson_mut_obj(doc); + ss << ",\"time_range\":{"; if (merged.min_timestamp_us != std::numeric_limits::max()) { - yyjson_mut_obj_add_uint(doc, time_range, "min_timestamp_us", - merged.min_timestamp_us); - yyjson_mut_obj_add_uint(doc, time_range, "max_timestamp_us", - merged.max_timestamp_us); + ss << "\"min_timestamp_us\":" << merged.min_timestamp_us; + ss << ",\"max_timestamp_us\":" << merged.max_timestamp_us; + ss << ','; } - yyjson_mut_obj_add_real(doc, time_range, "time_span_seconds", - time_span_seconds()); - yyjson_mut_obj_add_val(doc, root, 
"time_range", time_range); + ss << "\"time_span_seconds\":" << time_span_seconds(); + ss << '}'; // Duration stats - yyjson_mut_val* duration = yyjson_mut_obj(doc); - yyjson_mut_obj_add_uint(doc, duration, "count", merged.duration_count); + ss << ",\"duration\":{"; + ss << "\"count\":" << merged.duration_count; if (merged.duration_count > 0) { - yyjson_mut_obj_add_int(doc, duration, "sum_us", - merged.duration_sum_us); - yyjson_mut_obj_add_real(doc, duration, "mean_us", - duration_mean_us()); - yyjson_mut_obj_add_real(doc, duration, "stddev_us", - duration_stddev_us()); + ss << ",\"sum_us\":" << merged.duration_sum_us; + ss << ",\"mean_us\":" << duration_mean_us(); + ss << ",\"stddev_us\":" << duration_stddev_us(); if (merged.duration_min_us != std::numeric_limits::max()) { - yyjson_mut_obj_add_uint(doc, duration, "min_us", - merged.duration_min_us); + ss << ",\"min_us\":" << merged.duration_min_us; } - yyjson_mut_obj_add_uint(doc, duration, "max_us", - merged.duration_max_us); + ss << ",\"max_us\":" << merged.duration_max_us; } - yyjson_mut_obj_add_val(doc, root, "duration", duration); + ss << '}'; // Count maps - add_counts_object(doc, root, "category_counts", merged.category_counts); - add_counts_object(doc, root, "name_counts", merged.name_counts); - add_counts_object(doc, root, "pid_tid_counts", merged.pid_tid_counts); + bool first_field = false; + add_counts_object(ss, "category_counts", merged.category_counts, + first_field); + add_counts_object(ss, "name_counts", merged.name_counts, first_field); + add_counts_object(ss, "pid_tid_counts", merged.pid_tid_counts, + first_field); } - char* json_str = yyjson_mut_write(doc, YYJSON_WRITE_PRETTY, nullptr); - std::string result(json_str ? 
json_str : "{}"); - if (json_str) free(json_str); - yyjson_mut_doc_free(doc); - return result; + ss << '}'; + return ss.str(); } } // namespace dftracer::utils::utilities::composites::dft::statistics diff --git a/src/dftracer/utils/utilities/composites/dft/views/view_definition.cpp b/src/dftracer/utils/utilities/composites/dft/views/view_definition.cpp index ac132b15..476b2919 100644 --- a/src/dftracer/utils/utilities/composites/dft/views/view_definition.cpp +++ b/src/dftracer/utils/utilities/composites/dft/views/view_definition.cpp @@ -1,7 +1,8 @@ #include -#include +#include #include +#include #include namespace dftracer::utils::utilities::composites::dft::views { @@ -34,55 +35,99 @@ ViewDefinition& ViewDefinition::with_include_metadata(bool v) { return *this; } -std::string ViewDefinition::to_json() const { - yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr); - yyjson_mut_val* root = yyjson_mut_obj(doc); - yyjson_mut_doc_set_root(doc, root); +namespace { + +std::string escape_json_string(const std::string& s) { + std::string result; + result.reserve(s.size()); + for (char c : s) { + switch (c) { + case '"': + result += "\\\""; + break; + case '\\': + result += "\\\\"; + break; + case '\b': + result += "\\b"; + break; + case '\f': + result += "\\f"; + break; + case '\n': + result += "\\n"; + break; + case '\r': + result += "\\r"; + break; + case '\t': + result += "\\t"; + break; + default: + result += c; + break; + } + } + return result; +} - yyjson_mut_obj_add_str(doc, root, "name", name.c_str()); - yyjson_mut_obj_add_str(doc, root, "description", description.c_str()); +} // namespace + +std::string ViewDefinition::to_json() const { + std::ostringstream out; + out << "{\n"; + out << " \"name\": \"" << escape_json_string(name) << "\",\n"; + out << " \"description\": \"" << escape_json_string(description) << "\""; if (query) { - yyjson_mut_obj_add_str(doc, root, "query", query->source().c_str()); + out << ",\n \"query\": \"" << escape_json_string(query->source()) 
+ << "\""; } - yyjson_mut_obj_add_bool(doc, root, "include_metadata", include_metadata); - - char* json_str = yyjson_mut_write(doc, YYJSON_WRITE_PRETTY, nullptr); - std::string result(json_str); - free(json_str); - yyjson_mut_doc_free(doc); - return result; + out << ",\n \"include_metadata\": " + << (include_metadata ? "true" : "false") << "\n"; + out << "}"; + return out.str(); } ViewDefinition ViewDefinition::from_json(const std::string& json) { - yyjson_doc* doc = - yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG); - yyjson_val* root = yyjson_doc_get_root(doc); - ViewDefinition view_def; - yyjson_val* name_val = yyjson_obj_get(root, "name"); - if (name_val && yyjson_is_str(name_val)) { - view_def.name = yyjson_get_str(name_val); + simdjson::dom::parser parser; + auto result = parser.parse(json); + if (result.error()) { + return view_def; + } + + auto root = result.value_unsafe(); + if (!root.is_object()) { + return view_def; + } + + auto name_result = root["name"]; + if (!name_result.error() && name_result.value_unsafe().is_string()) { + view_def.name = + std::string(name_result.value_unsafe().get_string().value()); } - yyjson_val* desc_val = yyjson_obj_get(root, "description"); - if (desc_val && yyjson_is_str(desc_val)) { - view_def.description = yyjson_get_str(desc_val); + auto desc_result = root["description"]; + if (!desc_result.error() && desc_result.value_unsafe().is_string()) { + view_def.description = + std::string(desc_result.value_unsafe().get_string().value()); } - yyjson_val* query_val = yyjson_obj_get(root, "query"); - if (query_val && yyjson_is_str(query_val)) { - view_def.with_query(yyjson_get_str(query_val)); + auto query_result = root["query"]; + if (!query_result.error() && query_result.value_unsafe().is_string()) { + view_def.with_query( + std::string(query_result.value_unsafe().get_string().value())); } - yyjson_val* meta_val = yyjson_obj_get(root, "include_metadata"); - if (meta_val && yyjson_is_bool(meta_val)) { - 
view_def.include_metadata = yyjson_get_bool(meta_val); + auto meta_result = root["include_metadata"]; + if (!meta_result.error() && meta_result.value_unsafe().is_bool()) { + view_def.include_metadata = + meta_result.value_unsafe().get_bool().value(); } - yyjson_doc_free(doc); return view_def; } diff --git a/src/dftracer/utils/utilities/composites/dft/views/view_reader_utility.cpp b/src/dftracer/utils/utilities/composites/dft/views/view_reader_utility.cpp index 5759ff4e..64403843 100644 --- a/src/dftracer/utils/utilities/composites/dft/views/view_reader_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/views/view_reader_utility.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include @@ -134,10 +134,7 @@ coro::AsyncGenerator ViewReaderUtility::process( ViewReaderBatch batch; - // NOTE(perf): reusable yyjson allocator - char yy_buf[common::json::YYJSON_LINE_POOL_SIZE]; - yyjson_alc yy_alc; - yyjson_alc_pool_init(&yy_alc, yy_buf, sizeof(yy_buf)); + simdjson::dom::parser parser; while (!stream->done()) { auto chunk = co_await stream->read_async(); @@ -155,13 +152,10 @@ coro::AsyncGenerator ViewReaderUtility::process( std::size_t line_len = newline - line_start; if (line_len > 0) { - yyjson_doc* doc = - yyjson_read_opts(const_cast(line_start), line_len, - YYJSON_READ_NOFLAG, &yy_alc, nullptr); - - if (doc) { - yyjson_val* root = yyjson_doc_get_root(doc); - if (root && yyjson_is_obj(root)) { + auto result = parser.parse(line_start, line_len); + if (!result.error()) { + auto root = result.value_unsafe(); + if (root.is_object()) { JsonValue json(root); std::string_view ph = json["ph"].get(); @@ -206,7 +200,6 @@ coro::AsyncGenerator ViewReaderUtility::process( } } } - yyjson_doc_free(doc); } } @@ -241,69 +234,56 @@ using common::arrow::RecordBatchBuilder; ArrowExportResult ViewReaderBatch::to_arrow() const { RecordBatchBuilder builder; + return to_arrow(builder); +} + +ArrowExportResult ViewReaderBatch::to_arrow(RecordBatchBuilder& 
builder) const { builder.reserve(events.size()); - std::vector held_docs; std::vector held_serialized; + simdjson::dom::parser parser; for (const auto& event_str : events) { - yyjson_doc* doc = yyjson_read(event_str.data(), event_str.size(), 0); - if (!doc) continue; - yyjson_val* root = yyjson_doc_get_root(doc); - if (!root || !yyjson_is_obj(root)) { - yyjson_doc_free(doc); - continue; - } - held_docs.push_back(doc); + auto result = parser.parse(event_str.data(), event_str.size()); + if (result.error()) continue; + auto elem = result.value_unsafe(); + if (!elem.is_object()) continue; + + auto obj_result = elem.get_object(); + if (obj_result.error()) continue; + auto obj = obj_result.value_unsafe(); - yyjson_obj_iter it; - yyjson_obj_iter_init(root, &it); - yyjson_val* key; - while ((key = yyjson_obj_iter_next(&it))) { - yyjson_val* val = yyjson_obj_iter_get_val(key); - std::string_view key_sv(yyjson_get_str(key), yyjson_get_len(key)); + for (auto field : obj) { + std::string_view key_sv = field.key; + auto val = field.value; - if (yyjson_is_int(val)) { + if (val.is_int64()) { auto ci = builder.add_or_get_column(key_sv, ColumnType::INT64); - builder.append_int64(ci, yyjson_get_sint(val)); - } else if (yyjson_is_uint(val)) { + builder.append_int64(ci, val.get_int64().value_unsafe()); + } else if (val.is_uint64()) { auto ci = builder.add_or_get_column(key_sv, ColumnType::UINT64); - builder.append_uint64(ci, yyjson_get_uint(val)); - } else if (yyjson_is_real(val)) { + builder.append_uint64(ci, val.get_uint64().value_unsafe()); + } else if (val.is_double()) { auto ci = builder.add_or_get_column(key_sv, ColumnType::DOUBLE); - builder.append_double(ci, yyjson_get_real(val)); - } else if (yyjson_is_bool(val)) { + builder.append_double(ci, val.get_double().value_unsafe()); + } else if (val.is_bool()) { auto ci = builder.add_or_get_column(key_sv, ColumnType::BOOL); - builder.append_bool(ci, yyjson_get_bool(val)); - } else if (yyjson_is_str(val)) { + builder.append_bool(ci, 
val.get_bool().value_unsafe()); + } else if (val.is_string()) { auto ci = builder.add_or_get_column(key_sv, ColumnType::STRING); - builder.append_string( - ci, - std::string_view(yyjson_get_str(val), yyjson_get_len(val))); - } else if (yyjson_is_null(val)) { - // Only append null to an existing column; skip if new — - // we don't know the type yet and STRING would corrupt later - // typed appends. + builder.append_string(ci, val.get_string().value_unsafe()); + } else if (val.is_null()) { auto existing = builder.find_column(key_sv); if (existing) builder.append_null(*existing); } else { auto ci = builder.add_or_get_column(key_sv, ColumnType::STRING); - std::size_t jlen; - char* js = yyjson_val_write(val, 0, &jlen); - if (js) { - held_serialized.emplace_back(js, jlen); - free(js); - builder.append_string(ci, held_serialized.back()); - } else { - builder.append_null(ci); - } + held_serialized.push_back(simdjson::minify(val)); + builder.append_string(ci, held_serialized.back()); } } builder.end_row(); } - auto result = builder.finish(); - for (auto* d : held_docs) yyjson_doc_free(d); - return result; + return builder.finish(); } } // namespace dftracer::utils::utilities::composites::dft::views diff --git a/src/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.cpp b/src/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.cpp new file mode 100644 index 00000000..b42975e6 --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.cpp @@ -0,0 +1,652 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using dftracer::utils::utilities::composites::dft::indexing::BloomFilter; +namespace dftracer::utils::utilities::composites::dft::visitors { + +namespace { + +constexpr std::string_view DIM_NAME = "name"; +constexpr std::string_view DIM_CAT = "cat"; +constexpr std::string_view DIM_PID = "pid"; +constexpr std::string_view DIM_TID = "tid"; +constexpr 
std::string_view DIM_PID_TID = "pid_tid"; +constexpr std::string_view DIM_HHASH = "hhash"; +constexpr std::string_view DIM_FHASH = "fhash"; +constexpr std::string_view DIM_SHASH = "shash"; +constexpr std::string_view DIM_TS = "ts"; +constexpr std::string_view DIM_DUR = "dur"; + +constexpr std::array + FIXED_BLOOM_NAMES = {DIM_NAME, DIM_CAT, DIM_PID, DIM_TID, + DIM_HHASH, DIM_FHASH, DIM_SHASH}; + +constexpr std::array FIXED_DIM_NAMES = + {DIM_NAME, DIM_CAT, DIM_PID, DIM_TID, DIM_PID_TID, + DIM_HHASH, DIM_FHASH, DIM_SHASH, DIM_TS, DIM_DUR}; + +int fixed_bloom_index(std::string_view name) { + for (std::size_t i = 0; i < FIXED_BLOOM_NAMES.size(); ++i) { + if (FIXED_BLOOM_NAMES[i] == name) return static_cast(i); + } + return -1; +} + +inline std::string_view dom_string(simdjson::dom::element obj, + std::string_view key) { + auto r = obj[key]; + if (r.error()) return {}; + auto v = r.value_unsafe(); + if (!v.is_string()) return {}; + return v.get_string().value_unsafe(); +} + +bool dom_value_to_string(simdjson::dom::element val, std::string& out) { + out.clear(); + if (val.is_string()) { + auto sv = val.get_string().value_unsafe(); + out.assign(sv.data(), sv.size()); + return !out.empty(); + } + char buf[32]; + if (val.is_uint64()) { + auto [p, _] = std::to_chars(buf, buf + sizeof(buf), + val.get_uint64().value_unsafe()); + out.assign(buf, p); + return true; + } + if (val.is_int64()) { + auto [p, _] = std::to_chars(buf, buf + sizeof(buf), + val.get_int64().value_unsafe()); + out.assign(buf, p); + return true; + } + if (val.is_double()) { + auto [p, _] = std::to_chars(buf, buf + sizeof(buf), + val.get_double().value_unsafe()); + out.assign(buf, p); + return true; + } + if (val.is_bool()) { + out = val.get_bool().value_unsafe() ? "true" : "false"; + return true; + } + return false; +} + +/// Emit bloom/stats/dimension records to a sink that might be either a +/// RocksDB-backed writer or an SST file emitter. 
Returns the accumulated +/// file-level statistics so downstream callers can use them for name +/// postings and root-summary refresh on the concrete writer. +BloomVisitor::ChunkStatistics persist_bloom_sink_writes( + indexer::IndexBatchSink& db, int file_id, + const std::vector& extra_dim_names, + const std::vector& chunks, + const BloomVisitor::ChunkIndexerConfig& config) { + BloomVisitor::ChunkStatistics file_statistics; + + // Accumulate file-level blooms per slot. + std::array file_fixed_blooms = { + BloomFilter(config.expected_entries_per_chunk, + config.false_positive_rate), + BloomFilter(config.expected_entries_per_chunk, + config.false_positive_rate), + BloomFilter(config.expected_entries_per_chunk, + config.false_positive_rate), + BloomFilter(config.expected_entries_per_chunk, + config.false_positive_rate), + BloomFilter(config.expected_entries_per_chunk, + config.false_positive_rate), + BloomFilter(config.expected_entries_per_chunk, + config.false_positive_rate), + BloomFilter(config.expected_entries_per_chunk, + config.false_positive_rate), + }; + std::vector file_extra_blooms; + file_extra_blooms.reserve(extra_dim_names.size()); + for (std::size_t i = 0; i < extra_dim_names.size(); ++i) { + file_extra_blooms.emplace_back(config.expected_entries_per_chunk, + config.false_positive_rate); + } + + std::vector blob; + + for (std::size_t i = 0; i < chunks.size(); ++i) { + const auto& chunk = chunks[i]; + auto checkpoint_idx = static_cast(i); + + // Fixed blooms + for (std::size_t b = 0; b < BloomVisitor::BF_COUNT; ++b) { + const BloomFilter& bf = chunk.fixed_blooms[b]; + bf.serialize_into(blob); + db.insert_chunk_bloom_filter( + file_id, checkpoint_idx, std::string(FIXED_BLOOM_NAMES[b]), + std::span(blob.data(), blob.size()), + static_cast(bf.num_entries())); + file_fixed_blooms[b].merge_from(bf); + } + // Extra blooms + for (std::size_t e = 0; + e < extra_dim_names.size() && e < chunk.extra_blooms.size(); ++e) { + const BloomFilter& bf = 
chunk.extra_blooms[e]; + bf.serialize_into(blob); + db.insert_chunk_bloom_filter( + file_id, checkpoint_idx, extra_dim_names[e], + std::span(blob.data(), blob.size()), + static_cast(bf.num_entries())); + file_extra_blooms[e].merge_from(bf); + } + + db.insert_chunk_statistics(file_id, checkpoint_idx, chunk.statistics); + file_statistics.merge_from(chunk.statistics); + + // Fixed dim_stats + for (std::size_t d = 0; d < BloomVisitor::FD_COUNT; ++d) { + db.insert_chunk_dimension_stats(file_id, checkpoint_idx, + chunk.fixed_dim_stats[d], + config.value_counts_cap); + } + // Extra dim_stats + for (const auto& ds : chunk.extra_dim_stats) { + db.insert_chunk_dimension_stats(file_id, checkpoint_idx, ds, + config.value_counts_cap); + } + } + + // File-level blooms + for (std::size_t b = 0; b < BloomVisitor::BF_COUNT; ++b) { + const BloomFilter& bf = file_fixed_blooms[b]; + bf.serialize_into(blob); + db.insert_file_bloom_filter( + file_id, std::string(FIXED_BLOOM_NAMES[b]), + std::span(blob.data(), blob.size()), + static_cast(bf.num_entries())); + } + for (std::size_t e = 0; e < extra_dim_names.size(); ++e) { + const BloomFilter& bf = file_extra_blooms[e]; + bf.serialize_into(blob); + db.insert_file_bloom_filter( + file_id, extra_dim_names[e], + std::span(blob.data(), blob.size()), + static_cast(bf.num_entries())); + } + + for (std::size_t b = 0; b < BloomVisitor::BF_COUNT; ++b) { + db.insert_index_dimension(file_id, std::string(FIXED_BLOOM_NAMES[b])); + } + for (const auto& dim : extra_dim_names) { + db.insert_index_dimension(file_id, dim); + } + db.insert_index_dimension(file_id, std::string(DIM_TS)); + db.insert_index_dimension(file_id, std::string(DIM_DUR)); + + db.insert_file_scalar_stats(file_id, file_statistics, chunks.size()); + db.insert_file_category_counts(file_id, file_statistics.category_counts); + db.insert_file_name_counts(file_id, file_statistics.name_counts); + db.insert_file_pid_tid_counts(file_id, file_statistics.pid_tid_counts); + + // Name dictionary + 
postings. name_id is a pure FNV1a hash of the name + // so this is safe on any sink backend (RocksDB or SST). The dictionary + // entries are idempotent; duplicate inserts across workers are folded + // together at ingest time via `ingest_behind=true`. + std::unordered_map file_name_ids; + file_name_ids.reserve(file_statistics.name_counts.size()); + for (const auto& [name, _] : file_statistics.name_counts) { + const auto name_id = hash::fnv1a_hash(name); + file_name_ids.emplace(name, name_id); + db.insert_name_dictionary_entry(name_id, name); + db.insert_name_file_posting(name_id, file_id); + } + + for (std::size_t i = 0; i < chunks.size(); ++i) { + const auto checkpoint_idx = static_cast(i); + const auto& chunk = chunks[i]; + for (const auto& [name, _] : chunk.statistics.name_counts) { + auto name_id_it = file_name_ids.find(name); + if (name_id_it != file_name_ids.end()) { + db.insert_name_chunk_posting(name_id_it->second, file_id, + checkpoint_idx); + } + } + } + + return file_statistics; +} + +/// Concrete-only tail: root-summary refresh. Requires a read-through +/// (`has_file_scalar_stats`) and writes to the ROOT_* column families, which +/// are not yet covered by the distributed SST path. +void persist_bloom_concrete_tail( + indexer::IndexDatabaseWriterContext& db, int file_id, + const BloomVisitor::ChunkStatistics& file_statistics, + std::size_t num_chunks, bool refresh_root_summaries) { + if (!refresh_root_summaries) return; + const bool had_existing_file_summary = db.has_file_scalar_stats(file_id); + db.refresh_root_summaries_after_file_write( + file_id, file_statistics, num_chunks, had_existing_file_summary); +} + +} // namespace + +BloomVisitor::ChunkState::ChunkState() = default; + +BloomVisitor::BloomVisitor(ChunkIndexerConfig config, + std::vector dimensions) { + config_ = std::move(config); + // `dimensions` historically includes fixed + extras. Extract extras only + // (anything not matching a fixed bloom slot). 
+ for (auto& dim : dimensions) { + if (fixed_bloom_index(dim) < 0) { + extra_dim_names_.push_back(std::move(dim)); + } + } + // Also pick up config_.extra_dimensions (kept for backwards compat with + // callers that set extras there and pass only defaults in `dimensions`). + for (const auto& dim : config_.extra_dimensions) { + bool already = false; + for (const auto& e : extra_dim_names_) { + if (e == dim) { + already = true; + break; + } + } + if (!already && fixed_bloom_index(dim) < 0) { + extra_dim_names_.push_back(dim); + } + } +} + +void BloomVisitor::begin(std::size_t /*num_checkpoints*/) { + chunks_.clear(); + chunks_base_idx_ = 0; + file_acc_.extra_blooms.clear(); + file_acc_.statistics = ChunkStatistics{}; + file_acc_.num_chunks_emitted = 0; + file_acc_.initialized = false; +} + +void BloomVisitor::on_checkpoint(std::size_t /*checkpoint_idx*/) {} + +void BloomVisitor::ensure_chunk(std::size_t checkpoint_idx) { + if (checkpoint_idx < chunks_base_idx_) return; + const std::size_t local = checkpoint_idx - chunks_base_idx_; + if (local < chunks_.size()) return; + auto old_size = chunks_.size(); + chunks_.resize(local + 1); + for (std::size_t i = old_size; i < chunks_.size(); ++i) { + auto& chunk = chunks_[i]; + // Initialize fixed blooms with configured params. + for (std::size_t b = 0; b < BF_COUNT; ++b) { + chunk.fixed_blooms[b] = + BloomFilter(config_.expected_entries_per_chunk, + config_.false_positive_rate); + } + // Initialize fixed dim_stats metadata. + for (std::size_t d = 0; d < FD_COUNT; ++d) { + auto& ds = chunk.fixed_dim_stats[d]; + ds.dimension = std::string(FIXED_DIM_NAMES[d]); + ds.value_type = + (d == FD_PID || d == FD_TID || d == FD_TS || d == FD_DUR) + ? "uint" + : "string"; + } + // Initialize extras. 
+ chunk.extra_blooms.clear(); + chunk.extra_dim_stats.clear(); + chunk.extra_blooms.reserve(extra_dim_names_.size()); + chunk.extra_dim_stats.resize(extra_dim_names_.size()); + for (std::size_t e = 0; e < extra_dim_names_.size(); ++e) { + chunk.extra_blooms.emplace_back(config_.expected_entries_per_chunk, + config_.false_positive_rate); + chunk.extra_dim_stats[e].dimension = extra_dim_names_[e]; + chunk.extra_dim_stats[e].value_type = "string"; + } + } +} + +void BloomVisitor::on_event(const EventRecord& record) { + if (record.checkpoint_idx < chunks_base_idx_) return; + ensure_chunk(record.checkpoint_idx); + + const auto& ev = record.ev; + ChunkState& chunk = chunks_[record.checkpoint_idx - chunks_base_idx_]; + + if (ev.is_metadata()) { + if (record.has_args) { + std::string_view hash_val = dom_string(record.args_dom, "value"); + std::string_view resolved = dom_string(record.args_dom, "name"); + + if (!hash_val.empty() && !resolved.empty()) { + std::string_view dim; + if (ev.name == "HH") { + dim = DIM_HHASH; + } else if (ev.name == "FH") { + dim = DIM_FHASH; + } else if (ev.name == "SH") { + dim = DIM_SHASH; + } + if (!dim.empty()) { + // Outer StringViewMap: transparent find, emplace on miss. + auto outer_it = chunk.hash_resolutions.find(dim); + if (outer_it == chunk.hash_resolutions.end()) { + outer_it = chunk.hash_resolutions + .emplace(std::string(dim), + StringViewMap{}) + .first; + } + auto& inner = outer_it->second; + // Inner StringViewMap: find + emplace/update. + auto inner_it = inner.find(hash_val); + if (inner_it == inner.end()) { + inner.emplace(std::string(hash_val), + std::string(resolved)); + } else { + inner_it->second.assign(resolved.data(), + resolved.size()); + } + } + } + } + } else { + chunk.statistics.update_from_event(ev.name, ev.cat, ev.pid, ev.tid, + ev.ts, ev.dur); + + // Observe a fixed slot: adds to bloom (if bloom_idx >= 0) and to + // dim_stats. 
+ auto observe_fixed = [&chunk](int bloom_idx, std::size_t dim_idx, + std::string_view val) { + if (val.empty()) return; + if (bloom_idx >= 0) { + chunk.fixed_blooms[bloom_idx].add(val); + } + chunk.fixed_dim_stats[dim_idx].observe(val); + }; + + observe_fixed(BF_NAME, FD_NAME, ev.name); + observe_fixed(BF_CAT, FD_CAT, ev.cat); + + if (ev.pid != last_pid_ || last_pid_len_ == 0) { + auto [pp, _1] = std::to_chars( + last_pid_buf_, last_pid_buf_ + sizeof(last_pid_buf_), ev.pid); + last_pid_len_ = static_cast(pp - last_pid_buf_); + last_pid_ = ev.pid; + } + if (ev.tid != last_tid_ || last_tid_len_ == 0) { + auto [tp, _2] = std::to_chars( + last_tid_buf_, last_tid_buf_ + sizeof(last_tid_buf_), ev.tid); + last_tid_len_ = static_cast(tp - last_tid_buf_); + last_tid_ = ev.tid; + } + std::string_view pid_sv(last_pid_buf_, last_pid_len_); + std::string_view tid_sv(last_tid_buf_, last_tid_len_); + + observe_fixed(BF_PID, FD_PID, pid_sv); + observe_fixed(BF_TID, FD_TID, tid_sv); + + char pt_buf[52]; + std::memcpy(pt_buf, last_pid_buf_, last_pid_len_); + pt_buf[last_pid_len_] = ':'; + std::memcpy(pt_buf + last_pid_len_ + 1, last_tid_buf_, last_tid_len_); + std::string_view pt_sv(pt_buf, last_pid_len_ + 1 + last_tid_len_); + // pid_tid has no bloom slot — only dim_stats. 
+ observe_fixed(-1, FD_PID_TID, pt_sv); + + chunk.fixed_dim_stats[FD_TS].observe_range_only(ev.ts); + chunk.fixed_dim_stats[FD_DUR].observe_range_only(ev.dur); + + if (record.has_args) { + std::string_view hhash = dom_string(record.args_dom, "hhash"); + observe_fixed(BF_HHASH, FD_HHASH, hhash); + + std::string_view fhash = dom_string(record.args_dom, "fhash"); + observe_fixed(BF_FHASH, FD_FHASH, fhash); + + std::string_view shash = dom_string(record.args_dom, "cmd_hash"); + if (shash.empty()) { + shash = dom_string(record.args_dom, "exec_hash"); + } + observe_fixed(BF_SHASH, FD_SHASH, shash); + + std::string scratch; + for (std::size_t e = 0; e < extra_dim_names_.size(); ++e) { + auto r = record.args_dom[extra_dim_names_[e]]; + if (r.error()) continue; + if (dom_value_to_string(r.value_unsafe(), scratch) && + !scratch.empty()) { + chunk.extra_blooms[e].add(scratch); + chunk.extra_dim_stats[e].observe(scratch); + } + } + } + + chunk.events_processed++; + } +} + +std::unique_ptr BloomVisitor::create_parallel_slice() const { + std::vector dims; + dims.reserve(BloomVisitor::BF_COUNT + extra_dim_names_.size()); + for (auto sv : FIXED_BLOOM_NAMES) dims.emplace_back(sv); + for (const auto& d : extra_dim_names_) dims.push_back(d); + return std::make_unique(config_, std::move(dims)); +} + +void BloomVisitor::merge_parallel_slice(DftEventVisitor& slice_base) { + auto* slice = dynamic_cast(&slice_base); + if (!slice) return; + + for (std::size_t slice_i = chunks_base_idx_; + slice_i < slice->chunks_.size(); ++slice_i) { + auto& src = slice->chunks_[slice_i]; + if (src.events_processed == 0) continue; + const std::size_t parent_local = slice_i - chunks_base_idx_; + ensure_chunk(slice_i); + auto& dst = chunks_[parent_local]; + + for (std::size_t b = 0; b < BF_COUNT; ++b) { + dst.fixed_blooms[b].merge_from(src.fixed_blooms[b]); + } + for (std::size_t e = 0; + e < src.extra_blooms.size() && e < dst.extra_blooms.size(); ++e) { + dst.extra_blooms[e].merge_from(src.extra_blooms[e]); 
+ } + + for (std::size_t d = 0; d < FD_COUNT; ++d) { + auto& sds = src.fixed_dim_stats[d]; + auto& dds = dst.fixed_dim_stats[d]; + if (sds.value_counts) { + if (!dds.value_counts) dds.value_counts.emplace(); + for (const auto& [k, v] : *sds.value_counts) { + (*dds.value_counts)[k] += v; + } + dds.distinct_count = dds.value_counts->size(); + } + if (dds.min_value.empty() || + (!sds.min_value.empty() && sds.min_value < dds.min_value)) { + dds.min_value = sds.min_value; + } + if (sds.max_value > dds.max_value) { + dds.max_value = sds.max_value; + } + } + for (std::size_t e = 0; + e < src.extra_dim_stats.size() && e < dst.extra_dim_stats.size(); + ++e) { + auto& sds = src.extra_dim_stats[e]; + auto& dds = dst.extra_dim_stats[e]; + if (sds.value_counts) { + if (!dds.value_counts) dds.value_counts.emplace(); + for (const auto& [k, v] : *sds.value_counts) { + (*dds.value_counts)[k] += v; + } + dds.distinct_count = dds.value_counts->size(); + } + if (dds.min_value.empty() || + (!sds.min_value.empty() && sds.min_value < dds.min_value)) { + dds.min_value = sds.min_value; + } + if (sds.max_value > dds.max_value) { + dds.max_value = sds.max_value; + } + } + + dst.statistics.merge_from(src.statistics); + + for (auto& [dim, inner] : src.hash_resolutions) { + auto outer_it = dst.hash_resolutions.find(dim); + if (outer_it == dst.hash_resolutions.end()) { + dst.hash_resolutions.emplace(dim, std::move(inner)); + } else { + for (auto& [k, v] : inner) { + outer_it->second.try_emplace(k, std::move(v)); + } + } + } + + dst.events_processed += src.events_processed; + } +} + +void BloomVisitor::finalize(indexer::IndexDatabaseWriterContext& db, + int file_id) { + auto file_statistics = persist_bloom_sink_writes( + db, file_id, extra_dim_names_, chunks_, config_); + persist_bloom_concrete_tail(db, file_id, file_statistics, chunks_.size(), + /*refresh_root_summaries=*/true); +} + +void BloomVisitor::finalize_sink_only(indexer::IndexBatchSink& sink, + int file_id) { + 
flush_per_checkpoint_to_sink(sink, file_id); + finalize_file_to_sink(sink, file_id); +} + +void BloomVisitor::flush_per_checkpoint_to_sink(indexer::IndexBatchSink& sink, + int file_id) { + if (chunks_.empty()) return; + + if (!file_acc_.initialized) { + for (std::size_t b = 0; b < BF_COUNT; ++b) { + file_acc_.fixed_blooms[b] = + BloomFilter(config_.expected_entries_per_chunk, + config_.false_positive_rate); + } + file_acc_.extra_blooms.reserve(extra_dim_names_.size()); + for (std::size_t e = 0; e < extra_dim_names_.size(); ++e) { + file_acc_.extra_blooms.emplace_back( + config_.expected_entries_per_chunk, + config_.false_positive_rate); + } + file_acc_.initialized = true; + } + + std::vector blob; + for (std::size_t i = 0; i < chunks_.size(); ++i) { + const auto& chunk = chunks_[i]; + const auto checkpoint_idx = + static_cast(chunks_base_idx_ + i); + + for (std::size_t b = 0; b < BF_COUNT; ++b) { + const BloomFilter& bf = chunk.fixed_blooms[b]; + bf.serialize_into(blob); + sink.insert_chunk_bloom_filter( + file_id, checkpoint_idx, std::string(FIXED_BLOOM_NAMES[b]), + std::span(blob.data(), blob.size()), + static_cast(bf.num_entries())); + file_acc_.fixed_blooms[b].merge_from(bf); + } + for (std::size_t e = 0; + e < extra_dim_names_.size() && e < chunk.extra_blooms.size(); + ++e) { + const BloomFilter& bf = chunk.extra_blooms[e]; + bf.serialize_into(blob); + sink.insert_chunk_bloom_filter( + file_id, checkpoint_idx, extra_dim_names_[e], + std::span(blob.data(), blob.size()), + static_cast(bf.num_entries())); + file_acc_.extra_blooms[e].merge_from(bf); + } + + sink.insert_chunk_statistics(file_id, checkpoint_idx, chunk.statistics); + file_acc_.statistics.merge_from(chunk.statistics); + + for (std::size_t d = 0; d < FD_COUNT; ++d) { + sink.insert_chunk_dimension_stats(file_id, checkpoint_idx, + chunk.fixed_dim_stats[d], + config_.value_counts_cap); + } + for (const auto& ds : chunk.extra_dim_stats) { + sink.insert_chunk_dimension_stats(file_id, checkpoint_idx, ds, + 
config_.value_counts_cap); + } + + for (const auto& [name, _] : chunk.statistics.name_counts) { + const auto name_id = hash::fnv1a_hash(name); + sink.insert_name_chunk_posting(name_id, file_id, checkpoint_idx); + } + } + + file_acc_.num_chunks_emitted += chunks_.size(); + chunks_base_idx_ += chunks_.size(); + chunks_.clear(); +} + +void BloomVisitor::finalize_file_to_sink(indexer::IndexBatchSink& sink, + int file_id) { + if (!file_acc_.initialized && chunks_.empty()) return; + flush_per_checkpoint_to_sink(sink, file_id); + + std::vector blob; + for (std::size_t b = 0; b < BF_COUNT; ++b) { + const BloomFilter& bf = file_acc_.fixed_blooms[b]; + bf.serialize_into(blob); + sink.insert_file_bloom_filter( + file_id, std::string(FIXED_BLOOM_NAMES[b]), + std::span(blob.data(), blob.size()), + static_cast(bf.num_entries())); + } + for (std::size_t e = 0; e < extra_dim_names_.size(); ++e) { + const BloomFilter& bf = file_acc_.extra_blooms[e]; + bf.serialize_into(blob); + sink.insert_file_bloom_filter( + file_id, extra_dim_names_[e], + std::span(blob.data(), blob.size()), + static_cast(bf.num_entries())); + } + + for (std::size_t b = 0; b < BF_COUNT; ++b) { + sink.insert_index_dimension(file_id, std::string(FIXED_BLOOM_NAMES[b])); + } + for (const auto& dim : extra_dim_names_) { + sink.insert_index_dimension(file_id, dim); + } + sink.insert_index_dimension(file_id, std::string(DIM_TS)); + sink.insert_index_dimension(file_id, std::string(DIM_DUR)); + + sink.insert_file_scalar_stats(file_id, file_acc_.statistics, + file_acc_.num_chunks_emitted); + sink.insert_file_category_counts(file_id, + file_acc_.statistics.category_counts); + sink.insert_file_name_counts(file_id, file_acc_.statistics.name_counts); + sink.insert_file_pid_tid_counts(file_id, + file_acc_.statistics.pid_tid_counts); + + for (const auto& [name, _] : file_acc_.statistics.name_counts) { + const auto name_id = hash::fnv1a_hash(name); + sink.insert_name_dictionary_entry(name_id, name); + 
sink.insert_name_file_posting(name_id, file_id); + } +} + +} // namespace dftracer::utils::utilities::composites::dft::visitors diff --git a/src/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.cpp b/src/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.cpp new file mode 100644 index 00000000..e0a14297 --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.cpp @@ -0,0 +1,96 @@ +#include +#include + +namespace dftracer::utils::utilities::composites::dft::visitors { + +void HashTableVisitor::begin(std::size_t /*num_checkpoints*/) { + file_hashes_.clear(); + host_hashes_.clear(); + string_hashes_.clear(); + proc_metadata_.clear(); +} + +void HashTableVisitor::on_checkpoint(std::size_t /*checkpoint_idx*/) {} + +void HashTableVisitor::on_event(const EventRecord& record) { + const auto& ev = record.ev; + if (!ev.is_metadata()) { + return; + } + if (!record.has_args) { + return; + } + + auto dom_string = [](simdjson::dom::element obj, + std::string_view key) -> std::string_view { + auto r = obj[key]; + if (r.error()) return {}; + auto v = r.value_unsafe(); + if (!v.is_string()) return {}; + return v.get_string().value_unsafe(); + }; + + auto name_val = dom_string(record.args_dom, "name"); + auto hash_val = dom_string(record.args_dom, "value"); + + if (name_val.empty() || hash_val.empty()) { + return; + } + + if (ev.name == "FH") { + file_hashes_.try_emplace(std::string(hash_val), std::string(name_val)); + } else if (ev.name == "HH") { + host_hashes_.try_emplace(std::string(hash_val), std::string(name_val)); + } else if (ev.name == "SH") { + string_hashes_.try_emplace(std::string(hash_val), + std::string(name_val)); + } else if (ev.name == "PR") { + proc_metadata_.try_emplace(std::string(hash_val), + std::string(name_val)); + } +} + +std::unique_ptr HashTableVisitor::create_parallel_slice() + const { + return std::make_unique(); +} + +void HashTableVisitor::merge_parallel_slice(DftEventVisitor& 
slice_base) { + auto* slice = dynamic_cast(&slice_base); + if (!slice) return; + auto absorb = [](std::unordered_map& dst, + std::unordered_map& src) { + for (auto& [k, v] : src) { + dst.try_emplace(std::move(const_cast(k)), + std::move(v)); + } + }; + absorb(file_hashes_, slice->file_hashes_); + absorb(host_hashes_, slice->host_hashes_); + absorb(string_hashes_, slice->string_hashes_); + absorb(proc_metadata_, slice->proc_metadata_); +} + +void HashTableVisitor::finalize(indexer::IndexBatchSink& writer, + int /*file_id*/) { + auto write_entries = + [&writer](const std::unordered_map& entries, + HashType type) { + for (const auto& [hash, name] : entries) { + writer.insert_hash_table_entry(static_cast(type), + hash, name); + } + }; + + write_entries(file_hashes_, HashType::FILE); + write_entries(host_hashes_, HashType::HOST); + write_entries(string_hashes_, HashType::STRING); + write_entries(proc_metadata_, HashType::PROC); +} + +std::size_t HashTableVisitor::num_entries() const { + return file_hashes_.size() + host_hashes_.size() + string_hashes_.size() + + proc_metadata_.size(); +} + +} // namespace dftracer::utils::utilities::composites::dft::visitors diff --git a/src/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.cpp b/src/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.cpp new file mode 100644 index 00000000..1a92b3b9 --- /dev/null +++ b/src/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.cpp @@ -0,0 +1,128 @@ +#include +#include +#include +#include + +namespace dftracer::utils::utilities::composites::dft::visitors { + +void ManifestVisitor::begin(std::size_t /*num_checkpoints*/) { + event_lines_.clear(); + metadata_lines_.clear(); + observed_pids_.clear(); + event_count_ = 0; + line_offset_ = 0; + base_idx_ = 0; +} + +void ManifestVisitor::on_checkpoint(std::size_t /*checkpoint_idx*/) {} + +void ManifestVisitor::ensure_chunk(std::size_t checkpoint_idx) { + if (checkpoint_idx < base_idx_) return; + const 
std::size_t local = checkpoint_idx - base_idx_; + if (local < event_lines_.size()) return; + event_lines_.resize(local + 1); + metadata_lines_.resize(local + 1); +} + +void ManifestVisitor::on_event(const EventRecord& record) { + if (record.checkpoint_idx < base_idx_) return; + auto ln = static_cast(record.line_number); + ensure_chunk(record.checkpoint_idx); + ++event_count_; + + const auto local = record.checkpoint_idx - base_idx_; + const auto& ev = record.ev; + if (ev.is_metadata()) { + std::string name(ev.name); + if (!name.empty()) { + metadata_lines_[local][name].push_back(ln); + } + } else { + std::string cat(ev.cat); + std::string name(ev.name); + event_lines_[local][{cat, name}].push_back(ln); + observed_pids_.insert(ev.pid); + } +} + +std::unique_ptr ManifestVisitor::create_parallel_slice() + const { + return std::make_unique(); +} + +void ManifestVisitor::merge_parallel_slice(DftEventVisitor& slice_base) { + auto* slice = dynamic_cast(&slice_base); + if (!slice) return; + const auto offset = static_cast(slice->line_offset_); + + auto map_ci = [this](std::size_t slice_ci) -> std::size_t { + return slice_ci - base_idx_; + }; + for (std::size_t slice_ci = base_idx_; + slice_ci < slice->event_lines_.size(); ++slice_ci) { + if (slice->event_lines_[slice_ci].empty()) continue; + const std::size_t parent_local = map_ci(slice_ci); + if (parent_local >= event_lines_.size()) { + event_lines_.resize(parent_local + 1); + } + for (auto& [key, lines] : slice->event_lines_[slice_ci]) { + auto& dst = event_lines_[parent_local][key]; + dst.reserve(dst.size() + lines.size()); + for (auto ln : lines) dst.push_back(ln + offset); + } + } + for (std::size_t slice_ci = base_idx_; + slice_ci < slice->metadata_lines_.size(); ++slice_ci) { + if (slice->metadata_lines_[slice_ci].empty()) continue; + const std::size_t parent_local = map_ci(slice_ci); + if (parent_local >= metadata_lines_.size()) { + metadata_lines_.resize(parent_local + 1); + } + for (auto& [meta_type, lines] : 
slice->metadata_lines_[slice_ci]) { + auto& dst = metadata_lines_[parent_local][meta_type]; + dst.reserve(dst.size() + lines.size()); + for (auto ln : lines) dst.push_back(ln + offset); + } + } + for (auto pid : slice->observed_pids_) observed_pids_.insert(pid); + event_count_ += slice->event_count_; +} + +void ManifestVisitor::finalize(indexer::IndexBatchSink& db, int file_id) { + flush_per_checkpoint_to_sink(db, file_id); + finalize_file_to_sink(db, file_id); +} + +void ManifestVisitor::flush_per_checkpoint_to_sink( + indexer::IndexBatchSink& sink, int file_id) { + const std::size_t n = std::max(event_lines_.size(), metadata_lines_.size()); + for (std::size_t i = 0; i < n; ++i) { + const auto ci = static_cast(base_idx_ + i); + if (i < event_lines_.size()) { + for (auto& [key, lines] : event_lines_[i]) { + if (lines.empty()) continue; + sink.insert_event_range(file_id, ci, key.first, key.second, + lines); + } + } + if (i < metadata_lines_.size()) { + for (auto& [meta_type, lines] : metadata_lines_[i]) { + if (lines.empty()) continue; + sink.insert_metadata_lines(file_id, ci, meta_type, lines); + } + } + } + base_idx_ += n; + event_lines_.clear(); + metadata_lines_.clear(); +} + +void ManifestVisitor::finalize_file_to_sink(indexer::IndexBatchSink& sink, + int file_id) { + flush_per_checkpoint_to_sink(sink, file_id); + if (!observed_pids_.empty()) { + sink.insert_file_pids(file_id, observed_pids_); + } +} + +} // namespace dftracer::utils::utilities::composites::dft::visitors diff --git a/src/dftracer/utils/utilities/composites/streaming_file_merger_utility.cpp b/src/dftracer/utils/utilities/composites/streaming_file_merger_utility.cpp index 042f9357..8b257ada 100644 --- a/src/dftracer/utils/utilities/composites/streaming_file_merger_utility.cpp +++ b/src/dftracer/utils/utilities/composites/streaming_file_merger_utility.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -12,15 +13,7 @@ namespace dftracer::utils::utilities::composites { namespace { 
-// FNV-1a hash for byte-level verification -inline std::size_t fnv1a_line(const char* data, std::size_t len) { - std::size_t h = 14695981039346656037ULL; - for (std::size_t i = 0; i < len; ++i) { - h ^= static_cast(static_cast(data[i])); - h *= 1099511628211ULL; - } - return h; -} +namespace hash = dftracer::utils::utilities::hash; // Check if a line is an array delimiter ([ or ]) after trimming whitespace. inline bool is_array_delimiter(const char* data, std::size_t len) { @@ -102,7 +95,7 @@ StreamingFileProducerUtility::process_async( ++result.events_sent; if (input.verify) { - batch_hash += fnv1a_line(trimmed, trimmed_length); + batch_hash += hash::fnv1a_hash(trimmed, trimmed_length); } if (local_buf.size() >= batch_budget) { diff --git a/src/dftracer/utils/utilities/fileio/chunk_writer.cpp b/src/dftracer/utils/utilities/fileio/chunk_writer.cpp index 5a0f8d83..4ed91d08 100644 --- a/src/dftracer/utils/utilities/fileio/chunk_writer.cpp +++ b/src/dftracer/utils/utilities/fileio/chunk_writer.cpp @@ -1,10 +1,14 @@ #include #include #include +#include +#include #include #include #include +#include + namespace dftracer::utils::utilities::fileio { ChunkWriter::ChunkWriter(ChunkWriterConfig config) @@ -77,6 +81,13 @@ coro::CoroTask ChunkWriter::write_line(ByteView line) { chunk_index_++; co_await open_next_chunk(); } + + // Yield every 256 events to prevent stack overflow from synchronous + // coroutine completion chains. This is internal to ChunkWriter so + // callers don't need to manage yielding. 
+ if ((total_events_ & 0xff) == 0) { + co_await coro::yield(); + } } coro::CoroTask ChunkWriter::write_bytes(ByteView data) { @@ -93,11 +104,17 @@ coro::CoroTask ChunkWriter::flush_buffer() { if (write_buffer_.empty()) co_return; if (compressor_) { - auto gen = compressor_->compress( - ByteView(write_buffer_.data(), write_buffer_.size())); - while (auto chunk = co_await gen.next()) { - co_await io::write(fd_, chunk->as(), chunk->size()); - total_bytes_ += chunk->size(); + using GenType = coro::AsyncGenerator; + auto gen = std::make_unique(compressor_->compress( + ByteView(write_buffer_.data(), write_buffer_.size()))); + while (true) { + auto chunk = co_await gen->next(); + if (!chunk) break; + const char* data = chunk->as(); + std::size_t size = chunk->size(); + chunk.reset(); + co_await io::write(fd_, data, size); + total_bytes_ += size; } } else { co_await io::write(fd_, write_buffer_.data(), write_buffer_.size()); @@ -108,10 +125,17 @@ coro::CoroTask ChunkWriter::flush_buffer() { coro::CoroTask ChunkWriter::flush_raw(const char* data, std::size_t len) { if (compressor_) { - auto gen = compressor_->compress(ByteView(data, len)); - while (auto chunk = co_await gen.next()) { - co_await io::write(fd_, chunk->as(), chunk->size()); - total_bytes_ += chunk->size(); + using GenType = coro::AsyncGenerator; + auto gen = std::make_unique( + compressor_->compress(ByteView(data, len))); + while (true) { + auto chunk = co_await gen->next(); + if (!chunk) break; + const char* cdata = chunk->as(); + std::size_t csize = chunk->size(); + chunk.reset(); + co_await io::write(fd_, cdata, csize); + total_bytes_ += csize; } } else { co_await io::write(fd_, data, len); @@ -131,10 +155,16 @@ coro::CoroTask ChunkWriter::finalize_current_chunk() { } if (compressor_) { - auto fin = compressor_->finalize_stream(); - while (auto chunk = co_await fin.next()) { - co_await io::write(fd_, chunk->as(), chunk->size()); - total_bytes_ += chunk->size(); + using GenType = coro::AsyncGenerator; + auto 
fin = std::make_unique(compressor_->finalize_stream()); + while (true) { + auto chunk = co_await fin->next(); + if (!chunk) break; + const char* data = chunk->as(); + std::size_t size = chunk->size(); + chunk.reset(); + co_await io::write(fd_, data, size); + total_bytes_ += size; } compressor_.reset(); } @@ -142,12 +172,18 @@ coro::CoroTask ChunkWriter::finalize_current_chunk() { co_await io::close(fd_); fd_ = -1; + auto path = chunk_path(chunk_index_); chunks_.push_back(ChunkInfo{ - .path = chunk_path(chunk_index_), + .path = path, .bytes_written = current_chunk_bytes_, .events_written = current_chunk_events_, .chunk_index = chunk_index_, }); + + if (config_.on_chunk_complete) { + config_.on_chunk_complete(static_cast(chunk_index_), path, + current_chunk_events_, current_chunk_bytes_); + } } coro::CoroTask ChunkWriter::close() { diff --git a/src/dftracer/utils/utilities/fileio/parallel/layout.cpp b/src/dftracer/utils/utilities/fileio/parallel/layout.cpp new file mode 100644 index 00000000..ab1c3c73 --- /dev/null +++ b/src/dftracer/utils/utilities/fileio/parallel/layout.cpp @@ -0,0 +1,148 @@ +#include +#include +#include + +#ifdef __linux__ +#include +#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \ + defined(__NetBSD__) +#include +#include + +#include +#endif + +#include +#include +#include +#include + +#ifdef DFTRACER_UTILS_HAVE_LUSTREAPI +#include +#endif + +namespace dftracer::utils::utilities::fileio::parallel { + +namespace { + +#ifdef __linux__ +// From linux/magic.h; inlined to avoid a hard kernel-header dep. 
+constexpr unsigned long NFS_MAGIC = 0x6969; +constexpr unsigned long LUSTRE_MAGIC = 0x0BD00BD0; +constexpr unsigned long GPFS_MAGIC = 0x47504653; // "GPFS" +constexpr unsigned long BEEGFS_MAGIC = 0x19830326; + +FilesystemKind classify_magic(unsigned long magic) noexcept { + switch (magic) { + case NFS_MAGIC: + return FilesystemKind::NFS; + case LUSTRE_MAGIC: + return FilesystemKind::LUSTRE; + case GPFS_MAGIC: + return FilesystemKind::GPFS; + case BEEGFS_MAGIC: + return FilesystemKind::BEEGFS; + default: + return FilesystemKind::LOCAL; + } +} +#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \ + defined(__NetBSD__) +FilesystemKind classify_fstype(const char* fstype) noexcept { + if (fstype == nullptr) return FilesystemKind::LOCAL; + const std::string_view name(fstype); + if (name == "nfs") return FilesystemKind::NFS; + if (name == "lustre") return FilesystemKind::LUSTRE; + if (name == "gpfs") return FilesystemKind::GPFS; + if (name == "beegfs") return FilesystemKind::BEEGFS; + return FilesystemKind::LOCAL; +} +#endif + +std::string probe_path(const std::string& path) noexcept { + std::error_code ec; + if (fs::exists(path, ec)) return path; + auto parent = fs::path(path).parent_path(); + if (parent.empty()) return std::string("."); + if (fs::exists(parent, ec)) return parent.string(); + return std::string("."); +} + +void query_lustre_stripe(const std::string& probe, LayoutInfo& info) noexcept { +#ifdef DFTRACER_UTILS_HAVE_LUSTREAPI + // When the target file does not exist yet we fall back to the parent dir; + // the file inherits the directory's default stripe on creation. 
+ const std::size_t lum_size = + sizeof(struct lov_user_md) + + LOV_MAX_STRIPE_COUNT * sizeof(struct lov_user_ost_data_v1); + auto* raw = std::calloc(1, lum_size); + if (!raw) return; + auto* lum = reinterpret_cast(raw); + lum->lmm_magic = LOV_USER_MAGIC; + if (llapi_file_get_stripe(probe.c_str(), lum) == 0) { + info.stripe_size = static_cast(lum->lmm_stripe_size); + info.stripe_count = static_cast(lum->lmm_stripe_count); + } + std::free(raw); +#else + (void)probe; + (void)info; +#endif +} + +} // namespace + +LayoutInfo detect_layout(const std::string& path) noexcept { + LayoutInfo info{}; + info.layout = FileLayout::STRIPED; + info.fs = FilesystemKind::UNKNOWN; + info.stripe_size = 0; + info.stripe_count = 0; + + const auto target = probe_path(path); +#if defined(__linux__) + struct statfs st{}; + if (::statfs(target.c_str(), &st) != 0) { + return info; + } + info.fs = classify_magic(static_cast(st.f_type)); +#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \ + defined(__NetBSD__) + struct statfs st{}; + if (::statfs(target.c_str(), &st) != 0) { + return info; + } + info.fs = classify_fstype(st.f_fstypename); +#else + (void)target; +#endif + if (info.fs == FilesystemKind::NFS) { + info.layout = FileLayout::SHARDED; + } + if (info.fs == FilesystemKind::LUSTRE) { + query_lustre_stripe(target, info); + } + return info; +} + +WriterSizing compute_writer_sizing(const LayoutInfo& info, + std::size_t baseline_workers, + std::size_t default_flush_bytes, + std::size_t buffer_headroom_bytes, + bool padded_layout) noexcept { + WriterSizing s{}; + s.num_workers = baseline_workers == 0 ? 1 : baseline_workers; + if (!padded_layout && info.stripe_count > 0) { + s.num_workers = std::min(s.num_workers, info.stripe_count); + } + if (padded_layout && info.stripe_size > 0) { + // Uncompressed flush sized to one stripe; compressed fits easily. 
+ s.flush_threshold = info.stripe_size; + } else { + s.flush_threshold = std::max(default_flush_bytes, info.stripe_size); + } + s.buffer_capacity = s.flush_threshold + buffer_headroom_bytes; + return s; +} + +} // namespace dftracer::utils::utilities::fileio::parallel diff --git a/src/dftracer/utils/utilities/fileio/parallel/merge.cpp b/src/dftracer/utils/utilities/fileio/parallel/merge.cpp new file mode 100644 index 00000000..d4d733c0 --- /dev/null +++ b/src/dftracer/utils/utilities/fileio/parallel/merge.cpp @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include + +#include + +namespace dftracer::utils::utilities::fileio::parallel { + +namespace { + +constexpr std::size_t COPY_BUFFER_BYTES = 256 * 1024; + +coro::CoroTask stream_shard_to_fd(int out_fd, const std::string& shard) { + ssize_t in_fd = + co_await ::dftracer::utils::io::open(shard.c_str(), O_RDONLY, 0); + if (in_fd < 0) { + DFTRACER_UTILS_LOG_ERROR("merge_shards: failed to open shard: %s", + shard.c_str()); + co_return -1; + } + std::vector buf(COPY_BUFFER_BYTES); + while (true) { + auto n = co_await ::dftracer::utils::io::read(static_cast(in_fd), + buf.data(), buf.size()); + if (n == 0) break; + if (n < 0) { + co_await ::dftracer::utils::io::close(static_cast(in_fd)); + DFTRACER_UTILS_LOG_ERROR("merge_shards: read failed on %s", + shard.c_str()); + co_return -1; + } + std::size_t remaining = static_cast(n); + const char* ptr = buf.data(); + while (remaining > 0) { + auto w = + co_await ::dftracer::utils::io::write(out_fd, ptr, remaining); + if (w <= 0) { + co_await ::dftracer::utils::io::close(static_cast(in_fd)); + DFTRACER_UTILS_LOG_ERROR( + "merge_shards: write failed while draining %s", + shard.c_str()); + co_return -1; + } + remaining -= static_cast(w); + ptr += w; + } + } + co_await ::dftracer::utils::io::close(static_cast(in_fd)); + co_return 0; +} + +} // namespace + +coro::CoroTask merge_shards(const std::string& target, + const std::vector& shards) { + if (shards.empty()) co_return 
0; + + ssize_t out_fd = co_await ::dftracer::utils::io::open( + target.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (out_fd < 0) { + DFTRACER_UTILS_LOG_ERROR("merge_shards: failed to open target: %s", + target.c_str()); + co_return -1; + } + + for (const auto& shard : shards) { + if (co_await stream_shard_to_fd(static_cast(out_fd), shard) != 0) { + co_await ::dftracer::utils::io::close(static_cast(out_fd)); + co_return -1; + } + } + + co_await ::dftracer::utils::io::close(static_cast(out_fd)); + + for (const auto& shard : shards) { + ::unlink(shard.c_str()); + } + co_return 0; +} + +} // namespace dftracer::utils::utilities::fileio::parallel diff --git a/src/dftracer/utils/utilities/fileio/parallel/padded_striped_writer.cpp b/src/dftracer/utils/utilities/fileio/parallel/padded_striped_writer.cpp new file mode 100644 index 00000000..8f420285 --- /dev/null +++ b/src/dftracer/utils/utilities/fileio/parallel/padded_striped_writer.cpp @@ -0,0 +1,328 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::fileio::parallel { + +namespace { + +// FEXTRA-only gzip padding member layout (RFC 1952): +// hdr(10) + xlen(2) + xlen FEXTRA bytes + empty_stored_block(5) + trailer(8) +constexpr std::size_t PAD_MEMBER_FIXED_OVERHEAD = 25; +constexpr std::size_t PAD_MEMBER_MAX_SIZE = PAD_MEMBER_FIXED_OVERHEAD + 65535; + +// Worst-case channel depth. One per worker is usually enough; we allow a +// little slack so bursts don't block producers. +constexpr std::size_t CHUNK_CHANNEL_CAPACITY = 64; + +// Bound on concurrent stripe pwrites. Each stripe routes to a different OST +// (offset = (idx+1) * stripe_size, OST = (idx+1) % stripe_count), so this +// keeps multiple OSTs busy without unbounded outstanding I/O. +constexpr std::size_t MAX_INFLIGHT_PWRITES = 16; + +// Append a FEXTRA-only padding member (decompresses to zero bytes). 
+void append_padding_member(std::vector& out, std::uint16_t xlen) { + const std::size_t start = out.size(); + out.resize(start + PAD_MEMBER_FIXED_OVERHEAD + xlen); + std::uint8_t* p = out.data() + start; + + p[0] = 0x1f; + p[1] = 0x8b; + p[2] = 0x08; // CM = deflate + p[3] = 0x04; // FLG = FEXTRA + p[4] = p[5] = p[6] = p[7] = 0; // MTIME + p[8] = 0; // XFL + p[9] = 0xff; // OS = unknown + + p[10] = static_cast(xlen & 0xff); + p[11] = static_cast((xlen >> 8) & 0xff); + std::memset(p + 12, 0, xlen); + + // Empty deflate stored block: BFINAL=1, BTYPE=00, LEN=0, NLEN=0xffff. + p[12 + xlen + 0] = 0x01; + p[12 + xlen + 1] = 0x00; + p[12 + xlen + 2] = 0x00; + p[12 + xlen + 3] = 0xff; + p[12 + xlen + 4] = 0xff; + + // Trailer: CRC32=0, ISIZE=0. + std::memset(p + 12 + xlen + 5, 0, 8); +} + +// Fill `out` with padding members until its size reaches exactly stripe_size. +void pad_to_stripe(std::vector& out, std::size_t stripe_size) { + while (stripe_size - out.size() >= PAD_MEMBER_MAX_SIZE) { + append_padding_member(out, 65535); + } + std::size_t remaining = stripe_size - out.size(); + if (remaining >= PAD_MEMBER_FIXED_OVERHEAD) { + append_padding_member(out, static_cast( + remaining - PAD_MEMBER_FIXED_OVERHEAD)); + } + // < 25 bytes leftover is dropped; next slot still starts at the + // declared stripe offset. 
+} + +class PaddedStripedWriter : public ParallelWriter { + public: + struct Chunk { + std::vector data; + std::shared_ptr> ack; + }; + + explicit PaddedStripedWriter(std::size_t stripe_size) + : stripe_size_(stripe_size) {} + + coro::CoroTask open(std::string path, std::size_t num_workers, + bool /*gzip_extension*/, + CoroScope* scope) override { + if (!scope) { + DFTRACER_UTILS_LOG_ERROR( + "PaddedStripedWriter requires a CoroScope to spawn its packer"); + co_return -1; + } + path_ = std::move(path); + ssize_t fd = co_await ::dftracer::utils::io::open( + path_.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + DFTRACER_UTILS_LOG_ERROR("Failed to open padded output: %s", + path_.c_str()); + co_return -1; + } + fd_ = static_cast(fd); + next_stripe_idx_.store(0, std::memory_order_relaxed); + per_worker_last_.assign(num_workers, std::nullopt); + + // Valid-gzip placeholder so callers that skip write_header still + // produce a file gunzip can walk. + std::vector pad; + pad.reserve(stripe_size_); + pad_to_stripe(pad, stripe_size_); + if (co_await pwrite_bytes(pad.data(), pad.size(), 0) != 0) { + co_return -1; + } + + // Set up the chunk channel + N pre-registered producers (one per + // worker). The packer exits once all producers are released. 
+ channel_ = coro::make_channel(CHUNK_CHANNEL_CAPACITY); + producers_.reserve(num_workers); + for (std::size_t i = 0; i < num_workers; ++i) { + producers_.emplace_back(channel_->producer()); + } + + packer_future_ = + scope->spawn([this, consumer = channel_->consumer()]( + CoroScope& s) mutable -> coro::CoroTask { + co_return co_await run_packer(s, std::move(consumer)); + }); + co_return 0; + } + + coro::CoroTask write_header(ByteView data) override { + if (data.size() + PAD_MEMBER_FIXED_OVERHEAD > stripe_size_) { + DFTRACER_UTILS_LOG_ERROR( + "padded writer: header %zu + pad overhead exceeds stripe %zu", + data.size(), stripe_size_); + co_return -1; + } + std::vector buf; + buf.reserve(stripe_size_); + buf.insert( + buf.end(), reinterpret_cast(data.data()), + reinterpret_cast(data.data()) + data.size()); + pad_to_stripe(buf, stripe_size_); + co_return co_await pwrite_bytes(buf.data(), buf.size(), 0); + } + + coro::CoroTask write_chunk(std::size_t worker_idx, + ByteView data) override { + if (worker_idx >= producers_.size()) { + DFTRACER_UTILS_LOG_ERROR("padded writer: worker_idx %zu >= %zu", + worker_idx, producers_.size()); + co_return -1; + } + if (data.size() + PAD_MEMBER_FIXED_OVERHEAD > stripe_size_) { + DFTRACER_UTILS_LOG_ERROR( + "padded writer: chunk %zu + pad overhead exceeds stripe %zu", + data.size(), stripe_size_); + co_return -1; + } + Chunk c; + c.data.assign( + reinterpret_cast(data.data()), + reinterpret_cast(data.data()) + data.size()); + c.ack = coro::make_channel(1); + auto ack = c.ack; + bool ok = co_await producers_[worker_idx].send(std::move(c)); + if (!ok) co_return -1; + auto span = co_await ack->receive(); + if (!span) co_return -1; + per_worker_last_[worker_idx] = *span; + co_return 0; + } + + coro::CoroTask write_footer(ByteView data) override { + if (co_await drain_packer() != 0) co_return -1; + const auto stripes = next_stripe_idx_.load(std::memory_order_relaxed); + const auto offset = (stripes + 1) * stripe_size_; // +1 for header + 
co_return co_await pwrite_all(data, static_cast(offset)); + } + + coro::CoroTask close() override { + if (co_await drain_packer() != 0) { + if (fd_ >= 0) { + co_await ::dftracer::utils::io::close(fd_); + fd_ = -1; + } + co_return -1; + } + if (fd_ < 0) co_return 0; + auto rc = co_await ::dftracer::utils::io::close(fd_); + fd_ = -1; + co_return static_cast(rc); + } + + std::vector output_paths() const override { return {path_}; } + + std::optional last_member( + std::size_t worker_idx) const override { + if (worker_idx >= per_worker_last_.size()) return std::nullopt; + return per_worker_last_[worker_idx]; + } + + private: + // Drop all producer slots so the channel reports EOF to the packer, then + // wait for the packer to emit its final stripe. Safe to call twice. + coro::CoroTask drain_packer() { + if (!packer_drained_) { + producers_.clear(); + if (packer_future_.has_value()) { + auto rc = co_await *packer_future_; + packer_future_.reset(); + if (rc != 0) co_return rc; + } + packer_drained_ = true; + } + co_return 0; + } + + coro::CoroTask run_packer(CoroScope& parent_scope, + coro::ChannelConsumer consumer) { + std::vector buf; + buf.reserve(stripe_size_); + std::uint64_t current_stripe_idx = + next_stripe_idx_.fetch_add(1, std::memory_order_relaxed); + + std::deque> in_flight; + int final_rc = 0; + + auto await_one = [&]() -> coro::CoroTask { + auto f = std::move(in_flight.front()); + in_flight.pop_front(); + auto rc = co_await std::move(f); + if (rc != 0 && final_rc == 0) final_rc = rc; + }; + + auto launch_emit = [&](std::vector&& payload, + std::uint64_t emit_idx) -> coro::CoroTask { + while (in_flight.size() >= MAX_INFLIGHT_PWRITES) { + co_await await_one(); + } + in_flight.push_back(parent_scope.spawn( + [this, p = std::move(payload), + emit_idx](CoroScope&) mutable -> coro::CoroTask { + pad_to_stripe(p, stripe_size_); + const auto offset = (emit_idx + 1) * stripe_size_; + co_return co_await pwrite_bytes(p.data(), p.size(), + static_cast(offset)); + })); 
+ co_return; + }; + + while (auto chunk = co_await consumer.receive()) { + if (!buf.empty() && + buf.size() + chunk->data.size() + PAD_MEMBER_FIXED_OVERHEAD > + stripe_size_) { + co_await launch_emit(std::move(buf), current_stripe_idx); + buf.clear(); + buf.reserve(stripe_size_); + current_stripe_idx = + next_stripe_idx_.fetch_add(1, std::memory_order_relaxed); + } + if (chunk->ack) { + MemberSpan span{ + (current_stripe_idx + 1) * stripe_size_ + buf.size(), + static_cast(chunk->data.size())}; + co_await chunk->ack->send(std::move(span)); + chunk->ack->close(); + } + buf.insert(buf.end(), chunk->data.begin(), chunk->data.end()); + } + if (!buf.empty()) { + co_await launch_emit(std::move(buf), current_stripe_idx); + } + + while (!in_flight.empty()) { + co_await await_one(); + } + co_return final_rc; + } + + coro::CoroTask pwrite_all(ByteView data, off_t offset) { + co_return co_await pwrite_bytes( + reinterpret_cast(data.data()), data.size(), + offset); + } + + coro::CoroTask pwrite_bytes(const std::uint8_t* bytes, + std::size_t size, off_t offset) { + if (size == 0) co_return 0; + std::size_t written = 0; + while (written < size) { + auto n = co_await ::dftracer::utils::io::pwrite( + fd_, bytes + written, size - written, + offset + static_cast(written)); + if (n <= 0) { + DFTRACER_UTILS_LOG_ERROR( + "padded writer pwrite failed at %lld on %s", + static_cast(offset), path_.c_str()); + co_return -1; + } + written += static_cast(n); + } + co_return 0; + } + + std::string path_; + int fd_ = -1; + std::size_t stripe_size_; + std::atomic next_stripe_idx_{0}; + + std::shared_ptr> channel_; + std::vector> producers_; + std::optional> packer_future_; + bool packer_drained_ = false; + std::vector> per_worker_last_; +}; + +} // namespace + +std::unique_ptr make_padded_striped_writer( + std::size_t stripe_size) { + return std::make_unique(stripe_size); +} + +} // namespace dftracer::utils::utilities::fileio::parallel diff --git 
a/src/dftracer/utils/utilities/fileio/parallel/sharded_writer.cpp b/src/dftracer/utils/utilities/fileio/parallel/sharded_writer.cpp new file mode 100644 index 00000000..f7388785 --- /dev/null +++ b/src/dftracer/utils/utilities/fileio/parallel/sharded_writer.cpp @@ -0,0 +1,135 @@ +#include +#include +#include +#include +#include + +#include + +namespace dftracer::utils::utilities::fileio::parallel { + +namespace { + +class ShardedWriter final : public ParallelWriter { + public: + coro::CoroTask open(std::string path, std::size_t num_workers, + bool gzip_extension, + CoroScope* /*scope*/) override { + base_path_ = std::move(path); + const std::string ext = gzip_extension ? ".gz" : ""; + shard_paths_.resize(num_workers); + shard_fds_.assign(num_workers, -1); + shard_offsets_.assign(num_workers, 0); + per_worker_last_.assign(num_workers, std::nullopt); + for (std::size_t i = 0; i < num_workers; ++i) { + shard_paths_[i] = base_path_ + ".shard_" + std::to_string(i) + ext; + ssize_t fd = co_await ::dftracer::utils::io::open( + shard_paths_[i].c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + DFTRACER_UTILS_LOG_ERROR("Failed to open shard: %s", + shard_paths_[i].c_str()); + co_return -1; + } + shard_fds_[i] = static_cast(fd); + } + co_return 0; + } + + coro::CoroTask write_header(ByteView data) override { + if (shard_fds_.empty()) co_return -1; + auto rc = co_await write_all(shard_fds_.front(), data); + if (rc == 0) shard_offsets_.front() += data.size(); + co_return rc; + } + + coro::CoroTask write_chunk(std::size_t worker_idx, + ByteView data) override { + if (worker_idx >= shard_fds_.size()) co_return -1; + const auto base = shard_offsets_[worker_idx]; + auto rc = co_await write_all(shard_fds_[worker_idx], data); + if (rc != 0) co_return rc; + shard_offsets_[worker_idx] += data.size(); + per_worker_last_[worker_idx] = MemberSpan{base, data.size()}; + co_return 0; + } + + coro::CoroTask write_footer(ByteView data) override { + if (shard_fds_.empty()) 
co_return -1; + auto rc = co_await write_all(shard_fds_.back(), data); + if (rc == 0) shard_offsets_.back() += data.size(); + co_return rc; + } + + coro::CoroTask close() override { + int status = 0; + for (auto& fd : shard_fds_) { + if (fd < 0) continue; + auto rc = co_await ::dftracer::utils::io::close(fd); + if (rc < 0) status = -1; + fd = -1; + } + co_return status; + } + + std::vector output_paths() const override { + return shard_paths_; + } + + std::optional last_member( + std::size_t worker_idx) const override { + if (worker_idx >= per_worker_last_.size()) return std::nullopt; + return per_worker_last_[worker_idx]; + } + + std::vector shard_base_offsets() const override { + std::vector bases(shard_offsets_.size(), 0); + std::uint64_t accum = 0; + for (std::size_t i = 0; i < shard_offsets_.size(); ++i) { + bases[i] = accum; + accum += shard_offsets_[i]; + } + return bases; + } + + private: + coro::CoroTask write_all(int fd, ByteView data) { + if (data.size() == 0) co_return 0; + const auto* bytes = reinterpret_cast(data.data()); + std::size_t written = 0; + while (written < data.size()) { + auto n = co_await ::dftracer::utils::io::write( + fd, bytes + written, data.size() - written); + if (n <= 0) { + DFTRACER_UTILS_LOG_ERROR("write failed on shard fd=%d", fd); + co_return -1; + } + written += static_cast(n); + } + co_return 0; + } + + std::string base_path_; + std::vector shard_paths_; + std::vector shard_fds_; + std::vector shard_offsets_; + std::vector> per_worker_last_; +}; + +} // namespace + +std::unique_ptr make_sharded_writer() { + return std::make_unique(); +} + +std::unique_ptr make_writer(const WriterConfig& cfg) { + if (cfg.layout == FileLayout::SHARDED) return make_sharded_writer(); + // Padded striped needs gzip and a large-enough stripe to guarantee a + // compressed flush fits one slot. Below the minimum, fall back to the + // atomic-byte-offset writer. 
+ if (cfg.gzip && cfg.stripe_size >= MIN_PADDED_STRIPE_BYTES) { + return make_padded_striped_writer(cfg.stripe_size); + } + return make_striped_writer(); +} + +} // namespace dftracer::utils::utilities::fileio::parallel diff --git a/src/dftracer/utils/utilities/fileio/parallel/striped_writer.cpp b/src/dftracer/utils/utilities/fileio/parallel/striped_writer.cpp new file mode 100644 index 00000000..7d04dcce --- /dev/null +++ b/src/dftracer/utils/utilities/fileio/parallel/striped_writer.cpp @@ -0,0 +1,147 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace dftracer::utils::utilities::fileio::parallel { + +namespace { + +class StripedWriter final : public ParallelWriter { + public: + coro::CoroTask open(std::string path, std::size_t num_workers, + bool /*gzip_extension*/, + CoroScope* /*scope*/) override { + path_ = std::move(path); + ssize_t fd = co_await ::dftracer::utils::io::open( + path_.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + DFTRACER_UTILS_LOG_ERROR("Failed to open striped output: %s", + path_.c_str()); + co_return -1; + } + fd_ = static_cast(fd); + offset_.store(0, std::memory_order_relaxed); + per_worker_layout_.assign(std::max(num_workers, 1), + std::vector{}); + merged_layout_.clear(); + merged_layout_built_ = false; + co_return 0; + } + + coro::CoroTask write_header(ByteView data) override { + co_return co_await pwrite_all(data); + } + + coro::CoroTask write_chunk(std::size_t worker_idx, + ByteView data) override { + if (data.size() == 0) co_return 0; + const auto base = + offset_.fetch_add(data.size(), std::memory_order_relaxed); + // Each worker is sequential (one write_chunk in flight per worker), so + // no lock needed when appending to its own bucket. 
+ if (worker_idx < per_worker_layout_.size()) { + per_worker_layout_[worker_idx].push_back({base, data.size()}); + } + const auto* bytes = reinterpret_cast(data.data()); + std::size_t written = 0; + while (written < data.size()) { + auto n = co_await ::dftracer::utils::io::pwrite( + fd_, bytes + written, data.size() - written, + static_cast(base + written)); + if (n <= 0) { + DFTRACER_UTILS_LOG_ERROR("pwrite failed on %s (offset=%llu)", + path_.c_str(), + static_cast(base)); + co_return -1; + } + written += static_cast(n); + } + co_return 0; + } + + coro::CoroTask write_footer(ByteView data) override { + co_return co_await pwrite_all(data); + } + + coro::CoroTask close() override { + if (fd_ < 0) co_return 0; + auto rc = co_await ::dftracer::utils::io::close(fd_); + fd_ = -1; + co_return static_cast(rc); + } + + std::vector output_paths() const override { return {path_}; } + + std::optional last_member( + std::size_t worker_idx) const override { + if (worker_idx >= per_worker_layout_.size()) return std::nullopt; + const auto& v = per_worker_layout_[worker_idx]; + if (v.empty()) return std::nullopt; + return v.back(); + } + + std::span member_layout() const override { + // Lazy merge after close: per-worker vectors -> single offset-sorted + // vector. Caller contract: only invoked after `close()`, no concurrent + // writers. 
+ if (!merged_layout_built_) { + std::size_t total = 0; + for (const auto& v : per_worker_layout_) total += v.size(); + merged_layout_.clear(); + merged_layout_.reserve(total); + for (const auto& v : per_worker_layout_) { + merged_layout_.insert(merged_layout_.end(), v.begin(), v.end()); + } + std::sort(merged_layout_.begin(), merged_layout_.end(), + [](const MemberSpan& a, const MemberSpan& b) { + return a.offset < b.offset; + }); + merged_layout_built_ = true; + } + return std::span(merged_layout_); + } + + private: + coro::CoroTask pwrite_all(ByteView data) { + if (data.size() == 0) co_return 0; + const auto base = + offset_.fetch_add(data.size(), std::memory_order_relaxed); + const auto* bytes = reinterpret_cast(data.data()); + std::size_t written = 0; + while (written < data.size()) { + auto n = co_await ::dftracer::utils::io::pwrite( + fd_, bytes + written, data.size() - written, + static_cast(base + written)); + if (n <= 0) { + DFTRACER_UTILS_LOG_ERROR("pwrite failed on %s (offset=%llu)", + path_.c_str(), + static_cast(base)); + co_return -1; + } + written += static_cast(n); + } + co_return 0; + } + + std::string path_; + int fd_ = -1; + std::atomic offset_{0}; + std::vector> per_worker_layout_; + mutable std::vector merged_layout_; + mutable bool merged_layout_built_ = false; +}; + +} // namespace + +std::unique_ptr make_striped_writer() { + return std::make_unique(); +} + +} // namespace dftracer::utils::utilities::fileio::parallel diff --git a/src/dftracer/utils/utilities/indexer/index_builder_utility.cpp b/src/dftracer/utils/utilities/indexer/index_builder_utility.cpp index 97b15098..f254a92b 100644 --- a/src/dftracer/utils/utilities/indexer/index_builder_utility.cpp +++ b/src/dftracer/utils/utilities/indexer/index_builder_utility.cpp @@ -1,30 +1,37 @@ +#include #include #include +#include #include -#include +#include +#include #include +#include +#include +#include #include +#include #include #include +#include +#include #include +#include #include 
#include #include -#include -#include +#include #include #include namespace dftracer::utils::utilities::indexer { using composites::dft::internal::determine_index_path; +using composites::dft::visitors::BloomVisitor; +using composites::dft::visitors::HashTableVisitor; +using composites::dft::visitors::ManifestVisitor; using internal::IndexerFactory; -namespace rocks = dftracer::utils::rocksdb; - -// --------------------------------------------------------------------------- -// IndexBuildConfig builder methods -// --------------------------------------------------------------------------- IndexBuildConfig IndexBuildConfig::for_file(const std::string& path) { IndexBuildConfig cfg; @@ -42,22 +49,11 @@ IndexBuildConfig& IndexBuildConfig::with_checkpoint_size(std::size_t size) { return *this; } -IndexBuildConfig& IndexBuildConfig::with_index_threshold( - std::size_t threshold) { - index_threshold = threshold; - return *this; -} - IndexBuildConfig& IndexBuildConfig::with_force_rebuild(bool force) { force_rebuild = force; return *this; } -IndexBuildConfig& IndexBuildConfig::with_bloom(bool enable) { - build_bloom = enable; - return *this; -} - IndexBuildConfig& IndexBuildConfig::with_manifest(bool enable) { build_manifest = enable; return *this; @@ -75,8 +71,8 @@ IndexBuildConfig& IndexBuildConfig::with_bloom_dimensions( return *this; } -coro::CoroTask IndexBuilderUtility::process( - const IndexBuildConfig& config) { +static coro::CoroTask run_index_build( + IndexBuildConfig config) { IndexBuildResult result; result.file_path = config.file_path; @@ -85,15 +81,9 @@ coro::CoroTask IndexBuilderUtility::process( determine_index_path(config.file_path, config.index_dir); result.index_path = index_path; - // Check compressed file size against threshold (0 = always index). 
- std::uintmax_t file_sz = 0; - if (fs::exists(config.file_path)) { - file_sz = fs::file_size(config.file_path); - } - const bool below_threshold = - config.index_threshold != 0 && file_sz < config.index_threshold; - +#if DFTRACER_UTILS_LOGGER_LEVEL_DEBUG auto build_start = std::chrono::steady_clock::now(); +#endif auto indexer = IndexerFactory::create( config.file_path, index_path, @@ -106,15 +96,15 @@ coro::CoroTask IndexBuilderUtility::process( co_return result; } - // NOTE(perf): compute need_rebuild once: used for both skip decision - // and checkpoints_valid below. Avoids duplicate fingerprint check. - bool idx_exists = !below_threshold && indexer->exists(); + bool idx_exists = indexer->exists(); bool needs_rebuild = idx_exists ? indexer->need_rebuild() : true; - // Skip if index exists, is current, and all requested features present. - if (idx_exists && !config.force_rebuild && !needs_rebuild) { + // Skip if index exists, is current, all features present, + // and no extra visitors need to run. + if (idx_exists && !config.force_rebuild && !needs_rebuild && + config.extra_dft_visitors.empty()) { auto logical = internal::get_logical_path(config.file_path); - bool bloom_ok = !config.build_bloom || [&] { + bool bloom_ok = [&] { try { IndexDatabase db(index_path, dftracer::utils::rocksdb::RocksDatabase:: @@ -138,8 +128,8 @@ coro::CoroTask IndexBuilderUtility::process( }(); if (bloom_ok && manifest_ok) { - DFTRACER_UTILS_LOG_INFO("Skipping already-indexed file: %s", - config.file_path.c_str()); + DFTRACER_UTILS_LOG_DEBUG("Skipping already-indexed file: %s", + config.file_path.c_str()); result.success = true; result.was_skipped = true; result.index_created = true; @@ -147,33 +137,39 @@ coro::CoroTask IndexBuilderUtility::process( } } - // Resolve effective bloom dimensions. std::vector dims = - (config.build_bloom && config.bloom_dimensions.empty()) - ? default_bloom_dimensions() + config.bloom_dimensions.empty() + ? 
std::vector(DEFAULT_BLOOM_DIMENSIONS.begin(), + DEFAULT_BLOOM_DIMENSIONS.end()) : config.bloom_dimensions; - // Construct visitors. - std::optional bloom_visitor; + // Construct DFT event visitors and wrap in single dispatcher. + BloomVisitor bloom_visitor(config.bloom_config, dims); + HashTableVisitor hash_table_visitor; std::optional manifest_visitor; - internal::Indexer::VisitorList visitor_list; - if (config.build_bloom) { - bloom_visitor.emplace(config.bloom_config, dims); - visitor_list.emplace_back(*bloom_visitor); - } + composites::dft::DftEventDispatcher::VisitorList dft_visitors; + dft_visitors.emplace_back(bloom_visitor); + dft_visitors.emplace_back(hash_table_visitor); if (config.build_manifest) { manifest_visitor.emplace(); - visitor_list.emplace_back(*manifest_visitor); + dft_visitors.emplace_back(*manifest_visitor); + } + for (auto& extra : config.extra_dft_visitors) { + dft_visitors.emplace_back(extra); } + composites::dft::DftEventDispatcher dispatcher(std::move(dft_visitors)); + internal::Indexer::VisitorList visitor_list; + visitor_list.emplace_back(dispatcher); + // Decide whether checkpoints need rebuilding. // Reuses the need_rebuild result computed above. bool checkpoints_valid = !config.force_rebuild && idx_exists && !needs_rebuild; if (checkpoints_valid && !visitor_list.empty()) { - // Checkpoints exist — only need a streaming pass for visitors. + // Checkpoints exist, only need a streaming pass for visitors. 
using fileio::lines::sources::async_streaming_gz_lines; for (auto& v : visitor_list) { v.get().begin(0); @@ -193,12 +189,17 @@ coro::CoroTask IndexBuilderUtility::process( } ckpt_idx = new_ckpt; } + auto buffer = std::make_shared(line.content); + std::string_view sv(buffer->data(), buffer->size()); for (auto& v : visitor_list) { - v.get().on_line(line.content, ckpt_idx); + v.get().on_line(sv, buffer, ckpt_idx); + if (v.get().wants_drain()) { + co_await v.get().drain_pending(); + } } } } else { - // Need full checkpoint build — visitors run inline. + // Need full checkpoint build, visitors run inline. if (!visitor_list.empty()) { indexer->set_visitors(std::move(visitor_list)); } @@ -208,10 +209,10 @@ coro::CoroTask IndexBuilderUtility::process( result.total_lines = indexer->get_num_lines(); result.chunks_processed = static_cast(indexer->get_checkpoints().size()); + result.events_processed = + static_cast(bloom_visitor.total_events()); - // Persist visitor data into the `.dftindex` store only when the file - // meets the size threshold (or threshold is disabled). 
- if (!below_threshold && (config.build_bloom || config.build_manifest)) { + { const std::string& built_index_path = indexer->get_index_path(); try { @@ -219,27 +220,24 @@ coro::CoroTask IndexBuilderUtility::process( auto logical = internal::get_logical_path(config.file_path); const auto hash = internal::calculate_file_hash(config.file_path); - auto* db_ptr = &db; - auto* logical_ptr = &logical; - auto* config_ptr = &config; - auto* bloom_visitor_ptr = &bloom_visitor; - auto* manifest_visitor_ptr = &manifest_visitor; - co_await rocks::run([db_ptr, logical_ptr, hash, config_ptr, - bloom_visitor_ptr, manifest_visitor_ptr] { - int fid = - db_ptr->get_or_create_file_info(*logical_ptr, hash); - internal::TransactionScope txn(*db_ptr); - if (config_ptr->build_bloom && *bloom_visitor_ptr) { - db_ptr->init_bloom_schema(); - db_ptr->delete_chunk_statistics(fid); - (*bloom_visitor_ptr)->finalize(*db_ptr, fid); - } - if (config_ptr->build_manifest && *manifest_visitor_ptr) { - db_ptr->init_manifest_schema(); - (*manifest_visitor_ptr)->finalize(*db_ptr, fid); - } - txn.commit(); - }); + + IndexFileEntryCapability caps = + IndexFileEntryCapability::INDEXING_COMPLETE | + IndexFileEntryCapability::BLOOM | + IndexFileEntryCapability::CHECKPOINTS | + IndexFileEntryCapability::FILE_SUMMARY; + if (config.build_manifest && manifest_visitor) { + caps |= IndexFileEntryCapability::MANIFEST; + } + auto writer = db.begin_write(); + int fid = writer->get_or_create_file_info(logical, hash, caps); + writer->delete_chunk_statistics(fid); + bloom_visitor.finalize(*writer, fid); + hash_table_visitor.finalize(*writer, fid); + if (config.build_manifest && manifest_visitor) { + manifest_visitor->finalize(*writer, fid); + } + writer->commit(); } catch (const std::exception& e) { result.error_message = std::string("Failed to persist index data: ") + e.what(); @@ -250,16 +248,18 @@ coro::CoroTask IndexBuilderUtility::process( } } - result.index_created = !below_threshold; + result.index_created = 
true; result.success = true; +#if DFTRACER_UTILS_LOGGER_LEVEL_DEBUG auto build_end = std::chrono::steady_clock::now(); double elapsed_s = std::chrono::duration(build_end - build_start).count(); - DFTRACER_UTILS_LOG_INFO( + DFTRACER_UTILS_LOG_DEBUG( "Built index for %s (%zu chunks, %zu lines, %.2fs)", config.file_path.c_str(), result.chunks_processed, result.total_lines, elapsed_s); +#endif } catch (const std::exception& e) { result.error_message = e.what(); DFTRACER_UTILS_LOG_ERROR("IndexBuilder failed for %s: %s", @@ -269,4 +269,639 @@ coro::CoroTask IndexBuilderUtility::process( co_return result; } +coro::CoroTask IndexBuilderUtility::process( + const IndexBuildConfig& config) { + return run_index_build(config); +} + +static coro::CoroTask process_batch_per_file( + CoroScope* scope, std::shared_ptr shared_config) { + auto results = std::make_shared>( + shared_config->file_paths.size()); + auto indexed = std::make_shared>(0); + auto skipped = std::make_shared>(0); + auto failed = std::make_shared>(0); + auto total_events = std::make_shared>(0); + + auto file_paths = std::make_shared>( + std::move(shared_config->file_paths)); + const auto parallelism = shared_config->parallelism; + + co_await scope->scope([file_paths, results, indexed, skipped, failed, + total_events, shared_config, parallelism]( + CoroScope& child) -> coro::CoroTask { + auto file_chan = coro::make_channel(parallelism * 2); + + child.spawn( + [ch = file_chan->producer(), num_files = file_paths->size()]( + CoroScope&) mutable -> coro::CoroTask { + auto guard = ch.guard(); + for (std::size_t i = 0; i < num_files; ++i) { + if (!co_await ch.send(i)) co_return; + } + co_return; + }); + + for (std::size_t w = 0; w < parallelism; ++w) { + child.spawn([file_chan, file_paths, results, shared_config, indexed, + skipped, failed, + total_events](CoroScope&) -> coro::CoroTask { + while (auto idx_opt = co_await file_chan->receive()) { + std::size_t idx = *idx_opt; + IndexBuilderUtility builder; + auto 
file_config = + IndexBuildConfig::for_file((*file_paths)[idx]) + .with_index_dir(shared_config->index_dir) + .with_checkpoint_size( + shared_config->checkpoint_size) + .with_force_rebuild(shared_config->force_rebuild) + .with_manifest(shared_config->build_manifest) + .with_bloom_config(shared_config->bloom_config) + .with_bloom_dimensions( + shared_config->bloom_dimensions); + + auto result = co_await builder.process(file_config); + + if (result.was_skipped) { + skipped->fetch_add(1, std::memory_order_relaxed); + } else if (result.success) { + indexed->fetch_add(1, std::memory_order_relaxed); + total_events->fetch_add(result.events_processed, + std::memory_order_relaxed); + } else { + failed->fetch_add(1, std::memory_order_relaxed); + } + (*results)[idx] = std::move(result); + } + co_return; + }); + } + co_return; + }); + + IndexBuildBatchResult batch_result; + batch_result.results = std::move(*results); + batch_result.indexed = indexed->load(std::memory_order_relaxed); + batch_result.skipped = skipped->load(std::memory_order_relaxed); + batch_result.failed = failed->load(std::memory_order_relaxed); + batch_result.total_events = total_events->load(std::memory_order_relaxed); + co_return batch_result; +} + +namespace { + +struct PreparedFile { + std::size_t index; + std::string file_path; + std::string logical_path; + std::string index_path; + std::uint64_t file_hash = 0; + int file_id = 0; + IndexBuildBatchConfig::FileSlice slice; +}; + +struct ParsedBloomJob { + PreparedFile identity; + IndexBuildResult result; + internal::gzip::GzipBuildArtifacts artifacts; + std::unique_ptr bloom_visitor; + std::unique_ptr hash_table_visitor; + std::unique_ptr manifest_visitor; + std::vector> + extra_visitors; +}; + +std::vector prepare_file_identities( + const std::string& index_path, const std::vector& file_paths, + bool build_manifest) { + IndexDatabase db(index_path); + auto writer = db.begin_write(); + + std::vector prepared; + prepared.reserve(file_paths.size()); + for 
(std::size_t i = 0; i < file_paths.size(); ++i) { + PreparedFile pf; + pf.index = i; + pf.file_path = file_paths[i]; + pf.logical_path = internal::get_logical_path(file_paths[i]); + pf.index_path = index_path; + pf.file_hash = internal::calculate_file_hash(file_paths[i]); + IndexFileEntryCapability caps = + IndexFileEntryCapability::BLOOM | + IndexFileEntryCapability::CHECKPOINTS | + IndexFileEntryCapability::FILE_SUMMARY | + IndexFileEntryCapability::INDEXING_COMPLETE; + if (build_manifest) { + caps |= IndexFileEntryCapability::MANIFEST; + } + pf.file_id = writer->get_or_create_file_info(pf.logical_path, + pf.file_hash, caps); + prepared.push_back(std::move(pf)); + } + writer->commit(); + return prepared; +} + +} // namespace + +struct BatchWriteState { + std::shared_ptr> results; + std::shared_ptr>> parsed_jobs; + std::shared_ptr> prepared; + std::shared_ptr> bloom_dims; + std::string index_path; + IndexBuildBatchMetrics metrics; + composites::dft::indexing::ChunkIndexerConfig bloom_config; + std::size_t num_files = 0; + std::size_t parallelism = 0; + std::size_t checkpoint_size = 0; + bool build_manifest = false; + IndexBuildBatchConfig::DftVisitorFactory visitor_factory; + IndexBuildBatchConfig::SinkFactory sink_factory; + IndexBuildBatchConfig::SinkCommitFn sink_commit; +}; + +// Parse one file at a time (work-stealing via atomic next_index), and stream +// the resulting bloom/hash/manifest payload directly to the write channel so +// write workers can begin committing before the parse phase finishes. The +// extra_visitors and result are left in parsed_jobs[idx] for +// finalize_batch_result; the channel item only carries what the write phase +// needs. 
+static coro::CoroTask parse_and_emit_worker( + CoroScope* scope, std::atomic* next_index_ptr, + std::vector* results_ptr, + std::vector>* parsed_jobs_ptr, + std::vector* prepared_ptr, std::size_t checkpoint_size, + composites::dft::indexing::ChunkIndexerConfig bloom_config, + const std::vector* bloom_dims_ptr, + std::atomic* parse_ns_ptr, + const IndexBuildBatchConfig::DftVisitorFactory* visitor_factory_ptr, + bool build_manifest, coro::ChannelProducer ch) { + namespace gzip_indexer = internal::gzip; + auto guard = ch.guard(); + + while (true) { + const auto idx = + next_index_ptr->fetch_add(1, std::memory_order_relaxed); + if (idx >= prepared_ptr->size()) break; + + const auto& pf = (*prepared_ptr)[idx]; + IndexBuildResult result; + result.file_path = pf.file_path; + result.index_path = pf.index_path; + auto t0 = std::chrono::steady_clock::now(); + + ParsedBloomJob job; + job.identity = pf; + bool parse_ok = false; + try { + composites::dft::DftEventDispatcher::VisitorList dft_vis; + // Built-in file-scoped visitors are skipped for sliced files + // where file-scoped writes are disabled (non-first slice of a + // cross-rank-split file). BloomVisitor::ensure_chunk would also + // resize chunks_ with a large checkpoint_idx_base. 
+ if (!pf.slice.skip_file_scoped_writes) { + job.bloom_visitor = std::make_unique( + bloom_config, *bloom_dims_ptr); + job.hash_table_visitor = std::make_unique(); + dft_vis.emplace_back(*job.bloom_visitor); + dft_vis.emplace_back(*job.hash_table_visitor); + if (build_manifest) { + job.manifest_visitor = std::make_unique(); + dft_vis.emplace_back(*job.manifest_visitor); + } + } + if (visitor_factory_ptr && *visitor_factory_ptr) { + job.extra_visitors = (*visitor_factory_ptr)(pf.file_path); + for (auto& v : job.extra_visitors) { + dft_vis.emplace_back(*v); + } + } + + composites::dft::DftEventDispatcher batch_dispatcher( + std::move(dft_vis)); + internal::Indexer::VisitorList visitors; + visitors.emplace_back(batch_dispatcher); + + gzip_indexer::GzipMemberSlice slice_arg; + const gzip_indexer::GzipMemberSlice* slice_ptr = nullptr; + if (pf.slice.members != nullptr && + pf.slice.member_end > pf.slice.member_begin) { + slice_arg.members = pf.slice.members; + slice_arg.member_begin = pf.slice.member_begin; + slice_arg.member_end = pf.slice.member_end; + slice_arg.checkpoint_idx_base = pf.slice.checkpoint_idx_base; + slice_ptr = &slice_arg; + } + auto arts = co_await gzip_indexer::build_gzip_index_artifacts( + pf.file_path, checkpoint_size, visitors, scope, slice_ptr); + if (!arts) { + result.error_message = "Failed to build gzip index artifacts"; + } else { + job.artifacts = std::move(*arts); + result.total_lines = + static_cast(job.artifacts.total_lines); + result.chunks_processed = job.artifacts.checkpoints.size(); + if (job.bloom_visitor) { + result.events_processed = static_cast( + job.bloom_visitor->total_events()); + } + result.index_created = true; + result.success = true; + job.result = result; + for (auto& v : job.extra_visitors) { + co_await v->on_file_complete(); + } + parse_ok = true; + } + } catch (const std::exception& e) { + result.error_message = e.what(); + } + + auto t1 = std::chrono::steady_clock::now(); + parse_ns_ptr->fetch_add( + static_cast( + 
std::chrono::duration_cast(t1 - t0) + .count()), + std::memory_order_relaxed); + + if (!parse_ok) { + (*results_ptr)[idx] = std::move(result); + continue; + } + + // Sliced rank with member_begin > 0: skip file-scoped channel send; + // aggregation SSTs already produced via extra visitors are kept in + // parsed_jobs[idx] for downstream collection. + if (pf.slice.skip_file_scoped_writes) { + (*results_ptr)[idx] = std::move(result); + (*parsed_jobs_ptr)[idx] = std::move(job); + continue; + } + + // Build the channel-bound payload (move bloom/hash/manifest into it), + // and leave extra_visitors + result behind in parsed_jobs[idx]. + internal::ParsedIndexJob send_job; + send_job.file_id = pf.file_id; + send_job.file_path = pf.file_path; + send_job.artifacts = std::move(job.artifacts); + send_job.bloom_visitor = std::move(job.bloom_visitor); + send_job.hash_table_visitor = std::move(job.hash_table_visitor); + send_job.manifest_visitor = std::move(job.manifest_visitor); + send_job.success = true; + + ParsedBloomJob holder; + holder.identity = pf; + holder.extra_visitors = std::move(job.extra_visitors); + holder.result = result; + (*parsed_jobs_ptr)[idx] = std::move(holder); + (*results_ptr)[idx] = std::move(result); + + if (!co_await ch.send(std::move(send_job))) co_return; + } + co_return; +} + +// Streaming parse + write pipeline: parse-and-emit workers and write workers +// run concurrently inside one scope. Parse workers act as multiple producers +// on the write channel (each holds its own ProducerGuard); the channel closes +// for sends when all parse workers exit, after which write workers finish +// draining buffered items, do their final flush, and exit. Memory is bounded +// by the channel capacity (write_workers * WRITE_BATCH_SIZE) so peak heap +// stays bounded regardless of total file count. 
+static coro::CoroTask run_streaming_pipeline(CoroScope* scope, + BatchWriteState* state) { + static constexpr std::size_t WRITE_BATCH_SIZE = 64; + const auto parse_workers = state->parallelism; + // Write workers are decoupled from parse workers to control SST count. + // Floor (parse_workers / 3) reflects the empirical write-vs-parse CPU + // ratio for the bloom indexer (~3x). Ceiling (num_files / batch_size) + // ensures large workloads, where total SST count is bounded by + // ceil(num_files / batch_size) anyway, get full parallelism. Both are + // capped at parse_workers and given a minimum of 4 for small workloads. + const auto write_workers = std::min( + parse_workers, std::max( + 4, std::max(parse_workers / 3, + state->num_files / WRITE_BATCH_SIZE))); + + DFTRACER_UTILS_LOG_INFO( + "IndexBatch: streaming pipeline begin (%zu files, parse_workers=%zu " + "write_workers=%zu)", + state->num_files, parse_workers, write_workers); + + // GCC 12 coroutine bug: capturing shared_ptr by value in coroutine + // lambdas corrupts refcount. Keep shared_ptrs at this scope and pass + // raw pointers to lambdas. + auto write_chan = coro::make_channel( + write_workers * WRITE_BATCH_SIZE); + auto writer_metrics = std::make_shared(); + + // Only open the RocksDB-backed DB when no external sink factory is + // provided. The distributed SST path routes writes through caller-owned + // SstWriterContext instances and must not hold a process-exclusive + // RocksDB handle on the target index dir. 
+ std::shared_ptr writer_db; + if (!state->sink_factory) { + writer_db = std::make_shared(state->index_path); + } + + auto next_index = std::make_shared>(0); + auto parse_ns = std::make_shared>(0); + auto bloom_config_holder = + std::make_shared( + state->bloom_config); + + auto* next_index_ptr = next_index.get(); + auto* parse_ns_ptr = parse_ns.get(); + auto* results_ptr = state->results.get(); + auto* parsed_jobs_ptr = state->parsed_jobs.get(); + auto* prepared_ptr = state->prepared.get(); + auto* db_ptr = writer_db.get(); + auto* metrics_ptr = writer_metrics.get(); + auto* write_chan_ptr = write_chan.get(); + const auto* bloom_config_ptr = bloom_config_holder.get(); + const auto* bloom_dims_ptr = state->bloom_dims.get(); + const auto checkpoint_size = state->checkpoint_size; + const bool build_manifest = state->build_manifest; + const IndexBuildBatchConfig::DftVisitorFactory* visitor_factory_ptr = + state->visitor_factory ? &state->visitor_factory : nullptr; + auto* sink_factory_ptr = &state->sink_factory; + auto* sink_commit_ptr = &state->sink_commit; + + co_await scope->scope([parse_workers, write_workers, next_index_ptr, + parse_ns_ptr, results_ptr, parsed_jobs_ptr, + prepared_ptr, checkpoint_size, bloom_config_ptr, + bloom_dims_ptr, visitor_factory_ptr, build_manifest, + write_chan_ptr, db_ptr, metrics_ptr, + sink_factory_ptr, sink_commit_ptr]( + CoroScope& child) -> coro::CoroTask { + for (std::size_t w = 0; w < parse_workers; ++w) { + child.spawn( + [next_index_ptr, parse_ns_ptr, results_ptr, parsed_jobs_ptr, + prepared_ptr, checkpoint_size, bloom_config_ptr, + bloom_dims_ptr, visitor_factory_ptr, build_manifest, + ch = write_chan_ptr->producer()]( + CoroScope& own_scope) mutable -> coro::CoroTask { + co_await parse_and_emit_worker( + &own_scope, next_index_ptr, results_ptr, + parsed_jobs_ptr, prepared_ptr, checkpoint_size, + *bloom_config_ptr, bloom_dims_ptr, parse_ns_ptr, + visitor_factory_ptr, build_manifest, std::move(ch)); + }); + } + + for 
(std::size_t w = 0; w < write_workers; ++w) { + child.spawn([write_chan_ptr, db_ptr, metrics_ptr, sink_factory_ptr, + sink_commit_ptr](CoroScope&) -> coro::CoroTask { + if (*sink_factory_ptr) { + co_await internal::index_batch_write_worker( + write_chan_ptr, WRITE_BATCH_SIZE, metrics_ptr, + *sink_factory_ptr, *sink_commit_ptr); + } else { + co_await internal::index_batch_write_worker( + write_chan_ptr, WRITE_BATCH_SIZE, metrics_ptr, + [db_ptr] { return db_ptr->begin_write(); }, + [](IndexBatchSink& sink) { + static_cast(sink) + .commit(); + }); + } + }); + } + co_return; + }); + + state->metrics.parse_ns = parse_ns->load(std::memory_order_relaxed); + state->metrics.files_parsed = state->num_files; + state->metrics.write_ns = + writer_metrics->write_ns.load(std::memory_order_relaxed); + state->metrics.files_written = + writer_metrics->files_written.load(std::memory_order_relaxed); + DFTRACER_UTILS_LOG_INFO( + "IndexBatch: streaming pipeline complete (parsed=%zu written=%zu)", + state->num_files, state->metrics.files_written); + co_return; +} + +static std::unique_ptr init_batch_write_state( + IndexBuildBatchConfig& config) { + auto state = std::make_unique(); + state->num_files = config.file_paths.size(); + state->parallelism = config.parallelism; + state->checkpoint_size = config.checkpoint_size; + state->bloom_config = config.bloom_config; + state->build_manifest = config.build_manifest; + state->bloom_dims = std::make_shared>( + config.bloom_dimensions.empty() + ? 
std::vector(DEFAULT_BLOOM_DIMENSIONS.begin(), + DEFAULT_BLOOM_DIMENSIONS.end()) + : std::move(config.bloom_dimensions)); + state->results = + std::make_shared>(state->num_files); + state->index_path = + determine_index_path(config.file_paths.front(), config.index_dir); + if (!config.file_slices.empty() && + config.file_slices.size() != config.file_paths.size()) { + throw std::runtime_error( + "file_slices.size() must match file_paths.size() (or be empty)"); + } + if (!config.preassigned_file_ids.empty()) { + if (config.preassigned_file_ids.size() != config.file_paths.size()) { + throw std::runtime_error( + "preassigned_file_ids.size() must match file_paths.size()"); + } + // Distributed path: coordinator has already registered files and + // assigned ids. Skip the DEFAULT-CF registry open/write step. + std::vector prepared; + prepared.reserve(config.file_paths.size()); + for (std::size_t i = 0; i < config.file_paths.size(); ++i) { + PreparedFile pf; + pf.index = i; + pf.file_path = config.file_paths[i]; + pf.logical_path = internal::get_logical_path(config.file_paths[i]); + pf.index_path = state->index_path; + pf.file_hash = internal::calculate_file_hash(config.file_paths[i]); + pf.file_id = config.preassigned_file_ids[i]; + if (!config.file_slices.empty()) pf.slice = config.file_slices[i]; + prepared.push_back(std::move(pf)); + } + state->prepared = + std::make_shared>(std::move(prepared)); + } else { + state->prepared = + std::make_shared>(prepare_file_identities( + state->index_path, config.file_paths, config.build_manifest)); + if (!config.file_slices.empty()) { + auto& prepared = *state->prepared; + for (std::size_t i = 0; i < prepared.size(); ++i) { + prepared[i].slice = config.file_slices[i]; + } + } + } + state->parsed_jobs = + std::make_shared>>( + state->num_files); + if (config.dft_visitor_factory) { + state->visitor_factory = std::move(config.dft_visitor_factory); + } + state->sink_factory = std::move(config.sink_factory); + state->sink_commit = 
std::move(config.sink_commit); + if (static_cast(state->sink_factory) != + static_cast(state->sink_commit)) { + throw std::runtime_error( + "IndexBuildBatchConfig: sink_factory and sink_commit must be set " + "together (either both null for the default RocksDB path, or " + "both non-null for the distributed SST path)."); + } + return state; +} + +static void finalize_batch_result(BatchWriteState* state, + IndexBuildBatchResult* out) { + out->results = std::move(*state->results); + out->metrics = state->metrics; + out->metrics.files_enqueued = state->num_files; + + out->extra_visitors.resize(state->num_files); + for (std::size_t i = 0; i < state->num_files; ++i) { + auto& job_opt = (*state->parsed_jobs)[i]; + if (job_opt && !job_opt->extra_visitors.empty()) { + out->extra_visitors[i] = std::move(job_opt->extra_visitors); + } + } + + for (const auto& r : out->results) { + if (r.was_skipped) { + out->skipped++; + } else if (r.success) { + out->indexed++; + out->total_events += r.events_processed; + } else { + out->failed++; + } + } +} + +static void run_rebuild_root_summaries(const std::string& index_path) { + IndexDatabase db(index_path); + auto writer = db.begin_write(); + writer->rebuild_root_summaries(); + writer->commit(); +} + +static coro::CoroTask run_single_batch( + CoroScope* scope, IndexBuildBatchConfig chunk_config) { + auto state = init_batch_write_state(chunk_config); + co_await run_streaming_pipeline(scope, state.get()); + IndexBuildBatchResult partial; + finalize_batch_result(state.get(), &partial); + co_return partial; +} + +static void merge_partial_into(IndexBuildBatchResult& out, + IndexBuildBatchResult partial) { + for (auto& r : partial.results) { + out.results.push_back(std::move(r)); + } + out.indexed += partial.indexed; + out.skipped += partial.skipped; + out.failed += partial.failed; + out.total_events += partial.total_events; + out.metrics.parse_ns += partial.metrics.parse_ns; + out.metrics.write_ns += partial.metrics.write_ns; + 
out.metrics.files_enqueued += partial.metrics.files_enqueued; + out.metrics.files_parsed += partial.metrics.files_parsed; + out.metrics.files_written += partial.metrics.files_written; + for (auto& ev : partial.extra_visitors) { + out.extra_visitors.push_back(std::move(ev)); + } +} + +static coro::CoroTask run_batch_write_pipeline( + CoroScope* scope, std::shared_ptr config_ptr) { + const bool do_rebuild = config_ptr->rebuild_root_summaries; + const std::size_t flush_every = config_ptr->flush_every_files; + const std::size_t total = config_ptr->file_paths.size(); + const std::size_t chunk_size = + (flush_every > 0 && flush_every < total) ? flush_every : total; + + IndexBuildBatchResult result; + const auto index_path = determine_index_path(config_ptr->file_paths.front(), + config_ptr->index_dir); + + const std::size_t num_sub_batches = (total + chunk_size - 1) / chunk_size; + std::size_t sub_batch_idx = 0; + for (std::size_t start = 0; start < total; start += chunk_size) { + const std::size_t end = std::min(start + chunk_size, total); + DFTRACER_UTILS_LOG_INFO( + "IndexBatch: sub-batch %zu/%zu begin (files %zu..%zu of %zu)", + sub_batch_idx + 1, num_sub_batches, start, end - 1, total); + IndexBuildBatchConfig chunk_config; + chunk_config.file_paths.assign( + config_ptr->file_paths.begin() + static_cast(start), + config_ptr->file_paths.begin() + static_cast(end)); + if (!config_ptr->preassigned_file_ids.empty()) { + chunk_config.preassigned_file_ids.assign( + config_ptr->preassigned_file_ids.begin() + + static_cast(start), + config_ptr->preassigned_file_ids.begin() + + static_cast(end)); + } + if (!config_ptr->file_slices.empty()) { + chunk_config.file_slices.assign( + config_ptr->file_slices.begin() + + static_cast(start), + config_ptr->file_slices.begin() + + static_cast(end)); + } + chunk_config.sink_factory = config_ptr->sink_factory; + chunk_config.sink_commit = config_ptr->sink_commit; + chunk_config.index_dir = config_ptr->index_dir; + 
chunk_config.checkpoint_size = config_ptr->checkpoint_size; + chunk_config.parallelism = config_ptr->parallelism; + chunk_config.force_rebuild = config_ptr->force_rebuild; + chunk_config.build_manifest = config_ptr->build_manifest; + chunk_config.bloom_config = config_ptr->bloom_config; + chunk_config.bloom_dimensions = config_ptr->bloom_dimensions; + chunk_config.use_batch_write = true; + chunk_config.rebuild_root_summaries = false; + chunk_config.dft_visitor_factory = config_ptr->dft_visitor_factory; + + auto partial = + co_await run_single_batch(scope, std::move(chunk_config)); + if (config_ptr->extra_visitors_drain) { + auto drained = std::move(partial.extra_visitors); + partial.extra_visitors.clear(); + config_ptr->extra_visitors_drain(std::move(drained)); + } + DFTRACER_UTILS_LOG_INFO( + "IndexBatch: sub-batch %zu/%zu complete (indexed=%zu skipped=%zu " + "failed=%zu)", + sub_batch_idx + 1, num_sub_batches, partial.indexed, + partial.skipped, partial.failed); + merge_partial_into(result, std::move(partial)); + ++sub_batch_idx; + } + + config_ptr.reset(); + + if (do_rebuild) { + run_rebuild_root_summaries(index_path); + } + + co_return result; +} + +coro::CoroTask IndexBatchBuilderUtility::process( + CoroScope* scope, std::shared_ptr config_ptr) { + if (!config_ptr || config_ptr->file_paths.empty()) { + co_return IndexBuildBatchResult{}; + } + if (config_ptr->use_batch_write) { + co_return co_await run_batch_write_pipeline(scope, + std::move(config_ptr)); + } + co_return co_await process_batch_per_file(scope, std::move(config_ptr)); +} + } // namespace dftracer::utils::utilities::indexer diff --git a/src/dftracer/utils/utilities/indexer/index_database.cpp b/src/dftracer/utils/utilities/indexer/index_database.cpp index 4cc3165c..453787e1 100644 --- a/src/dftracer/utils/utilities/indexer/index_database.cpp +++ b/src/dftracer/utils/utilities/indexer/index_database.cpp @@ -1,17 +1,27 @@ #include #include #include +#include +#include +#include +#include #include 
#include +#include +#include #include #include +#include +#include #include +#include #include #include #include #include #include +#include #include #include @@ -19,12 +29,13 @@ namespace dftracer::utils::utilities::indexer { namespace queries = composites::dft::indexing::queries; namespace rocks = dftracer::utils::rocksdb; +namespace cf = rocks::cf; -using internal::IndexerError; +using namespace internal; namespace { -constexpr std::uint32_t kSchemaVersion = 1; +constexpr std::uint32_t SCHEMA_VERSION = 1; [[noreturn]] void throw_db_error(std::string_view message, const ::rocksdb::Status& status) { @@ -32,80 +43,6 @@ constexpr std::uint32_t kSchemaVersion = 1; std::string(message) + ": " + status.ToString()); } -void append_u8(std::string& out, std::uint8_t value) { - out.push_back(static_cast(value)); -} - -void append_i64(std::string& out, std::int64_t value) { - rocks::KeyCodec::append_be64(out, static_cast(value)); -} - -void append_u64(std::string& out, std::uint64_t value) { - rocks::KeyCodec::append_be64(out, value); -} - -void append_double(std::string& out, double value) { - static_assert(sizeof(double) == sizeof(std::uint64_t)); - std::uint64_t bits = 0; - std::memcpy(&bits, &value, sizeof(bits)); - append_u64(out, bits); -} - -void append_string(std::string& out, std::string_view value) { - rocks::KeyCodec::append_be32(out, static_cast(value.size())); - out.append(value.data(), value.size()); -} - -void append_blob(std::string& out, std::span blob) { - rocks::KeyCodec::append_be32(out, static_cast(blob.size())); - out.append(reinterpret_cast(blob.data()), blob.size()); -} - -class Cursor { - public: - explicit Cursor(std::string_view data) : data_(data) {} - - std::uint8_t u8() { return static_cast(take(1)[0]); } - - std::uint32_t u32() { return rocks::KeyCodec::decode_be32(take(4)); } - - std::uint64_t u64() { return rocks::KeyCodec::decode_be64(take(8)); } - - std::int64_t i64() { return static_cast(u64()); } - - double f64() { - std::uint64_t bits = 
u64(); - double value = 0.0; - std::memcpy(&value, &bits, sizeof(value)); - return value; - } - - std::string str() { - auto len = static_cast(u32()); - auto bytes = take(len); - return std::string(bytes.data(), bytes.size()); - } - - std::vector blob() { - auto len = static_cast(u32()); - auto bytes = take(len); - return std::vector(bytes.begin(), bytes.end()); - } - - private: - std::string_view take(std::size_t len) { - if (offset_ + len > data_.size()) { - throw std::runtime_error("Corrupt RocksDB payload"); - } - auto chunk = data_.substr(offset_, len); - offset_ += len; - return chunk; - } - - std::string_view data_; - std::size_t offset_ = 0; -}; - std::string file_lookup_key(std::string_view logical_name) { return std::string("f|") + std::string(logical_name); } @@ -116,16 +53,14 @@ std::string file_reverse_key(int file_id) { return key; } -std::string next_file_id_key() { return "_next_file_id"; } std::string schema_version_key() { return "_schema_version"; } -std::string encode_file_record(int file_id, std::uint64_t file_hash) { - std::string value; - rocks::KeyCodec::append_be32(value, static_cast(file_id)); - append_u64(value, 0); - append_u64(value, 0); - append_u64(value, file_hash); - return value; +IndexFileEntryCapability decode_file_capabilities(std::string_view record) { + if (record.size() < 5) { + return IndexFileEntryCapability::NONE; + } + return static_cast( + static_cast(record[4])); } int decode_file_id(std::string_view record) { @@ -135,6 +70,13 @@ int decode_file_id(std::string_view record) { return static_cast(rocks::KeyCodec::decode_be32(record.substr(0, 4))); } +int decode_prefixed_file_id(std::string_view key) { + if (key.size() < 4) { + throw std::runtime_error("Corrupt file-prefixed key"); + } + return static_cast(rocks::KeyCodec::decode_be32(key.substr(0, 4))); +} + std::uint64_t decode_file_hash(std::string_view record) { if (record.size() < 28) { throw std::runtime_error("Corrupt file record"); @@ -142,41 +84,7 @@ std::uint64_t 
decode_file_hash(std::string_view record) { return rocks::KeyCodec::decode_be64(record.substr(20, 8)); } -std::string prefix_for_file(int file_id) { - return rocks::KeyCodec::encode_be32(static_cast(file_id)); -} - -std::string make_hash_owner_key(int file_id, std::string_view dimension, - std::string_view hash_value) { - std::string key("o|"); - rocks::KeyCodec::append_be32(key, static_cast(file_id)); - key.push_back('\0'); - key.append(dimension); - key.push_back('\0'); - key.append(hash_value); - return key; -} - -std::string make_hash_forward_key(std::string_view dimension, - std::string_view hash_value) { - std::string key("h|"); - key.append(dimension); - key.push_back('\0'); - key.append(hash_value); - return key; -} - -std::string make_hash_reverse_key(std::string_view dimension, - std::string_view resolved_value, - std::string_view hash_value) { - std::string key("H|"); - key.append(dimension); - key.push_back('\0'); - key.append(resolved_value); - key.push_back('\0'); - key.append(hash_value); - return key; -} +using encoding::prefix_for_file; std::string make_dimension_key(int file_id, std::string_view dimension) { std::string key("d|"); @@ -185,88 +93,38 @@ std::string make_dimension_key(int file_id, std::string_view dimension) { return key; } -std::string chunk_bloom_key(int file_id, std::string_view dimension, - std::uint64_t checkpoint_idx) { - std::string key = prefix_for_file(file_id); - key.append(dimension); - key.push_back('\0'); - append_u64(key, checkpoint_idx); - return key; -} - std::string file_bloom_key(int file_id, std::string_view dimension) { std::string key = prefix_for_file(file_id); key.append(dimension); return key; } -std::string chunk_stats_key(int file_id, std::uint64_t checkpoint_idx) { - std::string key = prefix_for_file(file_id); - append_u64(key, checkpoint_idx); - return key; +using encoding::metadata_key; +std::string file_scalar_stats_key(int file_id) { + return prefix_for_file(file_id); } - -std::string checkpoint_key(int 
file_id, std::uint64_t uc_offset, - std::uint64_t checkpoint_idx) { - std::string key = prefix_for_file(file_id); - append_u64(key, uc_offset); - append_u64(key, checkpoint_idx); - return key; +std::string file_category_counts_key(int file_id) { + return prefix_for_file(file_id); } - -std::string chunk_dim_stats_key(int file_id, std::uint64_t checkpoint_idx, - std::string_view dimension) { - std::string key = prefix_for_file(file_id); - append_u64(key, checkpoint_idx); - key.append(dimension); - return key; +std::string file_pid_tid_counts_key(int file_id) { + return prefix_for_file(file_id); } - -std::string manifest_event_key(int file_id, std::uint64_t checkpoint_idx, - std::string_view cat, std::string_view name) { - std::string key("E|"); - rocks::KeyCodec::append_be32(key, static_cast(file_id)); - append_u64(key, checkpoint_idx); - key.append(cat); - key.push_back('\0'); - key.append(name); - return key; +std::string file_name_counts_key(int file_id) { + return prefix_for_file(file_id); } +std::string root_scalar_stats_key() { return "_root"; } +std::string root_category_counts_key() { return "_root"; } +std::string root_name_counts_key() { return "_root"; } +std::string root_pid_tid_counts_key() { return "_root"; } -std::string manifest_metadata_key(int file_id, std::uint64_t checkpoint_idx, - std::string_view meta_type) { - std::string key("M|"); - rocks::KeyCodec::append_be32(key, static_cast(file_id)); - append_u64(key, checkpoint_idx); - key.append(meta_type); - return key; -} - -std::string metadata_key(int file_id) { return prefix_for_file(file_id); } - +using encoding::name_lookup_key; +using encoding::name_reverse_key; std::string tar_archive_key(int file_id) { return prefix_for_file(file_id); } -std::string tar_file_key(int file_id, std::uint64_t uncompressed_offset, - std::string_view file_name) { - std::string key = prefix_for_file(file_id); - append_u64(key, uncompressed_offset); - key.push_back('\0'); - key.append(file_name); - return key; -} - 
-std::string encode_bloom_value(std::span blob, - std::uint64_t num_entries) { - std::string value; - append_u64(value, num_entries); - value.append(reinterpret_cast(blob.data()), blob.size()); - return value; -} - -IndexDatabase::ChunkBloomResult decode_chunk_bloom(std::string_view key, - std::string_view value, - std::size_t prefix_size) { - IndexDatabase::ChunkBloomResult result; +ChunkBloomResult decode_chunk_bloom(std::string_view key, + std::string_view value, + std::size_t prefix_size) { + ChunkBloomResult result; auto checkpoint_pos = key.find('\0', prefix_size); if (checkpoint_pos == std::string_view::npos || checkpoint_pos + 1 + 8 > key.size()) { @@ -282,47 +140,19 @@ IndexDatabase::ChunkBloomResult decode_chunk_bloom(std::string_view key, return result; } -IndexDatabase::FileBloomResult decode_file_bloom(std::string_view value) { +FileBloomResult decode_file_bloom(std::string_view value) { if (value.size() < 8) { throw std::runtime_error("Corrupt file bloom value"); } - IndexDatabase::FileBloomResult result; + FileBloomResult result; result.num_entries = rocks::KeyCodec::decode_be64(value.substr(0, 8)); result.bloom_data.assign(value.begin() + 8, value.end()); return result; } -std::string encode_chunk_statistics_value( - const IndexDatabase::ChunkStatistics& stats) { - std::string value; - append_u64(value, stats.total_events); - append_u64(value, stats.min_timestamp_us); - append_u64(value, stats.max_timestamp_us); - append_i64(value, stats.duration_sum_us); - append_u64(value, stats.duration_min_us); - append_u64(value, stats.duration_max_us); - append_u64(value, stats.duration_count); - append_double(value, stats.duration_m2); - - auto duration_sketch = stats.duration_sketch.serialize(); - append_blob(value, duration_sketch); - - auto duration_histogram = stats.duration_histogram.to_json(); - append_string(value, duration_histogram); - - auto name_sketches = stats.serialize_name_duration_sketches(); - append_blob(value, name_sketches); - 
append_string(value, stats.name_duration_histograms_json()); - append_string(value, stats.name_duration_sums_json()); - append_string(value, stats.name_duration_sum_sqs_json()); - append_string(value, stats.name_category_json()); - return value; -} - -IndexDatabase::ChunkStatistics decode_chunk_statistics_value( - std::string_view value) { +ChunkStatistics decode_chunk_statistics_value(std::string_view value) { Cursor cursor(value); - IndexDatabase::ChunkStatistics stats; + ChunkStatistics stats; stats.total_events = cursor.u64(); stats.min_timestamp_us = cursor.u64(); stats.max_timestamp_us = cursor.u64(); @@ -347,42 +177,35 @@ IndexDatabase::ChunkStatistics decode_chunk_statistics_value( auto name_sketches = cursor.blob(); if (!name_sketches.empty()) { stats.name_duration_sketches = - IndexDatabase::ChunkStatistics::deserialize_name_duration_sketches( + ChunkStatistics::deserialize_name_duration_sketches( name_sketches.data(), name_sketches.size()); } stats.name_duration_histograms = - IndexDatabase::ChunkStatistics::parse_histogram_map_json(cursor.str()); + ChunkStatistics::parse_histogram_map_json(cursor.str()); stats.name_duration_sums = - IndexDatabase::ChunkStatistics::parse_double_map_json(cursor.str()); + ChunkStatistics::parse_double_map_json(cursor.str()); stats.name_duration_sum_sqs = - IndexDatabase::ChunkStatistics::parse_double_map_json(cursor.str()); - stats.name_category = - IndexDatabase::ChunkStatistics::parse_string_map_json(cursor.str()); - return stats; -} + ChunkStatistics::parse_double_map_json(cursor.str()); + stats.name_category = ChunkStatistics::parse_string_map_json(cursor.str()); -std::string encode_checkpoint_value( - const IndexDatabase::IndexerCheckpoint& checkpoint) { - std::string value; - append_u64(value, checkpoint.uc_size); - append_u64(value, checkpoint.c_offset); - append_u64(value, checkpoint.c_size); - append_i64(value, checkpoint.bits); - append_blob(value, checkpoint.dict_compressed); - append_u64(value, 
checkpoint.num_lines); - append_u64(value, checkpoint.first_line_num); - append_u64(value, checkpoint.last_line_num); - return value; + auto ts_hist_blob = cursor.blob(); + if (!ts_hist_blob.empty()) { + stats.timestamp_histogram = + common::statistics::TimestampHistogram::deserialize( + ts_hist_blob.data(), ts_hist_blob.size()); + } + + return stats; } -IndexDatabase::IndexerCheckpoint decode_checkpoint(std::string_view key, - std::string_view value) { +IndexerCheckpoint decode_checkpoint(std::string_view key, + std::string_view value) { if (key.size() < 20) { throw std::runtime_error("Corrupt checkpoint key"); } - IndexDatabase::IndexerCheckpoint checkpoint; + IndexerCheckpoint checkpoint; checkpoint.uc_offset = rocks::KeyCodec::decode_be64(key.substr(4, 8)); checkpoint.checkpoint_idx = rocks::KeyCodec::decode_be64(key.substr(12, 8)); @@ -398,25 +221,9 @@ IndexDatabase::IndexerCheckpoint decode_checkpoint(std::string_view key, return checkpoint; } -std::string encode_chunk_dimension_stats_value( - const IndexDatabase::ChunkDimensionStats& stats, - std::size_t value_counts_cap) { - std::string value; - append_u64(value, stats.distinct_count); - append_string(value, stats.min_value); - append_string(value, stats.max_value); - append_string(value, stats.value_type); - auto compressed = stats.compress_value_counts(value_counts_cap); - append_u8(value, compressed.has_value() ? 
1 : 0); - if (compressed) { - append_blob(value, *compressed); - } - return value; -} - -IndexDatabase::ChunkDimensionStatsResult decode_chunk_dimension_stats_value( +ChunkDimensionStatsResult decode_chunk_dimension_stats_value( std::string_view key, std::string_view value) { - IndexDatabase::ChunkDimensionStatsResult result; + ChunkDimensionStatsResult result; if (key.size() < 12) { throw std::runtime_error("Corrupt chunk dimension stats key"); } @@ -430,63 +237,71 @@ IndexDatabase::ChunkDimensionStatsResult decode_chunk_dimension_stats_value( result.value_type = cursor.str(); if (cursor.u8() != 0) { auto compressed = cursor.blob(); - result.value_counts = - IndexDatabase::ChunkDimensionStats::decompress_value_counts( - compressed.data(), compressed.size()); + // Defer decompression + result.compressed_value_counts.assign(compressed.begin(), + compressed.end()); } return result; } -std::string encode_event_range_value(std::span lines) { - std::vector vec(lines.begin(), lines.end()); - auto blob = queries::pack_line_numbers(vec); - std::string value; - append_u64(value, vec.size()); - append_blob(value, blob); - return value; -} - std::vector decode_line_numbers(Cursor& cursor) { auto blob = cursor.blob(); return queries::unpack_line_numbers(blob.data(), blob.size()); } -std::string encode_metadata_value(std::span lines) { - std::vector vec(lines.begin(), lines.end()); - auto blob = queries::pack_line_numbers(vec); - std::string value; - append_blob(value, blob); - return value; +StringViewMap decode_count_map_value(std::string_view value) { + Cursor cursor(value); + StringViewMap counts; + auto num_entries = cursor.u32(); + counts.reserve(num_entries); + for (std::uint32_t i = 0; i < num_entries; ++i) { + auto key = cursor.str(); + counts.emplace(std::move(key), cursor.u64()); + } + return counts; } -std::string encode_metadata_record(std::uint64_t checkpoint_size, - std::uint64_t total_lines, - std::uint64_t total_uc_size) { - std::string value; - 
append_u64(value, checkpoint_size); - append_u64(value, total_lines); - append_u64(value, total_uc_size); - return value; +NameSummaryResult decode_name_summary_value(std::string_view value) { + Cursor cursor(value); + NameSummaryResult result; + auto num_entries = cursor.u32(); + result.other_count = cursor.u64(); + result.unique_count = cursor.u64(); + result.counts.reserve(num_entries); + for (std::uint32_t i = 0; i < num_entries; ++i) { + auto key = cursor.str(); + result.counts.emplace(std::move(key), cursor.u64()); + } + return result; } -std::string encode_tar_archive_value(std::string_view archive_name, - std::uint64_t checkpoint_size, - std::uint64_t total_lines, - std::uint64_t total_uc_size, - std::uint64_t total_files) { - std::string value; - append_string(value, archive_name); - append_u64(value, checkpoint_size); - append_u64(value, total_lines); - append_u64(value, total_uc_size); - append_u64(value, total_files); - return value; +template +void for_each_count_map_entry(std::string_view value, Callback&& callback) { + Cursor cursor(value); + auto num_entries = cursor.u32(); + for (std::uint32_t i = 0; i < num_entries; ++i) { + auto key = cursor.str_view(); + auto count = cursor.u64(); + callback(key, count); + } } -IndexDatabase::TarArchiveMetadata decode_tar_archive_value( - std::string_view value) { +template +void for_each_name_summary_entry(std::string_view value, Callback&& callback) { Cursor cursor(value); - IndexDatabase::TarArchiveMetadata metadata; + auto num_entries = cursor.u32(); + (void)cursor.u64(); // other_count + (void)cursor.u64(); // unique_count + for (std::uint32_t i = 0; i < num_entries; ++i) { + auto key = cursor.str_view(); + auto count = cursor.u64(); + callback(key, count); + } +} + +TarArchiveMetadata decode_tar_archive_value(std::string_view value) { + Cursor cursor(value); + TarArchiveMetadata metadata; metadata.archive_name = cursor.str(); metadata.checkpoint_size = cursor.u64(); metadata.total_lines = cursor.u64(); @@ 
-495,17 +310,7 @@ IndexDatabase::TarArchiveMetadata decode_tar_archive_value( return metadata; } -std::string encode_tar_file_value(const IndexDatabase::TarFileRecord& record) { - std::string value; - append_u64(value, record.file_size); - append_u64(value, record.file_mtime); - append_u8(value, static_cast(record.typeflag)); - append_u64(value, record.data_offset); - return value; -} - -IndexDatabase::TarFileRecord decode_tar_file(std::string_view key, - std::string_view value) { +TarFileRecord decode_tar_file(std::string_view key, std::string_view value) { if (key.size() < 13) { throw std::runtime_error("Corrupt tar file key"); } @@ -516,7 +321,7 @@ IndexDatabase::TarFileRecord decode_tar_file(std::string_view key, } Cursor cursor(value); - IndexDatabase::TarFileRecord record; + TarFileRecord record; record.uncompressed_offset = rocks::KeyCodec::decode_be64(key.substr(4, 8)); record.file_name = std::string(key.substr(name_pos + 1)); record.file_size = cursor.u64(); @@ -551,22 +356,287 @@ void scan_prefix(const rocks::RocksDatabase& db, std::string_view column_family, } // namespace +namespace { + +/// Register merge operators for the AGGREGATION and SYSTEM_METRICS CFs on +/// every IndexDatabase open. Previously these operators were set only on +/// the separate handle returned by EventAggregator::open_with_merge_operator, +/// which meant the main IndexDatabase did NOT know how to combine merge +/// operands. Ingested SSTs from the distributed pipeline rely on the +/// operator being registered on the first opener of a given DB path +/// (RocksDBManager caches one instance per path, so later callers get the +/// same handle with these operators already configured). 
+::rocksdb::CompressionType select_compression_type() { +#ifdef DFTRACER_UTILS_ENABLE_ZSTD + return ::rocksdb::kZSTD; +#elif defined(DFTRACER_UTILS_ENABLE_LZ4) + return ::rocksdb::kLZ4Compression; +#else + return ::rocksdb::kZlibCompression; +#endif +} + +rocks::RocksDatabase::CfOptionsOverride make_aggregation_cf_override() { + using dftracer::utils::utilities::composites::dft::aggregators:: + AggregationMergeOperator; + using dftracer::utils::utilities::composites::dft::aggregators:: + SystemMetricsMergeOperator; + auto agg_merge_op = std::make_shared(); + auto sys_merge_op = std::make_shared(); + return [agg_merge_op, sys_merge_op](const std::string& cf_name, + ::rocksdb::ColumnFamilyOptions& opts) { + if (cf_name == cf::AGGREGATION) { + opts.merge_operator = agg_merge_op; + ::rocksdb::BlockBasedTableOptions bbt; + bbt.block_size = 32 * 1024; + bbt.format_version = 5; + bbt.index_block_restart_interval = 16; + bbt.whole_key_filtering = false; + opts.table_factory.reset(::rocksdb::NewBlockBasedTableFactory(bbt)); + opts.level0_file_num_compaction_trigger = 2; + opts.max_bytes_for_level_multiplier = 20; + opts.compression = select_compression_type(); + opts.bottommost_compression = select_compression_type(); + } else if (cf_name == cf::SYSTEM_METRICS) { + opts.merge_operator = sys_merge_op; + opts.compression = select_compression_type(); + opts.bottommost_compression = select_compression_type(); + } + }; +} + +} // namespace + IndexDatabase::IndexDatabase(const std::string& index_path, rocks::RocksDatabase::OpenMode open_mode) : db_path_(internal::normalize_index_root(index_path)), open_mode_(open_mode), - db_(rocks::RocksDBManager::instance().get_or_open(db_path_, open_mode_)) { + db_(rocks::RocksDBManager::instance().get_or_open( + db_path_, open_mode_, make_aggregation_cf_override())) { if (open_mode_ == rocks::RocksDatabase::OpenMode::ReadWrite) { - init_base_schema(); + init_schema(); + } +} + +std::unique_ptr IndexDatabase::begin_write() { + return 
std::unique_ptr( + new IndexDatabaseWriterContext(db_)); +} + +void IndexDatabase::bulk_ingest( + const SstArtifactRegistry& registry, + const std::unordered_set& skip_cfs) { + const auto skipped = [&](std::string_view cf_name) { + return skip_cfs.find(std::string(cf_name)) != skip_cfs.end(); + }; + const auto ingest = [&](std::string_view cf_name, + const std::vector& files) { + if (skipped(cf_name)) return; + auto status = db_->ingest_external_files(cf_name, files, + /*ingest_behind=*/false); + if (!status.ok()) { + throw_db_error("Failed to ingest SSTs into column family '" + + std::string(cf_name) + "'", + status); + } + }; + + ingest(cf::METADATA, registry.metadata()); + ingest(cf::CHECKPOINTS, registry.checkpoints()); + ingest(cf::MANIFEST, registry.manifest()); + ingest(cf::CHUNK_BLOOM, registry.chunk_bloom()); + ingest(cf::FILE_BLOOM, registry.file_bloom()); + ingest(cf::CHUNK_STATS, registry.chunk_stats()); + ingest(cf::CHUNK_DIM_STATS, registry.chunk_dim_stats()); + ingest(cf::DIMENSIONS, registry.dimensions()); + ingest(cf::FILE_SCALAR_STATS, registry.file_scalar_stats()); + ingest(cf::FILE_CAT_COUNTS, registry.file_cat_counts()); + ingest(cf::FILE_PID_TID_COUNTS, registry.file_pid_tid_counts()); + ingest(cf::FILE_NAME_COUNTS, registry.file_name_counts()); + // Multiple workers emit identical (name_id, name) dictionary pairs for + // shared event names, so SSTs across workers have overlapping key ranges. + // Regular ingest forbids overlap *within a single call*, so we ingest one + // SST at a time. The content-addressed values are deterministic (same + // name -> same hash), so the normal LSM sequence-number semantics (later + // ingest shadows earlier with identical value) preserve correctness + // without requiring `ingest_behind`. 
+ if (!skipped(cf::NAME_DICTIONARY)) { + for (const auto& path : registry.name_dictionary()) { + auto status = db_->ingest_external_files( + cf::NAME_DICTIONARY, {path}, /*ingest_behind=*/false); + if (!status.ok()) { + throw_db_error( + "Failed to ingest SST into column family 'name_dictionary'", + status); + } + } + } + ingest(cf::NAME_FILE_POSTINGS, registry.name_file_postings()); + ingest(cf::NAME_CHUNK_POSTINGS, registry.name_chunk_postings()); + // HASH_TABLES is content-addressed: same hash -> same name across workers. + // Same rationale as NAME_DICTIONARY: ingest one SST at a time so rocksdb + // can place overlapping files at L0 with new seqnos; deterministic values + // mean last-writer-wins resolves correctly. + if (!skipped(cf::HASH_TABLES)) { + for (const auto& path : registry.hash_tables()) { + auto status = db_->ingest_external_files(cf::HASH_TABLES, {path}, + /*ingest_behind=*/false); + if (!status.ok()) { + throw_db_error( + "Failed to ingest SST into column family 'hash_tables'", + status); + } + } + } + // AGGREGATION + SYSTEM_METRICS: workers emit mixed Put+Merge SSTs with + // overlapping (pid, time_bucket, ...) keys across workers. Ingest one + // SST at a time; the rocksdb merge_operator on these CFs collapses + // cross-worker merge operands at read/compaction time. 
+ if (!skipped(cf::AGGREGATION)) { + for (const auto& path : registry.aggregation()) { + auto status = db_->ingest_external_files(cf::AGGREGATION, {path}, + /*ingest_behind=*/false); + if (!status.ok()) { + throw_db_error( + "Failed to ingest SST into column family 'aggregation'", + status); + } + } + } + if (!skipped(cf::SYSTEM_METRICS)) { + for (const auto& path : registry.system_metrics()) { + auto status = db_->ingest_external_files(cf::SYSTEM_METRICS, {path}, + /*ingest_behind=*/false); + if (!status.ok()) { + throw_db_error( + "Failed to ingest SST into column family 'system_metrics'", + status); + } + } + } +} + +void IndexDatabase::rebuild_root_summaries() { + auto writer = begin_write(); + writer->rebuild_root_summaries(); + writer->commit(); +} + +void IndexDatabase::write_agg_global_config(std::uint64_t time_interval_us, + std::uint32_t config_hash) { + using dftracer::utils::utilities::composites::dft::aggregators:: + AGG_GLOBAL_CONFIG_KEY; + using dftracer::utils::utilities::composites::dft::aggregators:: + AggGlobalConfig; + using dftracer::utils::utilities::composites::dft::aggregators:: + serialize_agg_global_config; + + AggGlobalConfig cfg; + cfg.time_interval_us = time_interval_us; + cfg.config_hash = config_hash; + auto status = db_->put(std::string_view(AGG_GLOBAL_CONFIG_KEY, 2), + serialize_agg_global_config(cfg), cf::AGGREGATION); + if (!status.ok()) { + throw_db_error("Failed to write aggregation global config", status); + } +} + +void IndexDatabase::write_aggregation_tracker( + const std::vector& blobs) { + using dftracer::utils::utilities::composites::dft::aggregators:: + AssociationTracker; + + AssociationTracker unified; + for (const auto& b : blobs) { + if (b.empty()) continue; + unified.merge(AssociationTracker::deserialize(b)); + } + unified.finalize(); + constexpr std::string_view TRACKER_KEY = "__tracker__"; + auto status = db_->put(TRACKER_KEY, unified.serialize(), cf::AGGREGATION); + if (!status.ok()) { + throw_db_error("Failed to 
write aggregation tracker", status); + } +} + +void IndexDatabase::write_agg_file_markers(const std::vector& file_ids) { + using dftracer::utils::utilities::composites::dft::aggregators:: + make_agg_file_key; + + auto batch = db_->begin_batch(); + for (int file_id : file_ids) { + if (file_id < 0) continue; + db_->put(batch, cf::AGGREGATION, + make_agg_file_key(static_cast(file_id)), ""); + } + auto status = db_->commit_batch(batch); + if (!status.ok()) { + throw_db_error("Failed to write aggregation file markers", status); + } +} + +std::vector IndexDatabase::register_files( + const std::vector& file_paths, bool build_manifest) { + IndexFileEntryCapability caps = IndexFileEntryCapability::BLOOM | + IndexFileEntryCapability::CHECKPOINTS | + IndexFileEntryCapability::FILE_SUMMARY | + IndexFileEntryCapability::INDEXING_COMPLETE; + if (build_manifest) { + caps |= IndexFileEntryCapability::MANIFEST; + } + + std::vector ids; + ids.reserve(file_paths.size()); + auto writer = begin_write(); + for (const auto& path : file_paths) { + const auto logical = internal::get_logical_path(path); + const auto file_hash = internal::calculate_file_hash(path); + ids.push_back( + writer->get_or_create_file_info(logical, file_hash, caps)); } + writer->commit(); + return ids; } -void IndexDatabase::init_base_schema() { +int IndexDatabase::reserve_file_id_range(std::size_t count) { + if (count == 0) { + // Return the next id without advancing the counter. 
+ std::string value; + const auto key = std::string(encoding::NEXT_FILE_ID_KEY); + auto status = db_->get(key, &value); + if (status.IsNotFound()) return 1; + if (!status.ok()) { + throw_db_error("Failed to read next file id", status); + } + return static_cast(rocks::KeyCodec::decode_be32(value)); + } + + std::string value; + const auto key = std::string(encoding::NEXT_FILE_ID_KEY); + auto status = db_->get(key, &value); + + std::uint32_t first = 1; + if (status.ok()) { + first = rocks::KeyCodec::decode_be32(value); + } else if (!status.IsNotFound()) { + throw_db_error("Failed to read next file id", status); + } + + const std::uint32_t next = first + static_cast(count); + const auto encoded = rocks::KeyCodec::encode_be32(next); + auto put_status = db_->put(key, encoded); + if (!put_status.ok()) { + throw_db_error("Failed to advance next file id", put_status); + } + return static_cast(first); +} + +void IndexDatabase::init_schema() { std::string value; auto status = db_->get(schema_version_key(), &value); if (status.IsNotFound()) { status = db_->put(schema_version_key(), - rocks::KeyCodec::encode_be32(kSchemaVersion)); + rocks::KeyCodec::encode_be32(SCHEMA_VERSION)); if (!status.ok()) { throw_db_error("Failed to initialize schema version", status); } @@ -575,404 +645,844 @@ void IndexDatabase::init_base_schema() { } } -void IndexDatabase::init_bloom_schema() { - // RocksDB column families are provisioned at DB open; bloom-specific - // schema initialization is intentionally a no-op. -} - -void IndexDatabase::init_manifest_schema() { - // RocksDB column families are provisioned at DB open; manifest-specific - // schema initialization is intentionally a no-op. 
-} - bool IndexDatabase::has_bloom_data(int file_id) const { + auto caps = get_file_capabilities(file_id); + if (has_capability(caps, IndexFileEntryCapability::BLOOM)) return true; bool found = false; auto prefix = prefix_for_file(file_id); - scan_prefix(*db_, "chunk_bloom", prefix, + scan_prefix(*db_, cf::CHUNK_BLOOM, prefix, [&found](::rocksdb::Iterator&) { found = true; }); return found; } bool IndexDatabase::has_manifest_data(int file_id) const { + auto caps = get_file_capabilities(file_id); + if (has_capability(caps, IndexFileEntryCapability::MANIFEST)) return true; bool found = false; std::string prefix("E|"); rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); - scan_prefix(*db_, "manifest", prefix, + scan_prefix(*db_, cf::MANIFEST, prefix, [&found](::rocksdb::Iterator&) { found = true; }); return found; } -int IndexDatabase::get_or_create_file_info(std::string_view path, - std::uint64_t file_hash) { - const auto logical_name = std::string(path); - const auto lookup = file_lookup_key(logical_name); - std::string existing; - auto status = db_->get(lookup, &existing); - if (status.ok()) { - const auto file_id = decode_file_id(existing); - if (decode_file_hash(existing) == file_hash) { - return file_id; +IndexFileEntryCapability IndexDatabase::get_file_capabilities( + int file_id) const { + std::string name; + auto status = db_->get(file_reverse_key(file_id), &name); + if (!status.ok()) return IndexFileEntryCapability::NONE; + + std::string record; + status = db_->get(file_lookup_key(name), &record); + if (!status.ok()) return IndexFileEntryCapability::NONE; + + return decode_file_capabilities(record); +} + +int IndexDatabase::get_file_info_id(std::string_view path) const { + std::string value; + auto status = db_->get(file_lookup_key(path), &value); + if (status.IsNotFound()) { + return -1; + } + if (!status.ok()) { + throw_db_error("Failed to look up file info id", status); + } + return decode_file_id(value); +} + +std::optional 
IndexDatabase::get_file_hash( + std::string_view path) const { + std::string value; + auto status = db_->get(file_lookup_key(path), &value); + if (status.IsNotFound()) { + return std::nullopt; + } + if (!status.ok()) { + throw_db_error("Failed to look up file hash", status); + } + return decode_file_hash(value); +} + +std::unordered_map IndexDatabase::query_all_file_info_ids() + const { + std::unordered_map results; + internal::scan_prefix_iterator( + "Failed to scan file registry", "f|", + [this] { return db_->new_iterator(); }, + [&](::rocksdb::Iterator& it) { + auto key = iterator_key(it); + auto value = iterator_value(it); + results.emplace(key.substr(2), decode_file_id(value)); + }); + return results; +} + +std::unordered_map +IndexDatabase::query_all_file_registry() const { + std::unordered_map results; + internal::scan_prefix_iterator( + "Failed to scan file registry", "f|", + [this] { return db_->new_iterator(); }, + [&](::rocksdb::Iterator& it) { + auto key = iterator_key(it); + auto value = iterator_value(it); + FileRegistryEntry entry; + entry.file_id = decode_file_id(value); + entry.capabilities = decode_file_capabilities(value); + results.emplace(key.substr(2), entry); + }); + return results; +} + +std::unordered_set IndexDatabase::query_files_with_file_scalar_stats() + const { + std::unordered_set results; + auto it = db_->new_iterator(cf::FILE_SCALAR_STATS); + for (it->SeekToFirst(); it->Valid();) { + auto key = iterator_key(*it); + int file_id = decode_prefixed_file_id(key); + results.insert(file_id); + if (file_id == std::numeric_limits::max()) { + break; } - delete_file_data(file_id); - auto registry = encode_file_record(file_id, file_hash); - if (txn_batch_) { - status = db_->put(*txn_batch_, "default", lookup, registry); - if (!status.ok()) { - throw_db_error("Failed to update file registry", status); - } - status = db_->put(*txn_batch_, "default", file_reverse_key(file_id), - logical_name); - if (!status.ok()) { - throw_db_error("Failed to 
update reverse file registry", - status); - } + auto next_prefix = prefix_for_file(file_id + 1); + it->Seek(::rocksdb::Slice(next_prefix.data(), next_prefix.size())); + } + + const auto status = it->status(); + if (!status.ok()) { + throw IndexerError( + IndexerError::Type::DATABASE_ERROR, + "Failed to scan file scalar stats: " + status.ToString()); + } + + return results; +} + +std::unordered_set IndexDatabase::query_files_with_bloom_data() const { + std::unordered_set results; + auto it = db_->new_iterator(cf::CHUNK_BLOOM); + for (it->SeekToFirst(); it->Valid();) { + auto key = iterator_key(*it); + int file_id = decode_prefixed_file_id(key); + results.insert(file_id); + if (file_id == std::numeric_limits::max()) { + break; + } + auto next_prefix = prefix_for_file(file_id + 1); + it->Seek(::rocksdb::Slice(next_prefix.data(), next_prefix.size())); + } + + const auto status = it->status(); + if (!status.ok()) { + throw IndexerError(IndexerError::Type::DATABASE_ERROR, + "Failed to scan bloom data: " + status.ToString()); + } + return results; +} + +int IndexDatabase::find_file(std::string_view file_path) const { + return get_file_info_id(internal::get_logical_path(file_path)); +} + +std::optional IndexDatabase::query_name_id( + std::string_view name) const { + std::string value; + auto status = db_->get(name_lookup_key(name), &value, cf::NAME_DICTIONARY); + if (status.IsNotFound()) { + return std::nullopt; + } + if (!status.ok()) { + throw_db_error("Failed to query name dictionary", status); + } + return rocks::KeyCodec::decode_be64(value); +} + +std::optional IndexDatabase::query_name_by_id( + std::uint64_t name_id) const { + std::string value; + auto status = + db_->get(name_reverse_key(name_id), &value, cf::NAME_DICTIONARY); + if (status.IsNotFound()) { + return std::nullopt; + } + if (!status.ok()) { + throw_db_error("Failed to query name reverse dictionary", status); + } + return value; +} + +bool IndexDatabase::has_file_scalar_stats(int file_id) const { + 
std::string value; + auto status = + db_->get(file_scalar_stats_key(file_id), &value, cf::FILE_SCALAR_STATS); + if (status.IsNotFound()) { + return false; + } + if (!status.ok()) { + throw_db_error("Failed to check file scalar statistics", status); + } + return true; +} + +std::vector IndexDatabase::query_chunk_bloom_filters( + int file_id, std::string_view dimension) const { + std::vector results; + std::string prefix = prefix_for_file(file_id); + prefix.append(dimension); + prefix.push_back('\0'); + scan_prefix(*db_, cf::CHUNK_BLOOM, prefix, [&](::rocksdb::Iterator& it) { + results.push_back(decode_chunk_bloom( + iterator_key(it), iterator_value(it), prefix.size() - 1)); + }); + return results; +} + +std::unordered_map> +IndexDatabase::query_chunk_bloom_filters_batch( + int file_id, const std::vector& dimensions) const { + std::unordered_map> results; + for (const auto& dimension : dimensions) { + results.emplace(dimension, + query_chunk_bloom_filters(file_id, dimension)); + } + return results; +} + +std::optional IndexDatabase::query_file_bloom_filter( + int file_id, std::string_view dimension) const { + std::string value; + auto status = + db_->get(file_bloom_key(file_id, dimension), &value, cf::FILE_BLOOM); + if (status.IsNotFound()) { + return std::nullopt; + } + if (!status.ok()) { + throw_db_error("Failed to query file bloom filter", status); + } + return decode_file_bloom(value); +} + +std::unordered_map +IndexDatabase::query_file_bloom_filters_batch( + int file_id, const std::vector& dimensions) const { + std::unordered_map results; + for (const auto& dimension : dimensions) { + auto bloom = query_file_bloom_filter(file_id, dimension); + if (bloom) { + results.emplace(dimension, std::move(*bloom)); + } + } + return results; +} + +std::vector IndexDatabase::query_index_dimensions( + int file_id) const { + std::vector dimensions; + std::string prefix("d|"); + rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); + scan_prefix(*db_, cf::DIMENSIONS, 
prefix, [&](::rocksdb::Iterator& it) { + auto key = iterator_key(it); + dimensions.push_back(key.substr(prefix.size())); + }); + return dimensions; +} + +bool IndexDatabase::has_index_dimension(int file_id, + std::string_view dimension) const { + std::string value; + return db_ + ->get(make_dimension_key(file_id, dimension), &value, cf::DIMENSIONS) + .ok(); +} + +std::vector IndexDatabase::query_chunk_statistics( + int file_id) const { + std::vector results; + const auto prefix = prefix_for_file(file_id); + scan_prefix(*db_, cf::CHUNK_STATS, prefix, [&](::rocksdb::Iterator& it) { + ChunkStatisticsResult result; + auto key = iterator_key(it); + result.checkpoint_idx = + rocks::KeyCodec::decode_be64(std::string_view(key).substr(4, 8)); + result.stats = decode_chunk_statistics_value(iterator_value(it)); + results.push_back(std::move(result)); + }); + std::sort(results.begin(), results.end(), + [](const auto& lhs, const auto& rhs) { + return lhs.checkpoint_idx < rhs.checkpoint_idx; + }); + return results; +} + +std::unordered_map> +IndexDatabase::query_chunk_statistics_batch( + const std::vector& file_ids) const { + std::unordered_map> results; + if (file_ids.empty()) { + return results; + } + results.reserve(file_ids.size()); + + std::unordered_set wanted(file_ids.begin(), file_ids.end()); + const auto [min_it, max_it] = + std::minmax_element(file_ids.begin(), file_ids.end()); + const auto min_prefix = prefix_for_file(*min_it); + const int max_file_id = *max_it; + + auto it = db_->new_iterator(cf::CHUNK_STATS); + for (it->Seek(::rocksdb::Slice(min_prefix.data(), min_prefix.size())); + it->Valid(); it->Next()) { + auto key = iterator_key(*it); + int file_id = decode_prefixed_file_id(key); + if (file_id > max_file_id) { + break; + } + if (!wanted.contains(file_id)) { + continue; + } + + ChunkStatisticsResult result; + result.checkpoint_idx = + rocks::KeyCodec::decode_be64(std::string_view(key).substr(4, 8)); + result.stats = 
decode_chunk_statistics_value(iterator_value(*it)); + results[file_id].push_back(std::move(result)); + } + + const auto status = it->status(); + if (!status.ok()) { + throw IndexerError( + IndexerError::Type::DATABASE_ERROR, + "Failed to batch query chunk statistics: " + status.ToString()); + } + + for (auto& [_, entries] : results) { + std::sort(entries.begin(), entries.end(), + [](const auto& lhs, const auto& rhs) { + return lhs.checkpoint_idx < rhs.checkpoint_idx; + }); + } + return results; +} + +std::unordered_map +IndexDatabase::query_merged_statistics_batch( + const std::vector& file_ids) const { + std::unordered_map results; + if (file_ids.empty()) { + return results; + } + results.reserve(file_ids.size()); + + std::unordered_set wanted(file_ids.begin(), file_ids.end()); + const auto [min_it, max_it] = + std::minmax_element(file_ids.begin(), file_ids.end()); + const auto min_prefix = prefix_for_file(*min_it); + const int max_file_id = *max_it; + + auto stats_it = db_->new_iterator(cf::CHUNK_STATS); + for (stats_it->Seek(::rocksdb::Slice(min_prefix.data(), min_prefix.size())); + stats_it->Valid(); stats_it->Next()) { + auto key = iterator_key(*stats_it); + int file_id = decode_prefixed_file_id(key); + if (file_id > max_file_id) { + break; + } + if (!wanted.contains(file_id)) { + continue; + } + + auto decoded = decode_chunk_statistics_value(iterator_value(*stats_it)); + auto& merged = results[file_id]; + if (merged.num_chunks == 0) { + merged.stats = std::move(decoded); } else { - status = db_->put(lookup, registry); - if (!status.ok()) { - throw_db_error("Failed to update file registry", status); + merged.stats.merge_from(decoded); + } + ++merged.num_chunks; + } + + auto stats_status = stats_it->status(); + if (!stats_status.ok()) { + throw IndexerError(IndexerError::Type::DATABASE_ERROR, + "Failed to batch merge chunk statistics: " + + stats_status.ToString()); + } + + auto dims_it = db_->new_iterator(cf::CHUNK_DIM_STATS); + for 
(dims_it->Seek(::rocksdb::Slice(min_prefix.data(), min_prefix.size())); + dims_it->Valid(); dims_it->Next()) { + auto key = iterator_key(*dims_it); + int file_id = decode_prefixed_file_id(key); + if (file_id > max_file_id) { + break; + } + if (!wanted.contains(file_id)) { + continue; + } + + auto decoded = + decode_chunk_dimension_stats_value(key, iterator_value(*dims_it)); + if (!decoded.has_value_counts_payload()) continue; + decoded.ensure_value_counts_decoded(); + if (!decoded.value_counts) continue; + + auto& merged = results[file_id].stats; + if (decoded.dimension == "cat") { + for (const auto& [k, v] : *decoded.value_counts) { + merged.category_counts[k] += v; } - status = db_->put(file_reverse_key(file_id), logical_name); - if (!status.ok()) { - throw_db_error("Failed to update reverse file registry", - status); + } else if (decoded.dimension == "name") { + for (const auto& [k, v] : *decoded.value_counts) { + merged.name_counts[k] += v; + } + } else if (decoded.dimension == "pid_tid") { + for (const auto& [k, v] : *decoded.value_counts) { + merged.pid_tid_counts[k] += v; } } - return file_id; } - if (!status.IsNotFound()) { - throw_db_error("Failed to query file registry", status); + + auto dims_status = dims_it->status(); + if (!dims_status.ok()) { + throw IndexerError(IndexerError::Type::DATABASE_ERROR, + "Failed to batch merge chunk dimension stats: " + + dims_status.ToString()); } - std::uint32_t next_id = 1; - std::string next_value; - status = db_->get(next_file_id_key(), &next_value); - if (status.ok()) { - next_id = rocks::KeyCodec::decode_be32(next_value); - } else if (!status.IsNotFound()) { - throw_db_error("Failed to read next file id", status); + return results; +} + +std::unordered_map +IndexDatabase::query_file_scalar_stats_batch( + const std::vector& file_ids) const { + std::unordered_map results; + results.reserve(file_ids.size()); + for (const auto file_id : file_ids) { + std::string value; + auto status = 
db_->get(file_scalar_stats_key(file_id), &value, + cf::FILE_SCALAR_STATS); + if (status.IsNotFound()) { + continue; + } + if (!status.ok()) { + throw_db_error("Failed to read file scalar statistics", status); + } + try { + DecodeContextGuard ctx("file_scalar_stats file_id=%d size=%zu", + file_id, value.size()); + results.emplace(file_id, decode_file_scalar_stats_value(value)); + } catch (const std::exception& e) { + throw std::runtime_error( + "Corrupt file_scalar_stats payload file_id=" + + std::to_string(file_id) + + " size=" + std::to_string(value.size()) + ": " + e.what()); + } } + return results; +} - const auto file_id = static_cast(next_id); - const auto new_registry = encode_file_record(file_id, file_hash); - const auto next_registry = rocks::KeyCodec::encode_be32(next_id + 1); +std::unordered_map +IndexDatabase::query_file_metadata_batch( + const std::vector& file_ids) const { + std::unordered_map results; + if (file_ids.empty()) { + return results; + } + results.reserve(file_ids.size()); + + std::unordered_set wanted(file_ids.begin(), file_ids.end()); + const auto [min_it, max_it] = + std::minmax_element(file_ids.begin(), file_ids.end()); + const auto min_prefix = prefix_for_file(*min_it); + const int max_file_id = *max_it; + + auto it = db_->new_iterator(cf::METADATA); + for (it->Seek(::rocksdb::Slice(min_prefix.data(), min_prefix.size())); + it->Valid(); it->Next()) { + auto key = iterator_key(*it); + int file_id = decode_prefixed_file_id(key); + if (file_id > max_file_id) { + break; + } + if (!wanted.contains(file_id)) { + continue; + } - if (txn_batch_) { - status = db_->put(*txn_batch_, "default", lookup, new_registry); + auto value = iterator_value(*it); + DecodeContextGuard ctx("metadata file_id=%d size=%zu", file_id, + value.size()); + auto decoded = decode_metadata_record(value); + auto& meta = results[file_id]; + meta.checkpoint_size = decoded[0]; + meta.num_lines = decoded[1]; + meta.max_bytes = decoded[2]; + } + + const auto status = 
it->status(); + if (!status.ok()) { + throw IndexerError( + IndexerError::Type::DATABASE_ERROR, + "Failed to batch read file metadata: " + status.ToString()); + } + return results; +} + +std::unordered_map> +IndexDatabase::query_file_category_counts_batch( + const std::vector& file_ids) const { + std::unordered_map> results; + results.reserve(file_ids.size()); + for (const auto file_id : file_ids) { + std::string value; + auto status = db_->get(file_category_counts_key(file_id), &value, + cf::FILE_CAT_COUNTS); + if (status.IsNotFound()) { + continue; + } if (!status.ok()) { - throw_db_error("Failed to insert file registry", status); + throw_db_error("Failed to read file category counts", status); + } + try { + DecodeContextGuard ctx("file_cat_counts file_id=%d size=%zu", + file_id, value.size()); + results.emplace(file_id, decode_count_map_value(value)); + } catch (const std::exception& e) { + throw std::runtime_error( + "Corrupt file_cat_counts payload file_id=" + + std::to_string(file_id) + + " size=" + std::to_string(value.size()) + ": " + e.what()); + } + } + return results; +} + +void IndexDatabase::merge_file_category_counts_batch_into( + const std::vector& file_ids, + std::unordered_map& targets) const { + for (const auto file_id : file_ids) { + auto target_it = targets.find(file_id); + if (target_it == targets.end() || target_it->second == nullptr) { + continue; + } + + std::string value; + auto status = db_->get(file_category_counts_key(file_id), &value, + cf::FILE_CAT_COUNTS); + if (status.IsNotFound()) { + continue; } - status = db_->put(*txn_batch_, "default", file_reverse_key(file_id), - logical_name); if (!status.ok()) { - throw_db_error("Failed to insert reverse file registry", status); + throw_db_error("Failed to read file category counts", status); + } + + auto* stats = target_it->second; + DecodeContextGuard ctx("file_cat_counts merge file_id=%d size=%zu", + file_id, value.size()); + for_each_count_map_entry( + value, [stats](std::string_view key, 
std::uint64_t count) { + auto entry = + stats->category_counts.try_emplace(std::string(key), 0); + entry.first->second += count; + }); + } +} + +std::unordered_map> +IndexDatabase::query_file_pid_tid_counts_batch( + const std::vector& file_ids) const { + std::unordered_map> results; + results.reserve(file_ids.size()); + for (const auto file_id : file_ids) { + std::string value; + auto status = db_->get(file_pid_tid_counts_key(file_id), &value, + cf::FILE_PID_TID_COUNTS); + if (status.IsNotFound()) { + continue; } - status = - db_->put(*txn_batch_, "default", next_file_id_key(), next_registry); if (!status.ok()) { - throw_db_error("Failed to update next file id", status); + throw_db_error("Failed to read file pid_tid counts", status); + } + try { + DecodeContextGuard ctx("file_pid_tid_counts file_id=%d size=%zu", + file_id, value.size()); + results.emplace(file_id, decode_count_map_value(value)); + } catch (const std::exception& e) { + throw std::runtime_error( + "Corrupt file_pid_tid_counts payload file_id=" + + std::to_string(file_id) + + " size=" + std::to_string(value.size()) + ": " + e.what()); + } + } + return results; +} + +std::unordered_map +IndexDatabase::query_file_name_summaries_batch( + const std::vector& file_ids) const { + std::unordered_map results; + results.reserve(file_ids.size()); + for (const auto file_id : file_ids) { + std::string value; + auto status = db_->get(file_name_counts_key(file_id), &value, + cf::FILE_NAME_COUNTS); + if (status.IsNotFound()) { + continue; } - } else { - status = db_->put(lookup, new_registry); if (!status.ok()) { - throw_db_error("Failed to insert file registry", status); + throw_db_error("Failed to read file name counts", status); + } + try { + DecodeContextGuard ctx("file_name_counts file_id=%d size=%zu", + file_id, value.size()); + results.emplace(file_id, decode_name_summary_value(value)); + } catch (const std::exception& e) { + throw std::runtime_error( + "Corrupt file_name_counts payload file_id=" + + 
std::to_string(file_id) + + " size=" + std::to_string(value.size()) + ": " + e.what()); + } + } + return results; +} + +void IndexDatabase::merge_file_pid_tid_counts_batch_into( + const std::vector& file_ids, + std::unordered_map& targets) const { + for (const auto file_id : file_ids) { + auto target_it = targets.find(file_id); + if (target_it == targets.end() || target_it->second == nullptr) { + continue; + } + + std::string value; + auto status = db_->get(file_pid_tid_counts_key(file_id), &value, + cf::FILE_PID_TID_COUNTS); + if (status.IsNotFound()) { + continue; } - status = db_->put(file_reverse_key(file_id), logical_name); if (!status.ok()) { - throw_db_error("Failed to insert reverse file registry", status); + throw_db_error("Failed to read file pid_tid counts", status); + } + + auto* stats = target_it->second; + DecodeContextGuard ctx("file_pid_tid_counts merge file_id=%d size=%zu", + file_id, value.size()); + for_each_count_map_entry(value, [stats](std::string_view key, + std::uint64_t count) { + auto entry = stats->pid_tid_counts.try_emplace(std::string(key), 0); + entry.first->second += count; + }); + } +} + +void IndexDatabase::merge_file_name_counts_batch_into( + const std::vector& file_ids, + std::unordered_map& targets) const { + for (const auto file_id : file_ids) { + auto target_it = targets.find(file_id); + if (target_it == targets.end() || target_it->second == nullptr) { + continue; + } + + std::string value; + auto status = db_->get(file_name_counts_key(file_id), &value, + cf::FILE_NAME_COUNTS); + if (status.IsNotFound()) { + continue; } - status = db_->put(next_file_id_key(), next_registry); if (!status.ok()) { - throw_db_error("Failed to update next file id", status); + throw_db_error("Failed to read file name counts", status); } + + auto* stats = target_it->second; + DecodeContextGuard ctx("file_name_counts merge file_id=%d size=%zu", + file_id, value.size()); + for_each_name_summary_entry(value, [stats](std::string_view key, + std::uint64_t 
count) { + auto entry = stats->name_counts.try_emplace(std::string(key), 0); + entry.first->second += count; + }); } - - return file_id; } -int IndexDatabase::get_file_info_id(std::string_view path) const { +std::optional IndexDatabase::query_root_scalar_stats() + const { std::string value; - auto status = db_->get(file_lookup_key(path), &value); + auto status = + db_->get(root_scalar_stats_key(), &value, cf::ROOT_SCALAR_STATS); if (status.IsNotFound()) { - return -1; + return std::nullopt; } if (!status.ok()) { - throw_db_error("Failed to look up file info id", status); + throw_db_error("Failed to read root scalar statistics", status); + } + try { + DecodeContextGuard ctx("root_scalar_stats size=%zu", value.size()); + return decode_root_scalar_stats_value(value); + } catch (const std::exception& e) { + throw std::runtime_error("Corrupt root_scalar_stats payload size=" + + std::to_string(value.size()) + ": " + + e.what()); } - return decode_file_id(value); } -std::optional IndexDatabase::get_file_hash( - std::string_view path) const { +StringViewMap IndexDatabase::query_root_category_counts() const { std::string value; - auto status = db_->get(file_lookup_key(path), &value); + auto status = + db_->get(root_category_counts_key(), &value, cf::ROOT_CAT_COUNTS); if (status.IsNotFound()) { - return std::nullopt; + return {}; } if (!status.ok()) { - throw_db_error("Failed to look up file hash", status); + throw_db_error("Failed to read root category counts", status); + } + try { + DecodeContextGuard ctx("root_cat_counts size=%zu", value.size()); + return decode_count_map_value(value); + } catch (const std::exception& e) { + throw std::runtime_error("Corrupt root_cat_counts payload size=" + + std::to_string(value.size()) + ": " + + e.what()); } - return decode_file_hash(value); -} - -int IndexDatabase::find_file(std::string_view file_path) const { - return get_file_info_id(internal::get_logical_path(file_path)); -} - -void IndexDatabase::begin_transaction() { - txn_batch_ 
= - std::make_unique(db_->begin_batch()); } -void IndexDatabase::commit_transaction() { - if (!txn_batch_) { - return; +StringViewMap IndexDatabase::query_root_pid_tid_counts() const { + std::string value; + auto status = + db_->get(root_pid_tid_counts_key(), &value, cf::ROOT_PID_TID_COUNTS); + if (status.IsNotFound()) { + return {}; } - auto status = db_->commit_batch(*txn_batch_); - txn_batch_.reset(); if (!status.ok()) { - throw_db_error("Failed to commit RocksDB batch", status); + throw_db_error("Failed to read root pid_tid counts", status); } -} - -void IndexDatabase::rollback_transaction() noexcept { txn_batch_.reset(); } - -void IndexDatabase::insert_chunk_bloom_filter( - int file_id, std::uint64_t checkpoint_idx, std::string_view dimension, - std::span blob_data, std::uint64_t num_entries) { - const auto key = chunk_bloom_key(file_id, dimension, checkpoint_idx); - const auto value = encode_bloom_value(blob_data, num_entries); - auto status = txn_batch_ ? db_->put(*txn_batch_, "chunk_bloom", key, value) - : db_->put(key, value, "chunk_bloom"); - if (!status.ok()) { - throw_db_error("Failed to insert chunk bloom filter", status); + try { + DecodeContextGuard ctx("root_pid_tid_counts size=%zu", value.size()); + return decode_count_map_value(value); + } catch (const std::exception& e) { + throw std::runtime_error("Corrupt root_pid_tid_counts payload size=" + + std::to_string(value.size()) + ": " + + e.what()); } } -void IndexDatabase::insert_chunk_bloom_filter( - int file_id, std::uint64_t checkpoint_idx, std::string_view dimension, - const void* blob_data, int blob_size, std::uint64_t num_entries) { - auto* bytes = static_cast(blob_data); - insert_chunk_bloom_filter(file_id, checkpoint_idx, dimension, - std::span( - bytes, static_cast(blob_size)), - num_entries); -} - -void IndexDatabase::insert_file_bloom_filter( - int file_id, std::string_view dimension, - std::span blob_data, std::uint64_t num_entries) { - const auto key = file_bloom_key(file_id, 
dimension); - const auto value = encode_bloom_value(blob_data, num_entries); - auto status = txn_batch_ ? db_->put(*txn_batch_, "file_bloom", key, value) - : db_->put(key, value, "file_bloom"); - if (!status.ok()) { - throw_db_error("Failed to insert file bloom filter", status); +StringViewMap IndexDatabase::query_root_name_counts() const { + std::string value; + auto status = + db_->get(root_name_counts_key(), &value, cf::ROOT_NAME_COUNTS); + if (status.IsNotFound()) { + return {}; } -} - -void IndexDatabase::insert_file_bloom_filter(int file_id, - std::string_view dimension, - const void* blob_data, - int blob_size, - std::uint64_t num_entries) { - auto* bytes = static_cast(blob_data); - insert_file_bloom_filter(file_id, dimension, - std::span( - bytes, static_cast(blob_size)), - num_entries); -} - -void IndexDatabase::insert_chunk_statistics(int file_id, - std::uint64_t checkpoint_idx, - const ChunkStatistics& stats) { - const auto key = chunk_stats_key(file_id, checkpoint_idx); - const auto value = encode_chunk_statistics_value(stats); - auto status = txn_batch_ ? db_->put(*txn_batch_, "chunk_stats", key, value) - : db_->put(key, value, "chunk_stats"); if (!status.ok()) { - throw_db_error("Failed to insert chunk statistics", status); + throw_db_error("Failed to read root name counts", status); } -} - -void IndexDatabase::insert_checkpoint(int file_id, - const IndexerCheckpoint& checkpoint) { - const auto key = checkpoint_key(file_id, checkpoint.uc_offset, - checkpoint.checkpoint_idx); - const auto value = encode_checkpoint_value(checkpoint); - auto status = txn_batch_ ? 
db_->put(*txn_batch_, "checkpoints", key, value) - : db_->put(key, value, "checkpoints"); - if (!status.ok()) { - throw_db_error("Failed to insert checkpoint", status); + try { + DecodeContextGuard ctx("root_name_counts size=%zu", value.size()); + return decode_count_map_value(value); + } catch (const std::exception& e) { + throw std::runtime_error("Corrupt root_name_counts payload size=" + + std::to_string(value.size()) + ": " + + e.what()); } } -void IndexDatabase::insert_index_dimension(int file_id, - std::string_view dimension) { - const auto key = make_dimension_key(file_id, dimension); - auto status = txn_batch_ ? db_->put(*txn_batch_, "dimensions", key, "") - : db_->put(key, "", "dimensions"); - if (!status.ok()) { - throw_db_error("Failed to insert index dimension", status); - } -} - -void IndexDatabase::insert_hash_resolution(int file_id, - std::string_view dimension, - std::string_view hash_value, - std::string_view resolved_value) { - const auto owner = make_hash_owner_key(file_id, dimension, hash_value); - const auto forward = make_hash_forward_key(dimension, hash_value); - const auto reverse = - make_hash_reverse_key(dimension, resolved_value, hash_value); - if (txn_batch_) { - db_->put(*txn_batch_, "dimensions", owner, std::string(resolved_value)); - db_->put(*txn_batch_, "dimensions", forward, - std::string(resolved_value)); - db_->put(*txn_batch_, "dimensions", reverse, ""); +void IndexDatabase::merge_root_category_counts_into( + ChunkStatistics& target) const { + std::string value; + auto status = + db_->get(root_category_counts_key(), &value, cf::ROOT_CAT_COUNTS); + if (status.IsNotFound()) { return; } - auto status = db_->put(owner, resolved_value, "dimensions"); - if (!status.ok()) throw_db_error("Failed to insert hash owner", status); - status = db_->put(forward, resolved_value, "dimensions"); - if (!status.ok()) - throw_db_error("Failed to insert hash resolution", status); - status = db_->put(reverse, "", "dimensions"); if (!status.ok()) { - 
throw_db_error("Failed to insert reverse hash resolution", status); - } -} - -void IndexDatabase::insert_chunk_dimension_stats( - int file_id, std::uint64_t checkpoint_idx, const ChunkDimensionStats& stats, - std::size_t value_counts_cap) { - const auto key = - chunk_dim_stats_key(file_id, checkpoint_idx, stats.dimension); - const auto value = - encode_chunk_dimension_stats_value(stats, value_counts_cap); - auto status = txn_batch_ - ? db_->put(*txn_batch_, "chunk_dim_stats", key, value) - : db_->put(key, value, "chunk_dim_stats"); - if (!status.ok()) { - throw_db_error("Failed to insert chunk dimension stats", status); + throw_db_error("Failed to read root category counts", status); } + for_each_count_map_entry(value, [&target](std::string_view key, + std::uint64_t count) { + auto entry = target.category_counts.try_emplace(std::string(key), 0); + entry.first->second += count; + }); } -void IndexDatabase::insert_tar_archive_metadata(int file_id, - std::string_view archive_name, - std::uint64_t checkpoint_size, - std::uint64_t total_lines, - std::uint64_t total_uc_size, - std::uint64_t total_files) { - const auto key = tar_archive_key(file_id); - const auto value = encode_tar_archive_value( - archive_name, checkpoint_size, total_lines, total_uc_size, total_files); - auto status = txn_batch_ ? db_->put(*txn_batch_, "archives", key, value) - : db_->put(key, value, "archives"); - if (!status.ok()) { - throw_db_error("Failed to insert tar archive metadata", status); +void IndexDatabase::merge_root_pid_tid_counts_into( + ChunkStatistics& target) const { + std::string value; + auto status = + db_->get(root_pid_tid_counts_key(), &value, cf::ROOT_PID_TID_COUNTS); + if (status.IsNotFound()) { + return; } -} - -void IndexDatabase::insert_tar_file(int file_id, const TarFileRecord& record) { - const auto key = - tar_file_key(file_id, record.uncompressed_offset, record.file_name); - const auto value = encode_tar_file_value(record); - auto status = txn_batch_ ? 
db_->put(*txn_batch_, "tar_files", key, value) - : db_->put(key, value, "tar_files"); if (!status.ok()) { - throw_db_error("Failed to insert tar file metadata", status); - } -} - -std::vector -IndexDatabase::query_chunk_bloom_filters(int file_id, - std::string_view dimension) const { - std::vector results; - std::string prefix = prefix_for_file(file_id); - prefix.append(dimension); - prefix.push_back('\0'); - scan_prefix(*db_, "chunk_bloom", prefix, [&](::rocksdb::Iterator& it) { - results.push_back(decode_chunk_bloom( - iterator_key(it), iterator_value(it), prefix.size() - 1)); - }); - return results; -} - -std::unordered_map> -IndexDatabase::query_chunk_bloom_filters_batch( - int file_id, const std::vector& dimensions) const { - std::unordered_map> results; - for (const auto& dimension : dimensions) { - results.emplace(dimension, - query_chunk_bloom_filters(file_id, dimension)); + throw_db_error("Failed to read root pid_tid counts", status); } - return results; + for_each_count_map_entry( + value, [&target](std::string_view key, std::uint64_t count) { + auto entry = target.pid_tid_counts.try_emplace(std::string(key), 0); + entry.first->second += count; + }); } -std::optional -IndexDatabase::query_file_bloom_filter(int file_id, - std::string_view dimension) const { +void IndexDatabase::merge_root_name_counts_into(ChunkStatistics& target) const { std::string value; auto status = - db_->get(file_bloom_key(file_id, dimension), &value, "file_bloom"); + db_->get(root_name_counts_key(), &value, cf::ROOT_NAME_COUNTS); if (status.IsNotFound()) { - return std::nullopt; + return; } if (!status.ok()) { - throw_db_error("Failed to query file bloom filter", status); + throw_db_error("Failed to read root name counts", status); } - return decode_file_bloom(value); + for_each_count_map_entry( + value, [&target](std::string_view key, std::uint64_t count) { + auto entry = target.name_counts.try_emplace(std::string(key), 0); + entry.first->second += count; + }); } 
-std::unordered_map -IndexDatabase::query_file_bloom_filters_batch( - int file_id, const std::vector& dimensions) const { - std::unordered_map results; - for (const auto& dimension : dimensions) { - auto bloom = query_file_bloom_filter(file_id, dimension); - if (bloom) { - results.emplace(dimension, std::move(*bloom)); - } +std::vector IndexDatabase::query_name_file_postings( + std::string_view name) const { + auto name_id = query_name_id(name); + if (!name_id) { + return {}; } - return results; -} -std::vector IndexDatabase::query_index_dimensions( - int file_id) const { - std::vector dimensions; - std::string prefix("d|"); - rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); - scan_prefix(*db_, "dimensions", prefix, [&](::rocksdb::Iterator& it) { - auto key = iterator_key(it); - dimensions.push_back(key.substr(prefix.size())); - }); - return dimensions; + std::vector results; + std::string prefix("n|"); + rocks::KeyCodec::append_be64(prefix, *name_id); + scan_prefix( + *db_, cf::NAME_FILE_POSTINGS, prefix, + [&results](::rocksdb::Iterator& it) { + auto key = iterator_key(it); + // "n|" (2) + be64 name_id (8) + be32 file_id (4) = 14 bytes + if (key.size() != 14) return; + results.push_back(static_cast(rocks::KeyCodec::decode_be32( + std::string_view(key.data() + 10, 4)))); + }); + return results; } -bool IndexDatabase::has_index_dimension(int file_id, - std::string_view dimension) const { - std::string value; - return db_ - ->get(make_dimension_key(file_id, dimension), &value, "dimensions") - .ok(); -} +std::vector IndexDatabase::query_name_chunk_postings( + std::string_view name, int file_id) const { + auto name_id = query_name_id(name); + if (!name_id) { + return {}; + } -std::vector -IndexDatabase::query_chunk_statistics(int file_id) const { - std::vector results; - const auto prefix = prefix_for_file(file_id); - scan_prefix(*db_, "chunk_stats", prefix, [&](::rocksdb::Iterator& it) { - ChunkStatisticsResult result; - auto key = iterator_key(it); - 
result.checkpoint_idx = - rocks::KeyCodec::decode_be64(std::string_view(key).substr(4, 8)); - result.stats = decode_chunk_statistics_value(iterator_value(it)); - results.push_back(std::move(result)); - }); - std::sort(results.begin(), results.end(), - [](const auto& lhs, const auto& rhs) { - return lhs.checkpoint_idx < rhs.checkpoint_idx; - }); + std::vector results; + std::string prefix("n|"); + rocks::KeyCodec::append_be64(prefix, *name_id); + rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); + scan_prefix(*db_, cf::NAME_CHUNK_POSTINGS, prefix, + [&results](::rocksdb::Iterator& it) { + auto key = iterator_key(it); + // "n|" (2) + be64 name_id (8) + be32 file_id (4) + + // be64 checkpoint_idx (8) = 22 bytes + if (key.size() != 22) return; + results.push_back(rocks::KeyCodec::decode_be64( + std::string_view(key.data() + 14, 8))); + }); return results; } @@ -984,25 +1494,28 @@ bool IndexDatabase::find_checkpoint(int file_id, std::size_t target_offset, bool found = false; const auto prefix = prefix_for_file(file_id); - scan_prefix(*db_, "checkpoints", prefix, [&](::rocksdb::Iterator& it) { - auto decoded = decode_checkpoint(iterator_key(it), iterator_value(it)); - if (decoded.uc_offset <= target_offset && - (!found || decoded.uc_offset >= checkpoint.uc_offset)) { - checkpoint = std::move(decoded); - found = true; - } - }); + scan_prefix(*db_, rocks::cf::CHECKPOINTS, prefix, + [&](::rocksdb::Iterator& it) { + auto decoded = + decode_checkpoint(iterator_key(it), iterator_value(it)); + if (decoded.uc_offset <= target_offset && + (!found || decoded.uc_offset >= checkpoint.uc_offset)) { + checkpoint = std::move(decoded); + found = true; + } + }); return found; } -std::vector IndexDatabase::query_checkpoints( +std::vector IndexDatabase::query_checkpoints( int file_id) const { std::vector checkpoints; const auto prefix = prefix_for_file(file_id); - scan_prefix(*db_, "checkpoints", prefix, [&](::rocksdb::Iterator& it) { - checkpoints.push_back( - 
decode_checkpoint(iterator_key(it), iterator_value(it))); - }); + scan_prefix( + *db_, rocks::cf::CHECKPOINTS, prefix, [&](::rocksdb::Iterator& it) { + checkpoints.push_back( + decode_checkpoint(iterator_key(it), iterator_value(it))); + }); std::sort(checkpoints.begin(), checkpoints.end(), [](const auto& lhs, const auto& rhs) { return std::tie(lhs.uc_offset, lhs.checkpoint_idx) < @@ -1011,10 +1524,10 @@ std::vector IndexDatabase::query_checkpoints( return checkpoints; } -std::optional -IndexDatabase::query_tar_archive_metadata(int file_id) const { +std::optional IndexDatabase::query_tar_archive_metadata( + int file_id) const { std::string value; - auto status = db_->get(tar_archive_key(file_id), &value, "archives"); + auto status = db_->get(tar_archive_key(file_id), &value, cf::ARCHIVES); if (status.IsNotFound()) { return std::nullopt; } @@ -1024,11 +1537,10 @@ IndexDatabase::query_tar_archive_metadata(int file_id) const { return decode_tar_archive_value(value); } -std::vector IndexDatabase::query_tar_files( - int file_id) const { +std::vector IndexDatabase::query_tar_files(int file_id) const { std::vector files; const auto prefix = prefix_for_file(file_id); - scan_prefix(*db_, "tar_files", prefix, [&](::rocksdb::Iterator& it) { + scan_prefix(*db_, cf::TAR_FILES, prefix, [&](::rocksdb::Iterator& it) { files.push_back(decode_tar_file(iterator_key(it), iterator_value(it))); }); std::sort(files.begin(), files.end(), [](const auto& lhs, const auto& rhs) { @@ -1049,9 +1561,8 @@ bool IndexDatabase::find_tar_file(int file_id, std::string_view file_name, return false; } -std::vector -IndexDatabase::query_tar_files_in_range(int file_id, std::uint64_t start_offset, - std::uint64_t end_offset) const { +std::vector IndexDatabase::query_tar_files_in_range( + int file_id, std::uint64_t start_offset, std::uint64_t end_offset) const { std::vector files; for (auto& entry : query_tar_files(file_id)) { const auto entry_end = entry.uncompressed_offset + entry.file_size; @@ -1063,10 
+1574,8 @@ IndexDatabase::query_tar_files_in_range(int file_id, std::uint64_t start_offset, return files; } -std::vector -IndexDatabase::query_checkpoints_for_line_range(int file_id, - std::uint64_t start_line, - std::uint64_t end_line) const { +std::vector IndexDatabase::query_checkpoints_for_line_range( + int file_id, std::uint64_t start_line, std::uint64_t end_line) const { std::vector checkpoints; for (auto& checkpoint : query_checkpoints(file_id)) { if ((checkpoint.first_line_num <= end_line && @@ -1079,7 +1588,7 @@ IndexDatabase::query_checkpoints_for_line_range(int file_id, return checkpoints; } -IndexDatabase::TimeBounds IndexDatabase::query_time_bounds(int file_id) const { +TimeBounds IndexDatabase::query_time_bounds(int file_id) const { TimeBounds bounds; for (const auto& row : query_chunk_statistics(file_id)) { const auto min_ts = row.stats.min_timestamp_us; @@ -1095,14 +1604,15 @@ IndexDatabase::TimeBounds IndexDatabase::query_time_bounds(int file_id) const { return bounds; } -std::vector +std::vector IndexDatabase::query_chunk_dimension_stats(int file_id) const { std::vector results; const auto prefix = prefix_for_file(file_id); - scan_prefix(*db_, "chunk_dim_stats", prefix, [&](::rocksdb::Iterator& it) { - results.push_back(decode_chunk_dimension_stats_value( - iterator_key(it), iterator_value(it))); - }); + scan_prefix(*db_, cf::CHUNK_DIM_STATS, prefix, + [&](::rocksdb::Iterator& it) { + results.push_back(decode_chunk_dimension_stats_value( + iterator_key(it), iterator_value(it))); + }); std::sort(results.begin(), results.end(), [](const auto& lhs, const auto& rhs) { return std::tie(lhs.checkpoint_idx, lhs.dimension) < @@ -1111,189 +1621,80 @@ IndexDatabase::query_chunk_dimension_stats(int file_id) const { return results; } -std::vector -IndexDatabase::query_chunk_dimension_stats_for_dimension( - int file_id, std::string_view dimension) const { - std::vector results; - const auto prefix = prefix_for_file(file_id); - scan_prefix(*db_, 
"chunk_dim_stats", prefix, [&](::rocksdb::Iterator& it) { - auto decoded = decode_chunk_dimension_stats_value(iterator_key(it), - iterator_value(it)); - if (decoded.dimension == dimension) { - results.push_back(std::move(decoded)); - } - }); - std::sort(results.begin(), results.end(), - [](const auto& lhs, const auto& rhs) { - return lhs.checkpoint_idx < rhs.checkpoint_idx; - }); - return results; -} - -std::optional IndexDatabase::query_resolved_by_hash( - std::string_view dimension, std::string_view hash_value) const { - std::string value; - auto status = db_->get(make_hash_forward_key(dimension, hash_value), &value, - "dimensions"); - if (status.IsNotFound()) { - return std::nullopt; - } - if (!status.ok()) { - throw_db_error("Failed to query resolved hash", status); - } - return value; -} - -std::vector IndexDatabase::query_hash_by_resolved( - std::string_view dimension, std::string_view resolved_value) const { - std::vector hashes; - auto prefix = make_hash_reverse_key(dimension, resolved_value, ""); - scan_prefix(*db_, "dimensions", prefix, [&](::rocksdb::Iterator& it) { - auto key = iterator_key(it); - hashes.push_back(key.substr(prefix.size())); - }); - return hashes; -} - -void IndexDatabase::delete_chunk_bloom_filters(int file_id, - std::string_view dimension) { - std::vector keys; - std::string prefix = prefix_for_file(file_id); - prefix.append(dimension); - prefix.push_back('\0'); - scan_prefix(*db_, "chunk_bloom", prefix, [&](::rocksdb::Iterator& it) { - keys.push_back(iterator_key(it)); - }); - for (const auto& key : keys) { - auto status = txn_batch_ ? db_->del(*txn_batch_, "chunk_bloom", key) - : db_->del(key, "chunk_bloom"); - if (!status.ok()) - throw_db_error("Failed to delete chunk bloom", status); - } -} - -void IndexDatabase::delete_file_bloom_filter(int file_id, - std::string_view dimension) { - auto status = - txn_batch_ ? 
db_->del(*txn_batch_, "file_bloom", - file_bloom_key(file_id, dimension)) - : db_->del(file_bloom_key(file_id, dimension), "file_bloom"); - if (!status.ok() && !status.IsNotFound()) { - throw_db_error("Failed to delete file bloom", status); - } -} - -void IndexDatabase::delete_chunk_statistics(int file_id) { - std::vector keys; - scan_prefix( - *db_, "chunk_stats", prefix_for_file(file_id), - [&](::rocksdb::Iterator& it) { keys.push_back(iterator_key(it)); }); - for (const auto& key : keys) { - auto status = txn_batch_ ? db_->del(*txn_batch_, "chunk_stats", key) - : db_->del(key, "chunk_stats"); - if (!status.ok()) { - throw_db_error("Failed to delete chunk statistics", status); - } - } -} - -void IndexDatabase::delete_chunk_dimension_stats(int file_id) { - std::vector keys; - scan_prefix( - *db_, "chunk_dim_stats", prefix_for_file(file_id), - [&](::rocksdb::Iterator& it) { keys.push_back(iterator_key(it)); }); - for (const auto& key : keys) { - auto status = txn_batch_ ? db_->del(*txn_batch_, "chunk_dim_stats", key) - : db_->del(key, "chunk_dim_stats"); - if (!status.ok()) { - throw_db_error("Failed to delete chunk dimension stats", status); - } - } -} - -void IndexDatabase::delete_hash_resolutions(int file_id) { - std::vector> owned; - std::string prefix("o|"); - rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); - prefix.push_back('\0'); - scan_prefix(*db_, "dimensions", prefix, [&](::rocksdb::Iterator& it) { - owned.emplace_back(iterator_key(it), iterator_value(it)); - }); - for (const auto& [owner_key, resolved] : owned) { - if (owner_key.size() <= prefix.size()) { - DFTRACER_UTILS_LOG_WARN( - "Skipping malformed owner key for file_id=%d", file_id); - continue; +std::unordered_map> +IndexDatabase::query_chunk_dimension_stats_batch( + const std::vector& file_ids) const { + std::unordered_map> results; + if (file_ids.empty()) { + return results; + } + results.reserve(file_ids.size()); + + std::unordered_set wanted(file_ids.begin(), file_ids.end()); + 
const auto [min_it, max_it] = + std::minmax_element(file_ids.begin(), file_ids.end()); + const auto min_prefix = prefix_for_file(*min_it); + const int max_file_id = *max_it; + + auto it = db_->new_iterator(cf::CHUNK_DIM_STATS); + for (it->Seek(::rocksdb::Slice(min_prefix.data(), min_prefix.size())); + it->Valid(); it->Next()) { + auto key = iterator_key(*it); + int file_id = decode_prefixed_file_id(key); + if (file_id > max_file_id) { + break; } - const std::string_view payload(owner_key.data() + prefix.size(), - owner_key.size() - prefix.size()); - auto split = payload.find('\0'); - if (split == std::string_view::npos) { - DFTRACER_UTILS_LOG_WARN( - "Skipping malformed owner key payload for file_id=%d", file_id); + if (!wanted.contains(file_id)) { continue; } - auto dimension = payload.substr(0, split); - auto hash_value = payload.substr(split + 1); - auto forward = make_hash_forward_key(dimension, hash_value); - auto reverse = make_hash_reverse_key(dimension, resolved, hash_value); - const auto del_one = [&](std::string_view key) { - auto status = txn_batch_ ? db_->del(*txn_batch_, "dimensions", key) - : db_->del(key, "dimensions"); - if (!status.ok() && !status.IsNotFound()) { - throw_db_error("Failed to delete hash resolution", status); - } - }; - del_one(owner_key); - del_one(forward); - del_one(reverse); + + results[file_id].push_back( + decode_chunk_dimension_stats_value(key, iterator_value(*it))); } -} -void IndexDatabase::insert_event_range( - int file_id, std::uint64_t checkpoint_idx, std::string_view cat, - std::string_view name, std::span line_numbers) { - const auto key = manifest_event_key(file_id, checkpoint_idx, cat, name); - const auto value = encode_event_range_value(line_numbers); - auto status = txn_batch_ ? 
db_->put(*txn_batch_, "manifest", key, value) - : db_->put(key, value, "manifest"); + const auto status = it->status(); if (!status.ok()) { - throw_db_error("Failed to insert event range", status); + throw IndexerError(IndexerError::Type::DATABASE_ERROR, + "Failed to batch query chunk dimension stats: " + + status.ToString()); } -} - -void IndexDatabase::insert_event_range( - int file_id, std::uint64_t checkpoint_idx, std::string_view cat, - std::string_view name, const std::vector& line_numbers) { - insert_event_range(file_id, checkpoint_idx, cat, name, - std::span(line_numbers)); -} -void IndexDatabase::insert_metadata_lines( - int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type, - std::span line_numbers) { - const auto key = manifest_metadata_key(file_id, checkpoint_idx, meta_type); - const auto value = encode_metadata_value(line_numbers); - auto status = txn_batch_ ? db_->put(*txn_batch_, "manifest", key, value) - : db_->put(key, value, "manifest"); - if (!status.ok()) { - throw_db_error("Failed to insert metadata lines", status); + for (auto& [_, entries] : results) { + std::sort(entries.begin(), entries.end(), + [](const auto& lhs, const auto& rhs) { + return std::tie(lhs.checkpoint_idx, lhs.dimension) < + std::tie(rhs.checkpoint_idx, rhs.dimension); + }); } + return results; } -void IndexDatabase::insert_metadata_lines( - int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type, - const std::vector& line_numbers) { - insert_metadata_lines(file_id, checkpoint_idx, meta_type, - std::span(line_numbers)); +std::vector +IndexDatabase::query_chunk_dimension_stats_for_dimension( + int file_id, std::string_view dimension) const { + std::vector results; + const auto prefix = prefix_for_file(file_id); + scan_prefix(*db_, cf::CHUNK_DIM_STATS, prefix, + [&](::rocksdb::Iterator& it) { + auto decoded = decode_chunk_dimension_stats_value( + iterator_key(it), iterator_value(it)); + if (decoded.dimension == dimension) { + 
results.push_back(std::move(decoded)); + } + }); + std::sort(results.begin(), results.end(), + [](const auto& lhs, const auto& rhs) { + return lhs.checkpoint_idx < rhs.checkpoint_idx; + }); + return results; } -std::vector IndexDatabase::query_event_ranges( +std::vector IndexDatabase::query_event_ranges( int file_id) const { std::vector results; std::string prefix("E|"); rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); - scan_prefix(*db_, "manifest", prefix, [&](::rocksdb::Iterator& it) { + scan_prefix(*db_, cf::MANIFEST, prefix, [&](::rocksdb::Iterator& it) { auto key = iterator_key(it); auto payload = std::string_view(key).substr(2 + 4 + 8); auto split = payload.find('\0'); @@ -1319,8 +1720,7 @@ std::vector IndexDatabase::query_event_ranges( return results; } -std::vector -IndexDatabase::query_event_ranges_for_checkpoint( +std::vector IndexDatabase::query_event_ranges_for_checkpoint( int file_id, std::uint64_t checkpoint_idx) const { std::vector results; for (auto& range : query_event_ranges(file_id)) { @@ -1331,12 +1731,12 @@ IndexDatabase::query_event_ranges_for_checkpoint( return results; } -std::vector -IndexDatabase::query_metadata_lines(int file_id) const { +std::vector IndexDatabase::query_metadata_lines( + int file_id) const { std::vector results; std::string prefix("M|"); rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); - scan_prefix(*db_, "manifest", prefix, [&](::rocksdb::Iterator& it) { + scan_prefix(*db_, cf::MANIFEST, prefix, [&](::rocksdb::Iterator& it) { auto key = iterator_key(it); MetadataLinesResult result; result.checkpoint_idx = @@ -1355,7 +1755,7 @@ IndexDatabase::query_metadata_lines(int file_id) const { return results; } -std::vector +std::vector IndexDatabase::query_metadata_lines_for_checkpoint( int file_id, std::uint64_t checkpoint_idx) const { std::vector results; @@ -1367,36 +1767,77 @@ IndexDatabase::query_metadata_lines_for_checkpoint( return results; } -void IndexDatabase::delete_event_ranges(int file_id) { - 
std::vector keys; - std::string prefix("E|"); - rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); - scan_prefix(*db_, "manifest", prefix, [&](::rocksdb::Iterator& it) { - keys.push_back(iterator_key(it)); - }); - for (const auto& key : keys) { - auto status = txn_batch_ ? db_->del(*txn_batch_, "manifest", key) - : db_->del(key, "manifest"); - if (!status.ok()) { - throw_db_error("Failed to delete manifest event ranges", status); +std::unordered_set IndexDatabase::query_file_pids( + int file_id) const { + std::unordered_set pids; + std::string key("P|"); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + + std::string value; + auto status = db_->get(key, &value, cf::MANIFEST); + if (status.IsNotFound()) { + return pids; + } + if (!status.ok()) { + throw_db_error("Failed to read file PIDs", status); + } + + // Decode: count (varint) + sorted PIDs (each as varint) + std::size_t off = 0; + auto decode_varint = [&value, &off]() -> std::uint64_t { + std::uint64_t v = 0; + unsigned shift = 0; + while (off < value.size()) { + auto b = static_cast(value[off++]); + v |= static_cast(b & 0x7F) << shift; + if ((b & 0x80) == 0) return v; + shift += 7; } + return v; + }; + + auto count = decode_varint(); + pids.reserve(count); + for (std::uint64_t i = 0; i < count; ++i) { + pids.insert(decode_varint()); } + return pids; } -void IndexDatabase::delete_metadata_lines(int file_id) { - std::vector keys; - std::string prefix("M|"); - rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); - scan_prefix(*db_, "manifest", prefix, [&](::rocksdb::Iterator& it) { - keys.push_back(iterator_key(it)); - }); - for (const auto& key : keys) { - auto status = txn_batch_ ? 
db_->del(*txn_batch_, "manifest", key) - : db_->del(key, "manifest"); - if (!status.ok()) { - throw_db_error("Failed to delete metadata lines", status); +std::unordered_map> +IndexDatabase::query_all_file_pids() const { + std::unordered_map> result; + std::string prefix("P|"); + scan_prefix(*db_, cf::MANIFEST, prefix, [&](::rocksdb::Iterator& it) { + auto key = iterator_key(it); + // Key: "P|" + file_id (4 bytes BE) + auto file_id = static_cast( + rocks::KeyCodec::decode_be32(std::string_view(key).substr(2, 4))); + + auto value = iterator_value(it); + std::size_t off = 0; + + auto decode_varint = [&value, &off]() -> std::uint64_t { + std::uint64_t v = 0; + unsigned shift = 0; + while (off < value.size()) { + auto b = static_cast(value[off++]); + v |= static_cast(b & 0x7F) << shift; + if ((b & 0x80) == 0) return v; + shift += 7; + } + return v; + }; + + auto count = decode_varint(); + std::unordered_set pids; + pids.reserve(count); + for (std::uint64_t i = 0; i < count; ++i) { + pids.insert(decode_varint()); } - } + result[file_id] = std::move(pids); + }); + return result; } std::uint64_t IndexDatabase::get_total_events(int file_id) const { @@ -1407,23 +1848,9 @@ std::uint64_t IndexDatabase::get_total_events(int file_id) const { return total > 0 ? total : get_num_lines(file_id); } -void IndexDatabase::insert_file_metadata(int file_id, - std::uint64_t checkpoint_size, - std::uint64_t total_lines, - std::uint64_t total_uc_size) { - const auto key = metadata_key(file_id); - const auto value = - encode_metadata_record(checkpoint_size, total_lines, total_uc_size); - auto status = txn_batch_ ? 
db_->put(*txn_batch_, "metadata", key, value) - : db_->put(key, value, "metadata"); - if (!status.ok()) { - throw_db_error("Failed to insert metadata", status); - } -} - std::uint64_t IndexDatabase::get_checkpoint_size(int file_id) const { std::string value; - auto status = db_->get(metadata_key(file_id), &value, "metadata"); + auto status = db_->get(metadata_key(file_id), &value, cf::METADATA); if (status.IsNotFound()) { return 0; } @@ -1435,7 +1862,7 @@ std::uint64_t IndexDatabase::get_checkpoint_size(int file_id) const { std::uint64_t IndexDatabase::get_num_lines(int file_id) const { std::string value; - auto status = db_->get(metadata_key(file_id), &value, "metadata"); + auto status = db_->get(metadata_key(file_id), &value, cf::METADATA); if (status.IsNotFound()) { return 0; } @@ -1447,7 +1874,7 @@ std::uint64_t IndexDatabase::get_num_lines(int file_id) const { std::uint64_t IndexDatabase::get_max_bytes(int file_id) const { std::string value; - auto status = db_->get(metadata_key(file_id), &value, "metadata"); + auto status = db_->get(metadata_key(file_id), &value, cf::METADATA); if (status.IsNotFound()) { return 0; } @@ -1457,52 +1884,139 @@ std::uint64_t IndexDatabase::get_max_bytes(int file_id) const { return decode_metadata_record(value)[2]; } -void IndexDatabase::delete_file_data(int file_id) { - auto delete_default_key = [&](std::string_view key) { - auto del_status = - txn_batch_ ? 
db_->del(*txn_batch_, "default", key) : db_->del(key); - if (!del_status.ok() && !del_status.IsNotFound()) { - throw_db_error("Failed to delete file registry entry", del_status); - } - }; +void IndexDatabase::ensure_hash_tables_cached() const { + if (!hash_cache_) { + hash_cache_ = std::make_unique(); + } - const auto logical_name_key = file_reverse_key(file_id); - std::string logical_name; - auto status = db_->get(logical_name_key, &logical_name); - if (status.ok()) { - delete_default_key(file_lookup_key(logical_name)); - delete_default_key(logical_name_key); - } else if (!status.IsNotFound()) { - throw_db_error("Failed to read reverse file registry", status); + { + std::shared_lock lock(hash_cache_->mutex); + if (hash_cache_->loaded) return; } - auto delete_prefix = [&](std::string_view cf, std::string_view prefix) { - std::vector keys; - scan_prefix(*db_, cf, prefix, [&](::rocksdb::Iterator& it) { - keys.push_back(iterator_key(it)); - }); - for (const auto& key : keys) { - auto del_status = - txn_batch_ ? 
db_->del(*txn_batch_, cf, key) : db_->del(key, cf); - if (!del_status.ok() && !del_status.IsNotFound()) { - throw_db_error("Failed to delete file-scoped RocksDB data", - del_status); - } - } - }; + std::unique_lock lock(hash_cache_->mutex); + if (hash_cache_->loaded) return; + + scan_prefix(*db_, cf::HASH_TABLES, "", [this](::rocksdb::Iterator& it) { + auto key = iterator_key(it); + if (key.empty()) return; + auto type = static_cast(key[0]); + auto payload = key.substr(1); + auto value = iterator_value(it); - delete_prefix("checkpoints", prefix_for_file(file_id)); - delete_prefix("metadata", prefix_for_file(file_id)); - delete_prefix("archives", prefix_for_file(file_id)); - delete_prefix("tar_files", prefix_for_file(file_id)); - delete_prefix("chunk_bloom", prefix_for_file(file_id)); - delete_prefix("file_bloom", prefix_for_file(file_id)); - delete_prefix("chunk_stats", prefix_for_file(file_id)); - delete_prefix("chunk_dim_stats", prefix_for_file(file_id)); - delete_prefix("dimensions", std::string("d|") + prefix_for_file(file_id)); - delete_prefix("manifest", std::string("E|") + prefix_for_file(file_id)); - delete_prefix("manifest", std::string("M|") + prefix_for_file(file_id)); - delete_hash_resolutions(file_id); + switch (type) { + case 0: + hash_cache_->file_hash.emplace(payload, value); + break; + case 1: + hash_cache_->host_hash.emplace(payload, value); + break; + case 2: + hash_cache_->string_hash.emplace(payload, value); + break; + case 3: + hash_cache_->proc_hash.emplace(payload, value); + break; + case 4: + hash_cache_->file_name.emplace(payload, value); + break; + case 5: + hash_cache_->host_name.emplace(payload, value); + break; + case 6: + hash_cache_->string_name.emplace(payload, value); + break; + case 7: + hash_cache_->proc_name.emplace(payload, value); + break; + default: + break; + } + }); + hash_cache_->loaded = true; +} + +std::unordered_map IndexDatabase::query_hash_table( + HashType type) const { + ensure_hash_tables_cached(); + 
std::shared_lock lock(hash_cache_->mutex); + switch (type) { + case HashType::FILE: + return hash_cache_->file_hash; + case HashType::HOST: + return hash_cache_->host_hash; + case HashType::STRING: + return hash_cache_->string_hash; + case HashType::PROC: + return hash_cache_->proc_hash; + } + return {}; +} + +std::optional IndexDatabase::resolve_hash( + HashType type, std::string_view hash) const { + ensure_hash_tables_cached(); + std::shared_lock lock(hash_cache_->mutex); + const std::unordered_map* cache = nullptr; + switch (type) { + case HashType::FILE: + cache = &hash_cache_->file_hash; + break; + case HashType::HOST: + cache = &hash_cache_->host_hash; + break; + case HashType::STRING: + cache = &hash_cache_->string_hash; + break; + case HashType::PROC: + cache = &hash_cache_->proc_hash; + break; + } + if (cache) { + auto it = cache->find(std::string(hash)); + if (it != cache->end()) return it->second; + } + return std::nullopt; +} + +std::optional IndexDatabase::resolve_name_to_hash( + HashType type, std::string_view name) const { + ensure_hash_tables_cached(); + std::shared_lock lock(hash_cache_->mutex); + const std::unordered_map* cache = nullptr; + switch (type) { + case HashType::FILE: + cache = &hash_cache_->file_name; + break; + case HashType::HOST: + cache = &hash_cache_->host_name; + break; + case HashType::STRING: + cache = &hash_cache_->string_name; + break; + case HashType::PROC: + cache = &hash_cache_->proc_name; + break; + } + if (cache) { + auto it = cache->find(std::string(name)); + if (it != cache->end()) return it->second; + } + return std::nullopt; +} + +std::unordered_map> +IndexDatabase::query_all_hash_tables() const { + ensure_hash_tables_cached(); + std::shared_lock lock(hash_cache_->mutex); + std::unordered_map> + result; + result[HashType::FILE] = hash_cache_->file_hash; + result[HashType::HOST] = hash_cache_->host_hash; + result[HashType::STRING] = hash_cache_->string_hash; + result[HashType::PROC] = hash_cache_->proc_hash; + return 
result; } } // namespace dftracer::utils::utilities::indexer diff --git a/src/dftracer/utils/utilities/indexer/index_database_sst_writer_context.cpp b/src/dftracer/utils/utilities/indexer/index_database_sst_writer_context.cpp new file mode 100644 index 00000000..d77473d3 --- /dev/null +++ b/src/dftracer/utils/utilities/indexer/index_database_sst_writer_context.cpp @@ -0,0 +1,399 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace dftracer::utils::utilities::indexer { + +namespace { + +namespace encoding = internal::encoding; + +[[noreturn]] void throw_sst_error(std::string_view message, + const ::rocksdb::Status& status) { + throw internal::IndexerError( + internal::IndexerError::Type::DATABASE_ERROR, + std::string(message) + ": " + status.ToString()); +} + +std::string emit_sst(const std::string& path, + std::vector>& buffer) { + std::sort(buffer.begin(), buffer.end(), + [](const auto& a, const auto& b) { return a.first < b.first; }); + // SstFileWriter requires strict ascending keys. Buffers like + // name_dictionary and hash_tables collect duplicate (key, value) pairs + // when a batch spans multiple files that share event names / content + // hashes. Drop consecutive duplicates so the writer sees unique keys. 
+ buffer.erase(std::unique(buffer.begin(), buffer.end(), + [](const auto& a, const auto& b) { + return a.first == b.first; + }), + buffer.end()); + + ::rocksdb::EnvOptions env_opts; + ::rocksdb::Options writer_options( + rocksdb::RocksDatabase::default_options(), + rocksdb::RocksDatabase::default_column_family_options()); + ::rocksdb::SstFileWriter writer(env_opts, writer_options); + + auto status = writer.Open(path); + if (!status.ok()) { + throw_sst_error("Failed to open SST writer at '" + path + "'", status); + } + + for (const auto& [key, value] : buffer) { + status = writer.Put(key, value); + if (!status.ok()) { + throw_sst_error("Failed to append to SST '" + path + "'", status); + } + } + + status = writer.Finish(); + if (!status.ok()) { + throw_sst_error("Failed to finalize SST '" + path + "'", status); + } + + return path; +} + +/// Emit a mixed Put+Merge SST for the AGGREGATION / SYSTEM_METRICS CFs. +/// Entries are sorted by key; each entry's `is_merge` decides whether to +/// call SstFileWriter::Merge (operand, combined by merge_operator at +/// read/compaction time) or ::Put (overrides prior values). +std::string emit_mixed_sst( + const std::string& path, + std::vector& buffer) { + std::sort(buffer.begin(), buffer.end(), + [](const auto& a, const auto& b) { return a.key < b.key; }); + + ::rocksdb::EnvOptions env_opts; + ::rocksdb::Options writer_options( + rocksdb::RocksDatabase::default_options(), + rocksdb::RocksDatabase::default_column_family_options()); + ::rocksdb::SstFileWriter writer(env_opts, writer_options); + + auto status = writer.Open(path); + if (!status.ok()) { + throw_sst_error("Failed to open SST writer at '" + path + "'", status); + } + for (const auto& entry : buffer) { + status = entry.is_merge ? 
writer.Merge(entry.key, entry.value) + : writer.Put(entry.key, entry.value); + if (!status.ok()) { + throw_sst_error("Failed to append to SST '" + path + "'", status); + } + } + status = writer.Finish(); + if (!status.ok()) { + throw_sst_error("Failed to finalize SST '" + path + "'", status); + } + return path; +} + +} // namespace + +namespace { + +/// Move one file from `src` to `dst`. Uses rename (O(1) same-FS) with a +/// copy+unlink fallback for cross-FS. `dst` parent directory must exist. +void move_file(const fs::path& src, const fs::path& dst) { + std::error_code ec; + fs::rename(src, dst, ec); + if (!ec) return; + // Cross-FS or other rename failure -> fall back to copy. + ec.clear(); + fs::copy_file(src, dst, fs::copy_options::overwrite_existing, ec); + if (ec) { + throw std::runtime_error("Failed to move SST '" + src.string() + + "' to '" + dst.string() + + "': " + ec.message()); + } + fs::remove(src, ec); // best-effort; staging cleanup handled by caller +} + +void move_one(const fs::path& dest_dir, std::optional& src_slot, + std::optional& dst_slot) { + if (!src_slot.has_value()) return; + fs::path src(*src_slot); + fs::path dst = dest_dir / src.filename(); + move_file(src, dst); + dst_slot = dst.string(); + src_slot.reset(); +} + +} // namespace + +IndexDatabaseSstWriterContext::Artifacts +IndexDatabaseSstWriterContext::Artifacts::move_to( + std::string_view dest_dir) && { + const fs::path dir(dest_dir); + std::error_code ec; + fs::create_directories(dir, ec); + if (ec) { + throw std::runtime_error("Failed to create SST move destination '" + + std::string(dest_dir) + "': " + ec.message()); + } + + Artifacts out; + move_one(dir, metadata_sst, out.metadata_sst); + move_one(dir, checkpoints_sst, out.checkpoints_sst); + move_one(dir, manifest_sst, out.manifest_sst); + move_one(dir, chunk_bloom_sst, out.chunk_bloom_sst); + move_one(dir, file_bloom_sst, out.file_bloom_sst); + move_one(dir, chunk_stats_sst, out.chunk_stats_sst); + move_one(dir, 
chunk_dim_stats_sst, out.chunk_dim_stats_sst); + move_one(dir, dimensions_sst, out.dimensions_sst); + move_one(dir, file_scalar_stats_sst, out.file_scalar_stats_sst); + move_one(dir, file_cat_counts_sst, out.file_cat_counts_sst); + move_one(dir, file_pid_tid_counts_sst, out.file_pid_tid_counts_sst); + move_one(dir, file_name_counts_sst, out.file_name_counts_sst); + move_one(dir, name_dictionary_sst, out.name_dictionary_sst); + move_one(dir, name_file_postings_sst, out.name_file_postings_sst); + move_one(dir, name_chunk_postings_sst, out.name_chunk_postings_sst); + move_one(dir, hash_tables_sst, out.hash_tables_sst); + move_one(dir, aggregation_sst, out.aggregation_sst); + move_one(dir, system_metrics_sst, out.system_metrics_sst); + return out; +} + +IndexDatabaseSstWriterContext::IndexDatabaseSstWriterContext( + std::string staging_dir, std::string batch_id) + : staging_dir_(std::move(staging_dir)), batch_id_(std::move(batch_id)) { + std::error_code ec; + fs::create_directories(fs::path(staging_dir_) / batch_id_, ec); + if (ec) { + throw std::runtime_error("Failed to create SST staging dir '" + + staging_dir_ + "/" + batch_id_ + + "': " + ec.message()); + } +} + +IndexDatabaseSstWriterContext::IndexDatabaseSstWriterContext( + IndexDatabaseSstWriterContext&&) noexcept = default; +IndexDatabaseSstWriterContext& IndexDatabaseSstWriterContext::operator=( + IndexDatabaseSstWriterContext&&) noexcept = default; +IndexDatabaseSstWriterContext::~IndexDatabaseSstWriterContext() = default; + +void IndexDatabaseSstWriterContext::insert_file_metadata( + int file_id, std::uint64_t checkpoint_size, std::uint64_t total_lines, + std::uint64_t total_uc_size) { + metadata_buf_.emplace_back( + encoding::metadata_key(file_id), + encoding::encode_metadata_record(checkpoint_size, total_lines, + total_uc_size)); +} + +void IndexDatabaseSstWriterContext::insert_checkpoint( + int file_id, const IndexerCheckpoint& checkpoint) { + checkpoints_buf_.emplace_back( + 
encoding::checkpoint_key(file_id, checkpoint.uc_offset, + checkpoint.checkpoint_idx), + encoding::encode_checkpoint_value(checkpoint)); +} + +void IndexDatabaseSstWriterContext::insert_event_range( + int file_id, std::uint64_t checkpoint_idx, std::string_view cat, + std::string_view name, std::span line_numbers) { + manifest_buf_.emplace_back( + encoding::manifest_event_key(file_id, checkpoint_idx, cat, name), + encoding::encode_event_range_value(line_numbers)); +} + +void IndexDatabaseSstWriterContext::insert_metadata_lines( + int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type, + std::span line_numbers) { + manifest_buf_.emplace_back( + encoding::manifest_metadata_key(file_id, checkpoint_idx, meta_type), + encoding::encode_metadata_value(line_numbers)); +} + +void IndexDatabaseSstWriterContext::insert_file_pids( + int file_id, const std::unordered_set& pids) { + manifest_buf_.emplace_back(encoding::file_pids_key(file_id), + encoding::encode_file_pids_value(pids)); +} + +void IndexDatabaseSstWriterContext::insert_chunk_bloom_filter( + int file_id, std::uint64_t checkpoint_idx, std::string_view dimension, + std::span blob_data, std::uint64_t num_entries) { + chunk_bloom_buf_.emplace_back( + encoding::chunk_bloom_key(file_id, dimension, checkpoint_idx), + encoding::encode_bloom_value(blob_data, num_entries)); +} + +void IndexDatabaseSstWriterContext::insert_file_bloom_filter( + int file_id, std::string_view dimension, + std::span blob_data, std::uint64_t num_entries) { + file_bloom_buf_.emplace_back( + encoding::file_bloom_key(file_id, dimension), + encoding::encode_bloom_value(blob_data, num_entries)); +} + +void IndexDatabaseSstWriterContext::insert_chunk_statistics( + int file_id, std::uint64_t checkpoint_idx, const ChunkStatistics& stats) { + chunk_stats_buf_.emplace_back( + encoding::chunk_stats_key(file_id, checkpoint_idx), + encoding::encode_chunk_statistics_value(stats)); +} + +void IndexDatabaseSstWriterContext::insert_file_scalar_stats( + 
int file_id, const ChunkStatistics& stats, std::uint64_t num_chunks) { + file_scalar_stats_buf_.emplace_back( + encoding::file_scalar_stats_key(file_id), + internal::encode_file_scalar_stats_value(stats, num_chunks)); +} + +void IndexDatabaseSstWriterContext::insert_file_category_counts( + int file_id, const StringViewMap& counts) { + file_cat_counts_buf_.emplace_back( + encoding::file_category_counts_key(file_id), + encoding::encode_count_map_value(counts)); +} + +void IndexDatabaseSstWriterContext::insert_file_pid_tid_counts( + int file_id, const StringViewMap& counts) { + file_pid_tid_counts_buf_.emplace_back( + encoding::file_pid_tid_counts_key(file_id), + encoding::encode_count_map_value(counts)); +} + +void IndexDatabaseSstWriterContext::insert_file_name_counts( + int file_id, const StringViewMap& counts) { + file_name_counts_buf_.emplace_back( + encoding::file_name_counts_key(file_id), + encoding::encode_name_summary_value(counts, /*other_count=*/0, + /*unique_count=*/counts.size())); +} + +void IndexDatabaseSstWriterContext::insert_index_dimension( + int file_id, std::string_view dimension) { + dimensions_buf_.emplace_back( + encoding::make_dimension_key(file_id, dimension), std::string{}); +} + +void IndexDatabaseSstWriterContext::insert_chunk_dimension_stats( + int file_id, std::uint64_t checkpoint_idx, const ChunkDimensionStats& stats, + std::size_t value_counts_cap) { + chunk_dim_stats_buf_.emplace_back( + encoding::chunk_dim_stats_key(file_id, checkpoint_idx, stats.dimension), + encoding::encode_chunk_dimension_stats_value(stats, value_counts_cap)); +} + +void IndexDatabaseSstWriterContext::insert_name_dictionary_entry( + std::uint64_t name_id, std::string_view name) { + name_dictionary_buf_.emplace_back( + encoding::name_lookup_key(name), + ::dftracer::utils::rocksdb::KeyCodec::encode_be64(name_id)); + name_dictionary_buf_.emplace_back(encoding::name_reverse_key(name_id), + std::string(name)); +} + +void 
IndexDatabaseSstWriterContext::insert_name_file_posting( + std::uint64_t name_id, int file_id) { + name_file_postings_buf_.emplace_back( + encoding::name_file_posting_key(name_id, file_id), std::string{}); + name_file_postings_buf_.emplace_back( + encoding::name_file_owner_key(file_id, name_id), std::string{}); +} + +void IndexDatabaseSstWriterContext::insert_name_chunk_posting( + std::uint64_t name_id, int file_id, std::uint64_t checkpoint_idx) { + name_chunk_postings_buf_.emplace_back( + encoding::name_chunk_posting_key(name_id, file_id, checkpoint_idx), + std::string{}); + name_chunk_postings_buf_.emplace_back( + encoding::name_chunk_owner_key(file_id, name_id, checkpoint_idx), + std::string{}); +} + +void IndexDatabaseSstWriterContext::insert_hash_table_entry( + std::uint8_t type, std::string_view hash, std::string_view name) { + hash_tables_buf_.emplace_back(encoding::hash_table_forward_key(type, hash), + std::string(name)); + hash_tables_buf_.emplace_back(encoding::hash_table_reverse_key(type, name), + std::string(hash)); +} + +void IndexDatabaseSstWriterContext::insert_aggregation_merge( + std::string_view key, std::string_view operand) { + aggregation_buf_.emplace_back(MergeableKeyValue{ + std::string(key), std::string(operand), /*is_merge=*/true}); +} + +void IndexDatabaseSstWriterContext::insert_aggregation_put( + std::string_view key, std::string_view value) { + aggregation_buf_.emplace_back(MergeableKeyValue{ + std::string(key), std::string(value), /*is_merge=*/false}); +} + +void IndexDatabaseSstWriterContext::insert_system_metrics_merge( + std::string_view key, std::string_view operand) { + system_metrics_buf_.emplace_back(MergeableKeyValue{ + std::string(key), std::string(operand), /*is_merge=*/true}); +} + +IndexDatabaseSstWriterContext::Artifacts +IndexDatabaseSstWriterContext::commit() { + Artifacts out; + if (committed_) { + return out; + } + committed_ = true; + + const auto batch_dir = fs::path(staging_dir_) / batch_id_; + + auto emit_into = 
[&](const char* name, std::vector& buf, + std::optional& slot) { + if (buf.empty()) return; + slot = emit_sst((batch_dir / name).string(), buf); + buf.clear(); + buf.shrink_to_fit(); + }; + + emit_into("metadata.sst", metadata_buf_, out.metadata_sst); + emit_into("checkpoints.sst", checkpoints_buf_, out.checkpoints_sst); + emit_into("manifest.sst", manifest_buf_, out.manifest_sst); + emit_into("chunk_bloom.sst", chunk_bloom_buf_, out.chunk_bloom_sst); + emit_into("file_bloom.sst", file_bloom_buf_, out.file_bloom_sst); + emit_into("chunk_stats.sst", chunk_stats_buf_, out.chunk_stats_sst); + emit_into("chunk_dim_stats.sst", chunk_dim_stats_buf_, + out.chunk_dim_stats_sst); + emit_into("dimensions.sst", dimensions_buf_, out.dimensions_sst); + emit_into("file_scalar_stats.sst", file_scalar_stats_buf_, + out.file_scalar_stats_sst); + emit_into("file_cat_counts.sst", file_cat_counts_buf_, + out.file_cat_counts_sst); + emit_into("file_pid_tid_counts.sst", file_pid_tid_counts_buf_, + out.file_pid_tid_counts_sst); + emit_into("file_name_counts.sst", file_name_counts_buf_, + out.file_name_counts_sst); + emit_into("name_dictionary.sst", name_dictionary_buf_, + out.name_dictionary_sst); + emit_into("name_file_postings.sst", name_file_postings_buf_, + out.name_file_postings_sst); + emit_into("name_chunk_postings.sst", name_chunk_postings_buf_, + out.name_chunk_postings_sst); + emit_into("hash_tables.sst", hash_tables_buf_, out.hash_tables_sst); + + auto emit_mixed_into = [&](const char* name, + std::vector& buf, + std::optional& slot) { + if (buf.empty()) return; + slot = emit_mixed_sst((batch_dir / name).string(), buf); + buf.clear(); + buf.shrink_to_fit(); + }; + emit_mixed_into("aggregation.sst", aggregation_buf_, out.aggregation_sst); + emit_mixed_into("system_metrics.sst", system_metrics_buf_, + out.system_metrics_sst); + + return out; +} + +} // namespace dftracer::utils::utilities::indexer diff --git 
a/src/dftracer/utils/utilities/indexer/index_database_writer_context.cpp b/src/dftracer/utils/utilities/indexer/index_database_writer_context.cpp new file mode 100644 index 00000000..611eb2f9 --- /dev/null +++ b/src/dftracer/utils/utilities/indexer/index_database_writer_context.cpp @@ -0,0 +1,1279 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::indexer { + +namespace rocks = dftracer::utils::rocksdb; +namespace cf = rocks::cf; + +using namespace internal; + +namespace { + +using encoding::checkpoint_key; +using encoding::chunk_bloom_key; +using encoding::chunk_dim_stats_key; +using encoding::chunk_stats_key; +using encoding::encode_bloom_value; +using encoding::encode_checkpoint_value; +using encoding::encode_chunk_dimension_stats_value; +using encoding::encode_chunk_statistics_value; +using encoding::encode_count_map_value; +using encoding::encode_event_range_value; +using encoding::encode_metadata_record; +using encoding::encode_metadata_value; +using encoding::encode_name_summary_value; +using encoding::file_bloom_key; +using encoding::file_category_counts_key; +using encoding::file_name_counts_key; +using encoding::file_pid_tid_counts_key; +using encoding::file_scalar_stats_key; +using encoding::make_dimension_key; +using encoding::manifest_event_key; +using encoding::name_chunk_owner_key; +using encoding::name_chunk_owner_prefix; +using encoding::name_chunk_posting_key; +using encoding::name_file_owner_key; +using encoding::name_file_owner_prefix; +using encoding::name_file_posting_key; +using encoding::name_lookup_key; +using encoding::name_reverse_key; + +namespace hash = dftracer::utils::utilities::hash; +using encoding::manifest_metadata_key; +using encoding::metadata_key; +using encoding::prefix_for_file; + +constexpr std::uint32_t SCHEMA_VERSION = 1; + +[[noreturn]] 
void throw_db_error(std::string_view message, + const ::rocksdb::Status& status) { + throw IndexerError(IndexerError::Type::DATABASE_ERROR, + std::string(message) + ": " + status.ToString()); +} + +std::string file_lookup_key(std::string_view logical_name) { + return std::string("f|") + std::string(logical_name); +} + +std::string file_reverse_key(int file_id) { + std::string key("r|"); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + return key; +} + +std::string next_file_id_key() { + return std::string(encoding::NEXT_FILE_ID_KEY); +} +std::string schema_version_key() { return "_schema_version"; } + +std::string encode_file_record( + int file_id, std::uint64_t file_hash, + IndexFileEntryCapability caps = IndexFileEntryCapability::NONE) { + std::string value; + rocks::KeyCodec::append_be32(value, static_cast(file_id)); + value.push_back(static_cast(static_cast(caps))); + value.append(7, '\0'); + append_u64(value, 0); + append_u64(value, file_hash); + return value; +} + +IndexFileEntryCapability decode_file_capabilities(std::string_view record) { + if (record.size() < 5) { + return IndexFileEntryCapability::NONE; + } + return static_cast( + static_cast(record[4])); +} + +int decode_file_id(std::string_view record) { + if (record.size() < 4) { + throw std::runtime_error("Corrupt file record"); + } + return static_cast(rocks::KeyCodec::decode_be32(record.substr(0, 4))); +} + +int decode_prefixed_file_id(std::string_view key) { + if (key.size() < 4) { + throw std::runtime_error("Corrupt file-prefixed key"); + } + return static_cast(rocks::KeyCodec::decode_be32(key.substr(0, 4))); +} + +std::uint64_t decode_file_hash(std::string_view record) { + if (record.size() < 28) { + throw std::runtime_error("Corrupt file record"); + } + return rocks::KeyCodec::decode_be64(record.substr(20, 8)); +} + +std::string root_scalar_stats_key() { return "_root"; } +std::string root_category_counts_key() { return "_root"; } +std::string root_name_counts_key() { return "_root"; 
} +std::string root_pid_tid_counts_key() { return "_root"; } + +std::string tar_archive_key(int file_id) { return prefix_for_file(file_id); } + +std::string tar_file_key(int file_id, std::uint64_t uncompressed_offset, + std::string_view file_name) { + std::string key = prefix_for_file(file_id); + append_u64(key, uncompressed_offset); + key.push_back('\0'); + key.append(file_name); + return key; +} + +std::unordered_map decode_count_map_value( + std::string_view value) { + Cursor cursor(value); + std::unordered_map counts; + auto num_entries = cursor.u32(); + counts.reserve(num_entries); + for (std::uint32_t i = 0; i < num_entries; ++i) { + auto key = cursor.str(); + counts.emplace(std::move(key), cursor.u64()); + } + return counts; +} + +template +void for_each_count_map_entry(std::string_view value, Callback&& callback) { + Cursor cursor(value); + auto num_entries = cursor.u32(); + for (std::uint32_t i = 0; i < num_entries; ++i) { + auto key = cursor.str_view(); + auto count = cursor.u64(); + callback(key, count); + } +} + +template +void for_each_name_summary_entry(std::string_view value, Callback&& callback) { + Cursor cursor(value); + auto num_entries = cursor.u32(); + (void)cursor.u64(); // other_count + (void)cursor.u64(); // unique_count + for (std::uint32_t i = 0; i < num_entries; ++i) { + auto key = cursor.str_view(); + auto count = cursor.u64(); + callback(key, count); + } +} + +std::string encode_tar_archive_value(std::string_view archive_name, + std::uint64_t checkpoint_size, + std::uint64_t total_lines, + std::uint64_t total_uc_size, + std::uint64_t total_files) { + std::string value; + append_string(value, archive_name); + append_u64(value, checkpoint_size); + append_u64(value, total_lines); + append_u64(value, total_uc_size); + append_u64(value, total_files); + return value; +} + +std::string encode_tar_file_value( + const IndexDatabaseWriterContext::TarFileRecord& record) { + std::string value; + append_u64(value, record.file_size); + 
append_u64(value, record.file_mtime); + append_u8(value, static_cast(record.typeflag)); + append_u64(value, record.data_offset); + return value; +} + +std::array decode_metadata_record(std::string_view value) { + Cursor cursor(value); + return {cursor.u64(), cursor.u64(), cursor.u64()}; +} + +std::string iterator_value(::rocksdb::Iterator& it) { + const auto slice = it.value(); + return std::string(slice.data(), slice.size()); +} + +std::string iterator_key(::rocksdb::Iterator& it) { + const auto slice = it.key(); + return std::string(slice.data(), slice.size()); +} + +template +void scan_prefix(const rocks::RocksDatabase& db, std::string_view column_family, + std::string_view prefix, Fn&& fn) { + internal::scan_prefix_iterator( + "Failed to scan RocksDB prefix", prefix, + [&] { return db.new_iterator(column_family); }, std::forward(fn)); +} + +} // namespace + +namespace internal { + +std::string encode_file_scalar_stats_value(const ChunkStatistics& stats, + std::uint64_t num_chunks) { + std::string value; + append_u64(value, stats.total_events); + append_u64(value, stats.min_timestamp_us); + append_u64(value, stats.max_timestamp_us); + append_i64(value, stats.duration_sum_us); + append_u64(value, stats.duration_min_us); + append_u64(value, stats.duration_max_us); + append_u64(value, stats.duration_count); + append_double(value, stats.duration_m2); + + auto duration_sketch = stats.duration_sketch.serialize(); + append_blob(value, duration_sketch); + + auto duration_histogram = stats.duration_histogram.to_json(); + append_string(value, duration_histogram); + + append_u64(value, num_chunks); + + auto ts_hist = stats.timestamp_histogram.serialize(); + append_blob(value, ts_hist); + + return value; +} + +std::string encode_root_scalar_stats_value( + const ChunkStatistics& stats, std::uint64_t num_chunks, + std::uint64_t num_files, std::uint64_t total_lines, + std::uint64_t total_uncompressed_bytes) { + auto value = encode_file_scalar_stats_value(stats, num_chunks); + 
append_u64(value, num_files); + append_u64(value, total_lines); + append_u64(value, total_uncompressed_bytes); + return value; +} + +MergedStatisticsResult decode_file_scalar_stats_value(std::string_view value) { + if (value.size() < 8) { + throw std::runtime_error("Corrupt file scalar statistics value"); + } + Cursor cursor(value); + MergedStatisticsResult result; + auto& stats = result.stats; + stats.total_events = cursor.u64(); + stats.min_timestamp_us = cursor.u64(); + stats.max_timestamp_us = cursor.u64(); + stats.duration_sum_us = cursor.i64(); + stats.duration_min_us = cursor.u64(); + stats.duration_max_us = cursor.u64(); + stats.duration_count = cursor.u64(); + stats.duration_m2 = cursor.f64(); + + auto duration_sketch = cursor.blob(); + if (!duration_sketch.empty()) { + stats.duration_sketch = common::statistics::DDSketch::deserialize( + duration_sketch.data(), duration_sketch.size()); + } + + auto duration_histogram = cursor.str(); + if (!duration_histogram.empty()) { + stats.duration_histogram = + common::statistics::Log2Histogram::from_json(duration_histogram); + } + + result.num_chunks = cursor.u64(); + + auto ts_hist_blob = cursor.blob(); + if (!ts_hist_blob.empty()) { + stats.timestamp_histogram = + common::statistics::TimestampHistogram::deserialize( + ts_hist_blob.data(), ts_hist_blob.size()); + } + + return result; +} + +RootStatisticsResult decode_root_scalar_stats_value(std::string_view value) { + Cursor cursor(value); + RootStatisticsResult result; + auto& stats = result.stats; + stats.total_events = cursor.u64(); + stats.min_timestamp_us = cursor.u64(); + stats.max_timestamp_us = cursor.u64(); + stats.duration_sum_us = cursor.i64(); + stats.duration_min_us = cursor.u64(); + stats.duration_max_us = cursor.u64(); + stats.duration_count = cursor.u64(); + stats.duration_m2 = cursor.f64(); + + auto duration_sketch = cursor.blob(); + if (!duration_sketch.empty()) { + stats.duration_sketch = common::statistics::DDSketch::deserialize( + 
duration_sketch.data(), duration_sketch.size()); + } + + auto duration_histogram = cursor.str(); + if (!duration_histogram.empty()) { + stats.duration_histogram = + common::statistics::Log2Histogram::from_json(duration_histogram); + } + + result.num_chunks = cursor.u64(); + + auto ts_hist_blob = cursor.blob(); + if (!ts_hist_blob.empty()) { + stats.timestamp_histogram = + common::statistics::TimestampHistogram::deserialize( + ts_hist_blob.data(), ts_hist_blob.size()); + } + + result.num_files = cursor.u64(); + result.total_lines = cursor.u64(); + result.total_uncompressed_bytes = cursor.u64(); + return result; +} + +} // namespace internal + +IndexDatabaseWriterContext::IndexDatabaseWriterContext( + std::shared_ptr db) + : db_(std::move(db)), batch_(db_->begin_batch()) {} + +IndexDatabaseWriterContext::IndexDatabaseWriterContext( + IndexDatabaseWriterContext&&) noexcept = default; + +IndexDatabaseWriterContext& IndexDatabaseWriterContext::operator=( + IndexDatabaseWriterContext&&) noexcept = default; + +IndexDatabaseWriterContext::~IndexDatabaseWriterContext() = default; + +void IndexDatabaseWriterContext::commit() { + if (committed_) return; + auto status = db_->commit_batch(batch_); + committed_ = true; + if (!status.ok()) { + throw std::runtime_error("Failed to commit WriteBatch: " + + status.ToString()); + } +} + +bool IndexDatabaseWriterContext::has_file_scalar_stats(int file_id) const { + std::string value; + auto status = + db_->get(file_scalar_stats_key(file_id), &value, cf::FILE_SCALAR_STATS); + return status.ok(); +} + +void IndexDatabaseWriterContext::init_schema() { + std::string value; + auto status = db_->get(schema_version_key(), &value); + if (status.IsNotFound()) { + status = db_->put(batch_, cf::DEFAULT, schema_version_key(), + rocks::KeyCodec::encode_be32(SCHEMA_VERSION)); + if (!status.ok()) { + throw_db_error("Failed to initialize schema version", status); + } + } else if (!status.ok()) { + throw_db_error("Failed to read schema version", 
status); + } +} + +void IndexDatabaseWriterContext::set_file_capabilities( + int file_id, IndexFileEntryCapability caps) { + std::string name; + auto status = db_->get(file_reverse_key(file_id), &name); + if (!status.ok()) return; + set_file_capabilities_by_path(name, caps); +} + +void IndexDatabaseWriterContext::set_file_capabilities_by_path( + std::string_view logical_path, IndexFileEntryCapability caps) { + auto key = file_lookup_key(logical_path); + std::string record; + auto status = db_->get(key, &record); + if (!status.ok() || record.size() < 5) return; + + record[4] = static_cast(static_cast(caps)); + db_->put(batch_, cf::DEFAULT, key, record); +} + +void IndexDatabaseWriterContext::add_file_capability( + int file_id, IndexFileEntryCapability cap) { + // Inline get_file_capabilities logic + IndexFileEntryCapability existing = IndexFileEntryCapability::NONE; + std::string name; + auto status = db_->get(file_reverse_key(file_id), &name); + if (status.ok()) { + std::string record; + status = db_->get(file_lookup_key(name), &record); + if (status.ok()) { + existing = decode_file_capabilities(record); + } + } + set_file_capabilities(file_id, existing | cap); +} + +int IndexDatabaseWriterContext::get_or_create_file_info( + std::string_view path, std::uint64_t file_hash, + IndexFileEntryCapability caps) { + const auto logical_name = std::string(path); + const auto lookup = file_lookup_key(logical_name); + std::string existing; + auto status = db_->get(lookup, &existing); + if (status.ok()) { + const auto file_id = decode_file_id(existing); + if (decode_file_hash(existing) == file_hash) { + if (caps != IndexFileEntryCapability::NONE && + decode_file_capabilities(existing) != caps) { + existing[4] = + static_cast(static_cast(caps)); + db_->put(batch_, cf::DEFAULT, lookup, existing); + } + return file_id; + } + delete_file_contents(file_id); + // Also delete the registry entries for this file + { + const auto logical_name_key = file_reverse_key(file_id); + 
std::string old_name; + auto rev_status = db_->get(logical_name_key, &old_name); + if (rev_status.ok()) { + db_->del(batch_, cf::DEFAULT, file_lookup_key(old_name)); + db_->del(batch_, cf::DEFAULT, logical_name_key); + } + // Delete root summaries + auto delete_prefix_fn = [&](std::string_view cf_name, + std::string_view prefix) { + std::vector keys; + scan_prefix(*db_, cf_name, prefix, + [&](::rocksdb::Iterator& it) { + keys.push_back(iterator_key(it)); + }); + for (const auto& k : keys) { + db_->del(batch_, cf_name, k); + } + }; + delete_prefix_fn(cf::ROOT_SCALAR_STATS, root_scalar_stats_key()); + delete_prefix_fn(cf::ROOT_CAT_COUNTS, root_category_counts_key()); + delete_prefix_fn(cf::ROOT_NAME_COUNTS, root_name_counts_key()); + delete_prefix_fn(cf::ROOT_PID_TID_COUNTS, + root_pid_tid_counts_key()); + } + auto registry = encode_file_record(file_id, file_hash, caps); + status = db_->put(batch_, cf::DEFAULT, lookup, registry); + if (!status.ok()) { + throw_db_error("Failed to update file registry", status); + } + status = db_->put(batch_, cf::DEFAULT, file_reverse_key(file_id), + logical_name); + if (!status.ok()) { + throw_db_error("Failed to update reverse file registry", status); + } + return file_id; + } + if (!status.IsNotFound()) { + throw_db_error("Failed to query file registry", status); + } + + std::uint32_t next_id; + if (cached_next_file_id_ >= 0) { + next_id = static_cast(cached_next_file_id_); + } else { + next_id = 1; + std::string next_value; + status = db_->get(next_file_id_key(), &next_value); + if (status.ok()) { + next_id = rocks::KeyCodec::decode_be32(next_value); + } else if (!status.IsNotFound()) { + throw_db_error("Failed to read next file id", status); + } + } + cached_next_file_id_ = static_cast(next_id + 1); + + const auto file_id = static_cast(next_id); + const auto new_registry = encode_file_record(file_id, file_hash, caps); + const auto next_registry = rocks::KeyCodec::encode_be32(next_id + 1); + + status = db_->put(batch_, 
cf::DEFAULT, lookup, new_registry); + if (!status.ok()) { + throw_db_error("Failed to insert file registry", status); + } + status = + db_->put(batch_, cf::DEFAULT, file_reverse_key(file_id), logical_name); + if (!status.ok()) { + throw_db_error("Failed to insert reverse file registry", status); + } + status = db_->put(batch_, cf::DEFAULT, next_file_id_key(), next_registry); + if (!status.ok()) { + throw_db_error("Failed to update next file id", status); + } + + return file_id; +} + +void IndexDatabaseWriterContext::insert_file_metadata( + int file_id, std::uint64_t checkpoint_size, std::uint64_t total_lines, + std::uint64_t total_uc_size) { + const auto key = metadata_key(file_id); + const auto value = + encode_metadata_record(checkpoint_size, total_lines, total_uc_size); + auto status = db_->put(batch_, cf::METADATA, key, value); + if (!status.ok()) { + throw_db_error("Failed to insert metadata", status); + } +} + +void IndexDatabaseWriterContext::insert_chunk_bloom_filter( + int file_id, std::uint64_t checkpoint_idx, std::string_view dimension, + std::span blob_data, std::uint64_t num_entries) { + const auto key = chunk_bloom_key(file_id, dimension, checkpoint_idx); + const auto value = encode_bloom_value(blob_data, num_entries); + auto status = db_->put(batch_, cf::CHUNK_BLOOM, key, value); + if (!status.ok()) { + throw_db_error("Failed to insert chunk bloom filter", status); + } +} + +void IndexDatabaseWriterContext::insert_chunk_bloom_filter( + int file_id, std::uint64_t checkpoint_idx, std::string_view dimension, + const void* blob_data, int blob_size, std::uint64_t num_entries) { + auto* bytes = static_cast(blob_data); + insert_chunk_bloom_filter(file_id, checkpoint_idx, dimension, + std::span( + bytes, static_cast(blob_size)), + num_entries); +} + +void IndexDatabaseWriterContext::insert_file_bloom_filter( + int file_id, std::string_view dimension, + std::span blob_data, std::uint64_t num_entries) { + const auto key = file_bloom_key(file_id, dimension); + 
const auto value = encode_bloom_value(blob_data, num_entries); + auto status = db_->put(batch_, cf::FILE_BLOOM, key, value); + if (!status.ok()) { + throw_db_error("Failed to insert file bloom filter", status); + } +} + +void IndexDatabaseWriterContext::insert_file_bloom_filter( + int file_id, std::string_view dimension, const void* blob_data, + int blob_size, std::uint64_t num_entries) { + auto* bytes = static_cast(blob_data); + insert_file_bloom_filter(file_id, dimension, + std::span( + bytes, static_cast(blob_size)), + num_entries); +} + +void IndexDatabaseWriterContext::insert_chunk_statistics( + int file_id, std::uint64_t checkpoint_idx, const ChunkStatistics& stats) { + const auto key = chunk_stats_key(file_id, checkpoint_idx); + const auto value = encode_chunk_statistics_value(stats); + auto status = db_->put(batch_, cf::CHUNK_STATS, key, value); + if (!status.ok()) { + throw_db_error("Failed to insert chunk statistics", status); + } +} + +void IndexDatabaseWriterContext::insert_file_scalar_stats( + int file_id, const ChunkStatistics& stats, std::uint64_t num_chunks) { + const auto key = file_scalar_stats_key(file_id); + const auto value = encode_file_scalar_stats_value(stats, num_chunks); + auto status = db_->put(batch_, cf::FILE_SCALAR_STATS, key, value); + if (!status.ok()) { + throw_db_error("Failed to insert file scalar statistics", status); + } +} + +void IndexDatabaseWriterContext::insert_file_category_counts( + int file_id, const StringViewMap& counts) { + const auto key = file_category_counts_key(file_id); + const auto value = encode_count_map_value(counts); + auto status = db_->put(batch_, cf::FILE_CAT_COUNTS, key, value); + if (!status.ok()) { + throw_db_error("Failed to insert file category counts", status); + } +} + +void IndexDatabaseWriterContext::insert_file_pid_tid_counts( + int file_id, const StringViewMap& counts) { + const auto key = file_pid_tid_counts_key(file_id); + const auto value = encode_count_map_value(counts); + auto status = 
db_->put(batch_, cf::FILE_PID_TID_COUNTS, key, value); + if (!status.ok()) { + throw_db_error("Failed to insert file pid_tid counts", status); + } +} + +void IndexDatabaseWriterContext::insert_file_name_counts( + int file_id, const StringViewMap& counts) { + const auto key = file_name_counts_key(file_id); + const auto value = encode_name_summary_value(counts, 0, counts.size()); + auto status = db_->put(batch_, cf::FILE_NAME_COUNTS, key, value); + if (!status.ok()) { + throw_db_error("Failed to insert file name counts", status); + } +} + +std::uint64_t IndexDatabaseWriterContext::get_or_create_name_id( + std::string_view name) { + const auto name_id = hash::fnv1a_hash(name); + insert_name_dictionary_entry(name_id, name); + return name_id; +} + +void IndexDatabaseWriterContext::insert_name_dictionary_entry( + std::uint64_t name_id, std::string_view name) { + const auto encoded_id = rocks::KeyCodec::encode_be64(name_id); + auto status = db_->put(batch_, cf::NAME_DICTIONARY, name_lookup_key(name), + encoded_id); + if (!status.ok()) { + throw_db_error("Failed to insert name dictionary lookup", status); + } + status = db_->put(batch_, cf::NAME_DICTIONARY, name_reverse_key(name_id), + std::string(name)); + if (!status.ok()) { + throw_db_error("Failed to insert name dictionary reverse", status); + } +} + +void IndexDatabaseWriterContext::insert_name_file_posting(std::uint64_t name_id, + int file_id) { + const auto key = name_file_posting_key(name_id, file_id); + const auto owner_key = name_file_owner_key(file_id, name_id); + auto status = db_->put(batch_, cf::NAME_FILE_POSTINGS, key, ""); + if (!status.ok()) { + throw_db_error("Failed to insert name file posting", status); + } + status = db_->put(batch_, cf::NAME_FILE_POSTINGS, owner_key, ""); + if (!status.ok()) { + throw_db_error("Failed to insert name file owner posting", status); + } +} + +void IndexDatabaseWriterContext::insert_name_chunk_posting( + std::uint64_t name_id, int file_id, std::uint64_t checkpoint_idx) { + 
const auto key = name_chunk_posting_key(name_id, file_id, checkpoint_idx); + const auto owner_key = + name_chunk_owner_key(file_id, name_id, checkpoint_idx); + auto status = db_->put(batch_, cf::NAME_CHUNK_POSTINGS, key, ""); + if (!status.ok()) { + throw_db_error("Failed to insert name chunk posting", status); + } + status = db_->put(batch_, cf::NAME_CHUNK_POSTINGS, owner_key, ""); + if (!status.ok()) { + throw_db_error("Failed to insert name chunk owner posting", status); + } +} + +// Brings the root-level summaries up to date after one file was written. +// Falls back to rebuild_root_summaries() when the file already had a +// summary or when no root scalar record is stored yet. Otherwise the +// stored root records are read, `stats`, `num_chunks`, `file_lines` and +// `file_uncompressed_bytes` are merged in (num_files grows by one), and the +// updated values are queued on the batch. `file_id` is currently unused. +// NOTE(review): current values are read with db_->get() while updates go to +// batch_; if get() does not observe pending batch writes, several calls +// within one uncommitted batch would each start from the same stored value +// -- confirm against the db wrapper. +void IndexDatabaseWriterContext::refresh_root_summaries_after_file_write( + [[maybe_unused]] int file_id, const ChunkStatistics& stats, + std::uint64_t num_chunks, bool had_existing_file_summary, + std::uint64_t file_lines, std::uint64_t file_uncompressed_bytes) { + auto put_root_scalar = [&](const RootStatisticsResult& root) { + auto value = encode_root_scalar_stats_value( + root.stats, root.num_chunks, root.num_files, root.total_lines, + root.total_uncompressed_bytes); + auto status = db_->put(batch_, cf::ROOT_SCALAR_STATS, + root_scalar_stats_key(), value); + if (!status.ok()) { + throw_db_error("Failed to write root scalar statistics", status); + } + }; + + auto put_root_counts = [&](std::string_view cf_name, std::string_view key, + const auto& counts, + std::string_view error_message) { + auto value = encode_count_map_value(counts); + auto status = db_->put(batch_, cf_name, key, value); + if (!status.ok()) { + throw_db_error(error_message, status); + } + }; + + // Presumably the old contribution of a re-written file cannot be + // subtracted incrementally, so recompute from the per-file records. + if (had_existing_file_summary) { + rebuild_root_summaries(); + return; + } + + // Inline query_root_scalar_stats + std::optional root_scalar; + { + std::string value; + auto status = + db_->get(root_scalar_stats_key(), &value, cf::ROOT_SCALAR_STATS); + if (status.IsNotFound()) { + rebuild_root_summaries(); + return; + } + if (!status.ok()) { + throw_db_error("Failed to read root scalar statistics", status); + } + try { + DecodeContextGuard ctx("root_scalar_stats size=%zu", value.size()); + root_scalar =
decode_root_scalar_stats_value(value); + } catch (const std::exception& e) { + throw std::runtime_error("Corrupt root_scalar_stats payload size=" + + std::to_string(value.size()) + ": " + + e.what()); + } + } + + root_scalar->stats.merge_from(stats); + root_scalar->num_chunks += num_chunks; + root_scalar->num_files += 1; + root_scalar->total_lines += file_lines; + root_scalar->total_uncompressed_bytes += file_uncompressed_bytes; + put_root_scalar(*root_scalar); + + // Inline query_root_category_counts + std::unordered_map category_counts; + { + std::string value; + auto status = + db_->get(root_category_counts_key(), &value, cf::ROOT_CAT_COUNTS); + if (status.ok()) { + try { + DecodeContextGuard ctx("root_cat_counts size=%zu", + value.size()); + category_counts = decode_count_map_value(value); + } catch (const std::exception& e) { + throw std::runtime_error( + "Corrupt root_cat_counts payload size=" + + std::to_string(value.size()) + ": " + e.what()); + } + } else if (!status.IsNotFound()) { + throw_db_error("Failed to read root category counts", status); + } + } + for (const auto& [key, count] : stats.category_counts) { + category_counts[key] += count; + } + put_root_counts(cf::ROOT_CAT_COUNTS, root_category_counts_key(), + category_counts, "Failed to write root category counts"); + + // Inline query_root_name_counts + std::unordered_map name_counts; + { + std::string value; + auto status = + db_->get(root_name_counts_key(), &value, cf::ROOT_NAME_COUNTS); + if (status.ok()) { + try { + DecodeContextGuard ctx("root_name_counts size=%zu", + value.size()); + name_counts = decode_count_map_value(value); + } catch (const std::exception& e) { + throw std::runtime_error( + "Corrupt root_name_counts payload size=" + + std::to_string(value.size()) + ": " + e.what()); + } + } else if (!status.IsNotFound()) { + throw_db_error("Failed to read root name counts", status); + } + } + for (const auto& [key, count] : stats.name_counts) { + name_counts[key] += count; + } + 
put_root_counts(cf::ROOT_NAME_COUNTS, root_name_counts_key(), name_counts, + "Failed to write root name counts"); + + // Inline query_root_pid_tid_counts + std::unordered_map pid_tid_counts; + { + std::string value; + auto status = db_->get(root_pid_tid_counts_key(), &value, + cf::ROOT_PID_TID_COUNTS); + if (status.ok()) { + try { + DecodeContextGuard ctx("root_pid_tid_counts size=%zu", + value.size()); + pid_tid_counts = decode_count_map_value(value); + } catch (const std::exception& e) { + throw std::runtime_error( + "Corrupt root_pid_tid_counts payload size=" + + std::to_string(value.size()) + ": " + e.what()); + } + } else if (!status.IsNotFound()) { + throw_db_error("Failed to read root pid_tid counts", status); + } + } + for (const auto& [key, count] : stats.pid_tid_counts) { + pid_tid_counts[key] += count; + } + put_root_counts(cf::ROOT_PID_TID_COUNTS, root_pid_tid_counts_key(), + pid_tid_counts, "Failed to write root pid_tid counts"); +} + +void IndexDatabaseWriterContext::rebuild_root_summaries() { + auto put_root_scalar = [&](const RootStatisticsResult& root) { + auto value = encode_root_scalar_stats_value( + root.stats, root.num_chunks, root.num_files, root.total_lines, + root.total_uncompressed_bytes); + auto status = db_->put(batch_, cf::ROOT_SCALAR_STATS, + root_scalar_stats_key(), value); + if (!status.ok()) { + throw_db_error("Failed to write root scalar statistics", status); + } + }; + + auto put_root_counts = [&](std::string_view cf_name, std::string_view key, + const auto& counts, + std::string_view error_message) { + auto value = encode_count_map_value(counts); + auto status = db_->put(batch_, cf_name, key, value); + if (!status.ok()) { + throw_db_error(error_message, status); + } + }; + + RootStatisticsResult rebuilt; + + // Inline query_all_file_info_ids + std::unordered_map all_files; + internal::scan_prefix_iterator( + "Failed to scan file registry", "f|", + [this] { return db_->new_iterator(); }, + [&](::rocksdb::Iterator& it) { + auto key 
= iterator_key(it); + auto value = iterator_value(it); + all_files.emplace(key.substr(2), decode_file_id(value)); + }); + + std::vector file_ids; + file_ids.reserve(all_files.size()); + for (const auto& [_, existing_file_id] : all_files) { + file_ids.push_back(existing_file_id); + } + + rebuilt.num_files = static_cast(file_ids.size()); + + if (!file_ids.empty()) { + std::unordered_set wanted(file_ids.begin(), file_ids.end()); + const auto [min_it, max_it] = + std::minmax_element(file_ids.begin(), file_ids.end()); + const auto min_prefix = prefix_for_file(*min_it); + const int max_file_id = *max_it; + + auto it = db_->new_iterator(cf::FILE_SCALAR_STATS); + for (it->Seek(::rocksdb::Slice(min_prefix.data(), min_prefix.size())); + it->Valid(); it->Next()) { + auto key = iterator_key(*it); + int fid = decode_prefixed_file_id(key); + if (fid > max_file_id) break; + if (!wanted.contains(fid)) continue; + + auto value = iterator_value(*it); + try { + DecodeContextGuard ctx("file_scalar_stats file_id=%d size=%zu", + fid, value.size()); + auto row = decode_file_scalar_stats_value(value); + rebuilt.stats.merge_from(row.stats); + rebuilt.num_chunks += row.num_chunks; + } catch (const std::exception& e) { + throw std::runtime_error( + "Corrupt file_scalar_stats payload file_id=" + + std::to_string(fid) + + " size=" + std::to_string(value.size()) + ": " + e.what()); + } + } + const auto status = it->status(); + if (!status.ok()) { + throw_db_error("Failed to scan file scalar statistics", status); + } + } + + // Inline query_file_metadata_batch + if (!file_ids.empty()) { + std::unordered_set wanted(file_ids.begin(), file_ids.end()); + const auto [min_it, max_it] = + std::minmax_element(file_ids.begin(), file_ids.end()); + const auto min_prefix = prefix_for_file(*min_it); + const int max_file_id = *max_it; + + auto it = db_->new_iterator(cf::METADATA); + for (it->Seek(::rocksdb::Slice(min_prefix.data(), min_prefix.size())); + it->Valid(); it->Next()) { + auto key = 
iterator_key(*it); + int fid = decode_prefixed_file_id(key); + if (fid > max_file_id) { + break; + } + if (!wanted.contains(fid)) { + continue; + } + + auto value = iterator_value(*it); + DecodeContextGuard ctx("metadata file_id=%d size=%zu", fid, + value.size()); + auto decoded = decode_metadata_record(value); + rebuilt.total_lines += decoded[1]; + rebuilt.total_uncompressed_bytes += decoded[2]; + } + + const auto status = it->status(); + if (!status.ok()) { + throw IndexerError( + IndexerError::Type::DATABASE_ERROR, + "Failed to batch read file metadata: " + status.ToString()); + } + } + + if (!file_ids.empty()) { + std::unordered_set wanted(file_ids.begin(), file_ids.end()); + const auto [min_it, max_it] = + std::minmax_element(file_ids.begin(), file_ids.end()); + const auto min_prefix = prefix_for_file(*min_it); + const int max_file_id = *max_it; + + auto scan_counts = [&](std::string_view cf_name, + std::string_view error_message, auto& target_map, + auto for_each_entry_fn) { + auto it = db_->new_iterator(cf_name); + for (it->Seek( + ::rocksdb::Slice(min_prefix.data(), min_prefix.size())); + it->Valid(); it->Next()) { + auto key = iterator_key(*it); + int fid = decode_prefixed_file_id(key); + if (fid > max_file_id) break; + if (!wanted.contains(fid)) continue; + + auto value = iterator_value(*it); + DecodeContextGuard ctx("%.*s merge file_id=%d size=%zu", + static_cast(cf_name.size()), + cf_name.data(), fid, value.size()); + for_each_entry_fn(value, [&target_map](std::string_view k, + std::uint64_t count) { + auto entry = target_map.try_emplace(std::string(k), 0); + entry.first->second += count; + }); + } + const auto status = it->status(); + if (!status.ok()) { + throw_db_error(std::string(error_message), status); + } + }; + + scan_counts(cf::FILE_CAT_COUNTS, "Failed to scan file category counts", + rebuilt.stats.category_counts, + [](std::string_view v, auto cb) { + for_each_count_map_entry(v, cb); + }); + scan_counts( + cf::FILE_PID_TID_COUNTS, "Failed to 
scan file pid_tid counts", + rebuilt.stats.pid_tid_counts, [](std::string_view v, auto cb) { + for_each_count_map_entry(v, cb); + }); + scan_counts(cf::FILE_NAME_COUNTS, "Failed to scan file name counts", + rebuilt.stats.name_counts, [](std::string_view v, auto cb) { + for_each_name_summary_entry(v, cb); + }); + } + + put_root_scalar(rebuilt); + put_root_counts(cf::ROOT_CAT_COUNTS, root_category_counts_key(), + rebuilt.stats.category_counts, + "Failed to write root category counts"); + put_root_counts(cf::ROOT_NAME_COUNTS, root_name_counts_key(), + rebuilt.stats.name_counts, + "Failed to write root name counts"); + put_root_counts(cf::ROOT_PID_TID_COUNTS, root_pid_tid_counts_key(), + rebuilt.stats.pid_tid_counts, + "Failed to write root pid_tid counts"); +} + +void IndexDatabaseWriterContext::insert_checkpoint( + int file_id, const IndexerCheckpoint& checkpoint) { + const auto key = checkpoint_key(file_id, checkpoint.uc_offset, + checkpoint.checkpoint_idx); + const auto value = encode_checkpoint_value(checkpoint); + auto status = db_->put(batch_, rocks::cf::CHECKPOINTS, key, value); + if (!status.ok()) { + throw_db_error("Failed to insert checkpoint", status); + } +} + +void IndexDatabaseWriterContext::insert_index_dimension( + int file_id, std::string_view dimension) { + const auto key = make_dimension_key(file_id, dimension); + auto status = db_->put(batch_, cf::DIMENSIONS, key, ""); + if (!status.ok()) { + throw_db_error("Failed to insert index dimension", status); + } +} + +// Queues a hash-table entry in both directions in HASH_TABLES: +// hash_table_forward_key(type, hash) -> name +// hash_table_reverse_key(type, name) -> hash +// Each write's status is checked and reported through throw_db_error, like +// the other insert_* helpers, so a rejected write is no longer dropped +// silently. +void IndexDatabaseWriterContext::insert_hash_table_entry( + std::uint8_t type, std::string_view hash, std::string_view name) { + auto status = db_->put(batch_, cf::HASH_TABLES, + encoding::hash_table_forward_key(type, hash), name); + if (!status.ok()) { + throw_db_error("Failed to insert hash table forward entry", status); + } + status = db_->put(batch_, cf::HASH_TABLES, + encoding::hash_table_reverse_key(type, name), hash); + if (!status.ok()) { + throw_db_error("Failed to insert hash table reverse entry", status); + } +} + +void IndexDatabaseWriterContext::insert_aggregation_merge( + std::string_view key, std::string_view operand) { + auto status = db_->merge(batch_, cf::AGGREGATION, key, operand); +
if (!status.ok()) { + throw_db_error("Failed to merge aggregation operand", status); + } +} + +void IndexDatabaseWriterContext::insert_aggregation_put( + std::string_view key, std::string_view value) { + auto status = db_->put(batch_, cf::AGGREGATION, key, value); + if (!status.ok()) { + throw_db_error("Failed to put aggregation value", status); + } +} + +void IndexDatabaseWriterContext::insert_system_metrics_merge( + std::string_view key, std::string_view operand) { + auto status = db_->merge(batch_, cf::SYSTEM_METRICS, key, operand); + if (!status.ok()) { + throw_db_error("Failed to merge system metrics operand", status); + } +} + +void IndexDatabaseWriterContext::insert_chunk_dimension_stats( + int file_id, std::uint64_t checkpoint_idx, const ChunkDimensionStats& stats, + std::size_t value_counts_cap) { + const auto key = + chunk_dim_stats_key(file_id, checkpoint_idx, stats.dimension); + const auto value = + encode_chunk_dimension_stats_value(stats, value_counts_cap); + auto status = db_->put(batch_, cf::CHUNK_DIM_STATS, key, value); + if (!status.ok()) { + throw_db_error("Failed to insert chunk dimension stats", status); + } +} + +void IndexDatabaseWriterContext::insert_tar_archive_metadata( + int file_id, std::string_view archive_name, std::uint64_t checkpoint_size, + std::uint64_t total_lines, std::uint64_t total_uc_size, + std::uint64_t total_files) { + const auto key = tar_archive_key(file_id); + const auto value = encode_tar_archive_value( + archive_name, checkpoint_size, total_lines, total_uc_size, total_files); + auto status = db_->put(batch_, cf::ARCHIVES, key, value); + if (!status.ok()) { + throw_db_error("Failed to insert tar archive metadata", status); + } +} + +void IndexDatabaseWriterContext::insert_tar_file(int file_id, + const TarFileRecord& record) { + const auto key = + tar_file_key(file_id, record.uncompressed_offset, record.file_name); + const auto value = encode_tar_file_value(record); + auto status = db_->put(batch_, cf::TAR_FILES, key, 
value); + if (!status.ok()) { + throw_db_error("Failed to insert tar file metadata", status); + } +} + +void IndexDatabaseWriterContext::delete_chunk_bloom_filters( + int file_id, std::string_view dimension) { + std::vector keys; + std::string prefix = prefix_for_file(file_id); + prefix.append(dimension); + prefix.push_back('\0'); + scan_prefix(*db_, cf::CHUNK_BLOOM, prefix, [&](::rocksdb::Iterator& it) { + keys.push_back(iterator_key(it)); + }); + for (const auto& key : keys) { + auto status = db_->del(batch_, cf::CHUNK_BLOOM, key); + if (!status.ok()) + throw_db_error("Failed to delete chunk bloom", status); + } +} + +void IndexDatabaseWriterContext::delete_file_bloom_filter( + int file_id, std::string_view dimension) { + auto status = + db_->del(batch_, cf::FILE_BLOOM, file_bloom_key(file_id, dimension)); + if (!status.ok() && !status.IsNotFound()) { + throw_db_error("Failed to delete file bloom", status); + } +} + +void IndexDatabaseWriterContext::delete_chunk_statistics(int file_id) { + std::vector keys; + scan_prefix( + *db_, cf::CHUNK_STATS, prefix_for_file(file_id), + [&](::rocksdb::Iterator& it) { keys.push_back(iterator_key(it)); }); + for (const auto& key : keys) { + auto status = db_->del(batch_, cf::CHUNK_STATS, key); + if (!status.ok()) { + throw_db_error("Failed to delete chunk statistics", status); + } + } +} + +void IndexDatabaseWriterContext::delete_chunk_dimension_stats(int file_id) { + std::vector keys; + scan_prefix( + *db_, cf::CHUNK_DIM_STATS, prefix_for_file(file_id), + [&](::rocksdb::Iterator& it) { keys.push_back(iterator_key(it)); }); + for (const auto& key : keys) { + auto status = db_->del(batch_, cf::CHUNK_DIM_STATS, key); + if (!status.ok()) { + throw_db_error("Failed to delete chunk dimension stats", status); + } + } +} + +void IndexDatabaseWriterContext::delete_file_contents(int file_id) { + auto delete_prefix = [&](std::string_view cf_name, + std::string_view prefix) { + std::vector keys; + scan_prefix(*db_, cf_name, prefix, 
[&](::rocksdb::Iterator& it) { + keys.push_back(iterator_key(it)); + }); + for (const auto& key : keys) { + auto del_status = db_->del(batch_, cf_name, key); + if (!del_status.ok() && !del_status.IsNotFound()) { + throw_db_error("Failed to delete file-scoped RocksDB data", + del_status); + } + } + }; + + // Removes the name postings of one file by walking its owner keys. For + // every owner key under `prefix` the matching primary key is rebuilt as + // "n|" + first 4 payload bytes + be32(owner_fid), followed by the + // remaining payload bytes when chunk_level is set; then both the + // primary key and the owner key are queued for deletion. Owner keys + // whose payload is not exactly 4 bytes (file level) or 12 bytes (chunk + // level) are skipped without deleting anything. + // NOTE(review): name ids are std::uint64_t elsewhere in this file and + // the dictionary stores them with encode_be64. If the owner keys also + // carry an 8-byte id, every key would fail the size check below and no + // posting would ever be deleted -- confirm against + // name_file_owner_key / name_chunk_owner_key. + auto delete_name_postings_by_owner = [&](std::string_view cf_name, + int owner_fid, + std::string_view prefix, + bool chunk_level) { + std::vector owner_keys; + scan_prefix(*db_, cf_name, prefix, [&](::rocksdb::Iterator& it) { + owner_keys.push_back(iterator_key(it)); + }); + for (const auto& owner_key : owner_keys) { + if (owner_key.size() < prefix.size() + 4) { + continue; + } + const std::string_view payload(owner_key.data() + prefix.size(), + owner_key.size() - prefix.size()); + if ((!chunk_level && payload.size() != 4) || + (chunk_level && payload.size() != 12)) { + continue; + } + + std::string primary_key("n|"); + primary_key.append(payload.data(), 4); + rocks::KeyCodec::append_be32(primary_key, + static_cast(owner_fid)); + if (chunk_level) { + primary_key.append(payload.data() + 4, payload.size() - 4); + } + + auto del_one = [&](std::string_view key) { + auto del_status = db_->del(batch_, cf_name, key); + if (!del_status.ok() && !del_status.IsNotFound()) { + throw_db_error("Failed to delete exact name posting", + del_status); + } + }; + + del_one(primary_key); + del_one(owner_key); + } + }; + + delete_prefix(rocks::cf::CHECKPOINTS, prefix_for_file(file_id)); + delete_prefix(cf::METADATA, prefix_for_file(file_id)); + delete_prefix(cf::ARCHIVES, prefix_for_file(file_id)); + delete_prefix(cf::TAR_FILES, prefix_for_file(file_id)); + delete_prefix(cf::CHUNK_BLOOM, prefix_for_file(file_id)); + delete_prefix(cf::FILE_BLOOM, prefix_for_file(file_id)); + delete_prefix(cf::CHUNK_STATS, prefix_for_file(file_id)); + delete_prefix(cf::CHUNK_DIM_STATS, prefix_for_file(file_id)); + delete_prefix(cf::FILE_SCALAR_STATS, prefix_for_file(file_id)); +
delete_prefix(cf::FILE_CAT_COUNTS, prefix_for_file(file_id)); + delete_prefix(cf::FILE_NAME_COUNTS, prefix_for_file(file_id)); + delete_prefix(cf::FILE_PID_TID_COUNTS, prefix_for_file(file_id)); + delete_name_postings_by_owner(cf::NAME_FILE_POSTINGS, file_id, + name_file_owner_prefix(file_id), false); + delete_name_postings_by_owner(cf::NAME_CHUNK_POSTINGS, file_id, + name_chunk_owner_prefix(file_id), true); + delete_prefix(cf::DIMENSIONS, std::string("d|") + prefix_for_file(file_id)); + delete_prefix(cf::MANIFEST, std::string("E|") + prefix_for_file(file_id)); + delete_prefix(cf::MANIFEST, std::string("M|") + prefix_for_file(file_id)); + delete_prefix(cf::MANIFEST, std::string("P|") + prefix_for_file(file_id)); +} + +void IndexDatabaseWriterContext::insert_event_range( + int file_id, std::uint64_t checkpoint_idx, std::string_view cat, + std::string_view name, std::span line_numbers) { + const auto key = manifest_event_key(file_id, checkpoint_idx, cat, name); + const auto value = encode_event_range_value(line_numbers); + auto status = db_->put(batch_, cf::MANIFEST, key, value); + if (!status.ok()) { + throw_db_error("Failed to insert event range", status); + } +} + +void IndexDatabaseWriterContext::insert_metadata_lines( + int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type, + std::span line_numbers) { + const auto key = manifest_metadata_key(file_id, checkpoint_idx, meta_type); + const auto value = encode_metadata_value(line_numbers); + auto status = db_->put(batch_, cf::MANIFEST, key, value); + if (!status.ok()) { + throw_db_error("Failed to insert metadata lines", status); + } +} + +void IndexDatabaseWriterContext::insert_file_pids( + int file_id, const std::unordered_set& pids) { + const auto key = encoding::file_pids_key(file_id); + const auto value = encoding::encode_file_pids_value(pids); + auto status = db_->put(batch_, cf::MANIFEST, key, value); + if (!status.ok()) { + throw_db_error("Failed to insert file PIDs", status); + } +} + +void 
IndexDatabaseWriterContext::delete_event_ranges(int file_id) { + std::vector keys; + std::string prefix("E|"); + rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); + scan_prefix(*db_, cf::MANIFEST, prefix, [&](::rocksdb::Iterator& it) { + keys.push_back(iterator_key(it)); + }); + for (const auto& key : keys) { + auto status = db_->del(batch_, cf::MANIFEST, key); + if (!status.ok()) { + throw_db_error("Failed to delete manifest event ranges", status); + } + } +} + +void IndexDatabaseWriterContext::delete_metadata_lines(int file_id) { + std::vector keys; + std::string prefix("M|"); + rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); + scan_prefix(*db_, cf::MANIFEST, prefix, [&](::rocksdb::Iterator& it) { + keys.push_back(iterator_key(it)); + }); + for (const auto& key : keys) { + auto status = db_->del(batch_, cf::MANIFEST, key); + if (!status.ok()) { + throw_db_error("Failed to delete metadata lines", status); + } + } +} + +} // namespace dftracer::utils::utilities::indexer diff --git a/src/dftracer/utils/utilities/indexer/internal/common/gzip_inflater.h b/src/dftracer/utils/utilities/indexer/internal/common/gzip_inflater.h index 3c991f9a..d2842d10 100644 --- a/src/dftracer/utils/utilities/indexer/internal/common/gzip_inflater.h +++ b/src/dftracer/utils/utilities/indexer/internal/common/gzip_inflater.h @@ -72,18 +72,43 @@ class GzipInflater : public Inflater { * Read and analyze data for indexing purposes. * Uses Z_BLOCK to detect deflate boundaries and counts lines. */ - coro::CoroTask read(int fd, off_t& offset, - GzipInflaterResult& result) { + coro::CoroTask read(int fd, off_t& offset, GzipInflaterResult& result, + std::size_t max_input_bytes = 0) { + co_return co_await read_into(fd, offset, out_buffer(), BUFFER_SIZE, + result, max_input_bytes); + } + + /** + * Like read() but writes uncompressed output directly into the + * caller-provided buffer. Enables zero-copy hand-off to downstream + * consumers that own their own memory (e.g. 
parallel-inflate worker + * pools that cycle buffers through a channel without memcpy). + * + * The caller must keep `out_buf` alive for the duration of this call + * and not read it until the coroutine resumes with a successful + * result. + */ + coro::CoroTask read_into(int fd, off_t& offset, + unsigned char* out_buf, std::size_t out_cap, + GzipInflaterResult& result, + std::size_t max_input_bytes = 0) { result = {0, 0, false, 0}; - stream.next_out = out_buffer; - stream.avail_out = sizeof(out_buffer); + stream.next_out = out_buf; + stream.avail_out = static_cast(out_cap); while (stream.avail_out > 0) { // Read input if needed if (stream.avail_in == 0) { + std::size_t to_read = BUFFER_SIZE; + if (max_input_bytes != 0) { + if (total_input_bytes_ >= max_input_bytes) break; + const std::size_t remaining = + max_input_bytes - total_input_bytes_; + if (remaining < to_read) to_read = remaining; + } ssize_t n = co_await ::dftracer::utils::io::pread( - fd, in_buffer, sizeof(in_buffer), offset); + fd, in_buffer(), to_read, offset); if (n == 0) { break; // EOF } @@ -94,7 +119,7 @@ class GzipInflater : public Inflater { co_return false; // Return error } offset += n; - stream.next_in = in_buffer; + stream.next_in = in_buffer(); stream.avail_in = static_cast(n); total_input_bytes_ += static_cast(n); } @@ -108,17 +133,15 @@ class GzipInflater : public Inflater { stream.msg ? stream.msg : "no message"); break; } - // NOTE: inflateReset clears the zlib - // sliding window (state->whave = 0). If we continued filling - // the output buffer, the next deflate-block boundary would - // pass the "avail_out < sizeof" check (using pre-reset - // output) while inflateGetDictionary returns an empty window. - // Breaking here lets the caller consume the output produced - // so far, and the NEXT read() call starts with fresh output - // accounting so block-boundary checks only reflect post-reset - // output where the window is valid. + // If the member produced no output (e.g. 
an FEXTRA-only + // padding member emitted by the padded-striped writer), + // keep inflating so the caller never sees a spurious 0-byte + // read (which it would treat as EOF). Otherwise break so + // the caller can consume the output and we re-enter with + // fresh window-accounting post-reset. result.at_block_boundary = false; - break; + if (stream.avail_out < out_cap) break; + continue; } if (ret != Z_OK) { DFTRACER_UTILS_LOG_DEBUG( @@ -135,14 +158,14 @@ class GzipInflater : public Inflater { // before any data is decompressed. if ((stream.data_type & 0xc0) == 0x80) { result.at_block_boundary = true; - if (stream.avail_out < sizeof(out_buffer)) { + if (stream.avail_out < out_cap) { break; } } } - result.bytes_read = sizeof(out_buffer) - stream.avail_out; - result.lines_found = count_lines(out_buffer, result.bytes_read); + result.bytes_read = out_cap - stream.avail_out; + result.lines_found = count_lines(out_buf, result.bytes_read); result.input_bytes_consumed = total_input_bytes_ - stream.avail_in; co_return true; diff --git a/src/dftracer/utils/utilities/indexer/internal/common/gzip_member_scanner.h b/src/dftracer/utils/utilities/indexer/internal/common/gzip_member_scanner.h new file mode 100644 index 00000000..2df746a1 --- /dev/null +++ b/src/dftracer/utils/utilities/indexer/internal/common/gzip_member_scanner.h @@ -0,0 +1,107 @@ +#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_COMMON_GZIP_MEMBER_SCANNER_H +#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_COMMON_GZIP_MEMBER_SCANNER_H + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace dftracer::utils::utilities::indexer::internal { + +struct GzipMember { + std::uint64_t c_offset; // compressed byte offset of the 1F 8B header + std::uint64_t c_size; // compressed size of this member (bytes) +}; + +/// Cheap validation of a 10-byte gzip header candidate starting at buf[i]. 
+/// Rejects patterns that look like 1F 8B 08 but don't parse as a real +/// gzip header (FLG reserved bits, XFL, OS sanity). Callers must have +/// `end - buf[i] >= 10`. +inline bool gzip_header_looks_valid(const unsigned char* buf) noexcept { + if (buf[0] != 0x1F || buf[1] != 0x8B || buf[2] != 0x08) return false; + const unsigned char flg = buf[3]; + if (flg & 0xE0) return false; // reserved bits must be zero + const unsigned char xfl = buf[8]; + if (xfl != 0 && xfl != 2 && xfl != 4) return false; + const unsigned char os = buf[9]; + if (os > 13 && os != 255) return false; + return true; +} + +/// Scan `fd` in buffered pread windows, collecting compressed byte offsets +/// of every candidate gzip header. On return, `out` contains at least +/// one entry (offset 0 if the file starts with a valid gzip header), or +/// is empty if the file is not a gzip stream. `c_size` is populated as +/// the gap to the next member's offset; the last member's `c_size` +/// extends to file end. +/// +/// False positives (the byte pattern appearing inside compressed data +/// with a plausible header) are possible; callers must treat a returned +/// list as "candidate" and validate at inflate time. +inline coro::CoroTask enumerate_gzip_member_candidates( + int fd, std::uint64_t file_size, std::vector& out) { + out.clear(); + if (file_size < 18) + co_return false; // min gzip: 10 header + 2 deflate + 8 trailer + + // Window size tuned for Lustre sequential reads; keep a small overlap + // so a header straddling a window boundary is still seen by the scan. 
+ constexpr std::size_t WIN = 1 << 20; // 1 MiB + // A candidate needs HEADER_LEN bytes to validate, so the last + // HEADER_LEN - 1 bytes of a window are the only start positions the + // scan below cannot test. Carry exactly those bytes into the next + // window: carrying more would re-scan positions that were already + // tested and record the same header twice (the duplicate would then + // get c_size == 0). + constexpr std::size_t HEADER_LEN = 10; // gzip fixed header size + constexpr std::size_t OVERLAP = HEADER_LEN - 1; + + std::vector buf(WIN); + std::uint64_t pos = 0; + std::uint64_t carry = 0; // bytes at buf[0..carry) carried over, unscanned + + while (pos < file_size) { + const std::size_t want = + std::min(WIN - carry, file_size - pos); + ssize_t n = co_await dftracer::utils::io::pread( + fd, buf.data() + carry, want, static_cast(pos)); + if (n <= 0) { + if (out.empty()) co_return false; + break; + } + const std::size_t avail = carry + static_cast(n); + // File offset that buf[0] corresponds to. + const std::uint64_t base = pos - carry; + + // Only positions with a full HEADER_LEN bytes behind them are tested. + const std::size_t scan_end = + (avail >= HEADER_LEN) ? (avail - OVERLAP) : 0; + for (std::size_t i = 0; i < scan_end; ++i) { + if (buf[i] != 0x1F) continue; + if (gzip_header_looks_valid(buf.data() + i)) { + out.push_back({base + i, 0}); + } + } + + pos += static_cast(n); + + // Move the untested tail to the front so a header straddling the + // next read is still caught, exactly once. + if (avail >= OVERLAP) { + std::memmove(buf.data(), buf.data() + avail - OVERLAP, OVERLAP); + carry = OVERLAP; + } else { + // Fewer than OVERLAP bytes buffered: nothing was scanned, keep + // them all where they are. + carry = avail; + } + } + + if (out.empty()) co_return false; + + // Fill c_size: gap between consecutive candidates; last one extends to EOF.
+ for (std::size_t i = 0; i + 1 < out.size(); ++i) { + out[i].c_size = out[i + 1].c_offset - out[i].c_offset; + } + out.back().c_size = file_size - out.back().c_offset; + + co_return true; +} + +} // namespace dftracer::utils::utilities::indexer::internal + +#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_COMMON_GZIP_MEMBER_SCANNER_H diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.cpp b/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.cpp index 68f102b5..83628452 100644 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.cpp +++ b/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.cpp @@ -2,28 +2,35 @@ #include #include #include +#include #include -#include +#include #include +#include #include #include #include #include +#include #include #include #include #include #include +#include #include #include +#include +#include #include #include +#include +#include namespace dftracer::utils::utilities::indexer::internal::gzip { using dftracer::utils::utilities::indexer::IndexDatabase; -namespace rocks = dftracer::utils::rocksdb; namespace { @@ -50,7 +57,7 @@ void finalize_checkpoints(std::vector& checkpoints, } } -static dftracer::utils::coro::CoroTask process_chunks( +static dftracer::utils::coro::CoroTask process_chunks_serial( int fd, std::uint64_t ckpt_size, std::uint64_t& total_lines, std::uint64_t& total_uc_size, std::uint64_t& tail_line_count, std::vector& checkpoints, @@ -67,7 +74,6 @@ static dftracer::utils::coro::CoroTask process_chunks( std::uint64_t line_count_in_chunk = 0; std::uint64_t first_line_in_chunk = total_lines + 1; - std::string line_buf; const bool has_visitors = !visitors.empty(); while (true) { @@ -88,25 +94,16 @@ static dftracer::utils::coro::CoroTask process_chunks( line_count_in_chunk += result.lines_found; if (has_visitors) { - const auto* data = inflater.out_buffer; - const std::size_t n = result.bytes_read; - std::size_t seg_start = 0; - for (std::size_t i = 0; 
i < n; ++i) { - if (data[i] == '\n') { - line_buf.append( - reinterpret_cast(data + seg_start), - i - seg_start); - std::string_view line_sv(line_buf); - for (auto& visitor : visitors) { - visitor.get().on_line(line_sv, checkpoint_idx); - } - line_buf.clear(); - seg_start = i + 1; - } + const char* data = + reinterpret_cast(inflater.out_buffer()); + for (auto& visitor : visitors) { + co_await visitor.get().on_chunk(data, result.bytes_read, + checkpoint_idx); } - if (seg_start < n) { - line_buf.append(reinterpret_cast(data + seg_start), - n - seg_start); + for (auto& visitor : visitors) { + if (visitor.get().wants_drain()) { + co_await visitor.get().drain_pending(); + } } } @@ -135,7 +132,8 @@ static dftracer::utils::coro::CoroTask process_chunks( if (has_visitors) { for (auto& visitor : visitors) { - visitor.get().on_checkpoint(checkpoint_idx - 1); + co_await visitor.get().on_checkpoint( + checkpoint_idx - 1); } } @@ -147,17 +145,445 @@ static dftracer::utils::coro::CoroTask process_chunks( } } + if (has_visitors) { + for (auto& visitor : visitors) { + co_await visitor.get().flush(); + } + } + total_uc_size = current_uc_offset; tail_line_count = line_count_in_chunk; co_return true; } -static dftracer::utils::coro::CoroTask build_index( - IndexDatabase& db, int file_id, const std::string& gz_path, - std::uint64_t ckpt_size, const Indexer::VisitorList& visitors) { +// -- Parallel path --------------------------------------------------------- +// +// When the file is multi-member gzip (the dftracer runtime format), divide +// the members across N worker coroutines and stream inflated chunks through +// per-worker channels to a single dispatcher. +// +// Checkpoints are emitted by workers at mid-range deflate-block boundaries +// using GzipCheckpointer (identical semantics to the serial path). The +// dispatcher finalises each checkpoint with global uc_offset / line numbers +// and pushes it into the shared `checkpoints` vector in order. 
+ +struct ParallelInflateMsg { + std::unique_ptr> data; + // Per-worker monotonic sequence. Load-bearing: moodycamel (our channel + // backend) does not guarantee strict FIFO without producer tokens, so + // the dispatcher reorders by this before handing chunks to visitors. + std::uint64_t seq = 0; + std::uint64_t lines = 0; + bool has_checkpoint = false; + std::vector dict_compressed; + int bits = 0; + std::uint64_t ckpt_c_offset = 0; +}; + +using ParallelChan = dftracer::utils::coro::Channel; + +static dftracer::utils::coro::CoroTask parallel_worker( + int fd, std::uint64_t range_c_start, std::uint64_t range_c_end, + std::uint64_t ckpt_size, bool strip_leading_partial, + bool extend_to_newline_past_end, + dftracer::utils::coro::ChannelProducer producer) { + auto guard = producer.guard(); + + GzipInflater inflater; + if (!(co_await inflater.initialize(fd, range_c_start))) { + co_return false; + } + + off_t offset = static_cast(range_c_start); + const std::uint64_t range_c_size = range_c_end - range_c_start; + std::uint64_t local_uc = 0; + std::uint64_t last_ckpt_uc = 0; + std::uint64_t seq = 0; + bool leading_partial_done = !strip_leading_partial; + // When extending past end: once get_total_input_consumed() hits + // range_c_size, we keep reading (uncapped) until the uncompressed + // output contains a `\n`; the byte right after that `\n` belongs + // to the next slice, so we truncate this final chunk there. This + // captures the single split line that straddles the slice boundary + // so it isn't double-counted (the next slice strips its own leading + // partial prefix). + bool extending = false; + bool emit_done = false; + + while (true) { + GzipInflaterResult result; + // Cap pread at range_c_size normally. While extending (slice's + // last worker that must consume the straddling split line) the + // cap is lifted so we can read into the next slice's bytes far + // enough to find a `\n`. + const std::size_t input_cap = extending ? 
0 : range_c_size; + if (!(co_await inflater.read(fd, offset, result, input_cap))) { + co_return false; + } + if (result.bytes_read == 0) { + // `bytes_read==0` here can mean either true EOF or that the + // input cap was hit mid-inflate (inflater break'd out with + // no new output). For the latter, if this worker still + // needs to swallow the straddling split line, flip into + // extend mode and retry with an uncapped read. + if (extend_to_newline_past_end && !emit_done && !extending) { + extending = true; + continue; + } + break; + } + + // First-chunk leading-partial-line strip: when this worker is + // the first of a mid-file slice (range_c_start at a non-initial + // member's header), the first inflated bytes continue a line + // from the previous member (not owned by this slice). Skip up + // to and including the first `\n` so the dispatcher sees a + // clean line-boundary start. + std::size_t skip_prefix = 0; + if (!leading_partial_done) { + const unsigned char* out = inflater.out_buffer(); + for (std::size_t i = 0; i < result.bytes_read; ++i) { + if (out[i] == '\n') { + skip_prefix = i + 1; + leading_partial_done = true; + break; + } + } + if (!leading_partial_done) { + // No newline in this chunk; entire chunk is continuation + // of the previous slice's line. Count bytes but emit + // nothing and loop for more. + local_uc += result.bytes_read; + if (inflater.get_total_input_consumed() >= range_c_size) break; + continue; + } + } + + std::size_t emit_len = result.bytes_read - skip_prefix; + local_uc += result.bytes_read; + + // Extending past end: truncate the emit buffer at the first `\n` + // at/after the boundary. Everything after that belongs to the + // next slice. 
+ if (extending && !emit_done) { + const unsigned char* p = inflater.out_buffer() + skip_prefix; + for (std::size_t i = 0; i < emit_len; ++i) { + if (p[i] == '\n') { + emit_len = i + 1; + emit_done = true; + break; + } + } + } + + ParallelInflateMsg msg; + msg.seq = seq++; + // Line count adjusted for stripped prefix: approximate by + // counting newlines in the emitted region (cheap enough; pipeline + // uses counts only for statistics, not for correctness). + if (skip_prefix == 0) { + msg.lines = result.lines_found; + } else { + std::uint64_t lines = 0; + const unsigned char* p = inflater.out_buffer() + skip_prefix; + for (std::size_t i = 0; i < emit_len; ++i) { + if (p[i] == '\n') ++lines; + } + msg.lines = lines; + } + msg.data = std::make_unique>( + inflater.out_buffer() + skip_prefix, + inflater.out_buffer() + skip_prefix + emit_len); + + if (result.at_block_boundary && ckpt_size > 0 && + local_uc - last_ckpt_uc >= ckpt_size) { + const std::uint64_t absolute_c = + range_c_start + inflater.get_total_input_consumed(); + GzipCheckpointer cp(inflater, static_cast(local_uc)); + if (cp.create(static_cast(absolute_c))) { + std::vector dict; + if (cp.compress(dict)) { + msg.has_checkpoint = true; + msg.dict_compressed = std::move(dict); + msg.bits = cp.bits; + msg.ckpt_c_offset = absolute_c; + last_ckpt_uc = local_uc; + } + } + } + + if (!(co_await producer.send(std::move(msg)))) break; + + if (inflater.get_total_input_consumed() >= range_c_size) { + // Normal worker: stop at slice boundary. + // Extending worker: stop after we've emitted up to and + // including the first `\n` past the boundary. 
+ if (!extend_to_newline_past_end) break; + if (emit_done) break; + extending = true; + } + } + + co_return true; +} + +static dftracer::utils::coro::CoroTask parallel_dispatcher( + const std::vector>& chans, + std::uint64_t checkpoint_idx_base, std::uint64_t& total_lines, + std::uint64_t& total_uc_size, std::uint64_t& tail_line_count, + std::vector& checkpoints, + const Indexer::VisitorList& visitors) { + const bool has_visitors = !visitors.empty(); + std::uint64_t checkpoint_idx = checkpoint_idx_base; + std::uint64_t global_uc = 0; + std::uint64_t line_count_in_chunk = 0; + std::uint64_t first_line_in_chunk = total_lines + 1; + + std::uint64_t total_chunks_received = 0; + + auto process_msg = + [&](ParallelInflateMsg& msg) -> dftracer::utils::coro::CoroTask { + const std::size_t data_len = msg.data ? msg.data->size() : 0; + ++total_chunks_received; + + if (has_visitors && data_len > 0) { + const char* data = reinterpret_cast(msg.data->data()); + for (auto& v : visitors) { + co_await v.get().on_chunk(data, data_len, checkpoint_idx); + } + } + + global_uc += data_len; + total_lines += msg.lines; + line_count_in_chunk += msg.lines; + + if (msg.has_checkpoint) { + IndexerCheckpoint checkpoint{ + .checkpoint_idx = checkpoint_idx++, + .uc_offset = global_uc, + .uc_size = 0, + .c_offset = msg.ckpt_c_offset, + .c_size = 0, + .bits = msg.bits, + .dict_compressed = std::move(msg.dict_compressed), + .num_lines = line_count_in_chunk, + .first_line_num = first_line_in_chunk, + .last_line_num = total_lines, + }; + checkpoints.push_back(std::move(checkpoint)); + + if (has_visitors) { + for (auto& v : visitors) { + co_await v.get().on_checkpoint(checkpoint_idx - 1); + } + } + + line_count_in_chunk = 0; + first_line_in_chunk = total_lines + 1; + } + co_return; + }; + + // Per-worker reorder buffer: moodycamel::ConcurrentQueue (backing our + // coro::Channel) does not guarantee strict FIFO without explicit + // producer tokens, so we re-sort by msg.seq here. 
Channel capacity is + // bounded so the buffer is also bounded (~channel capacity entries). + auto drain_visitors = [&]() -> dftracer::utils::coro::CoroTask { + for (auto& v : visitors) { + if (v.get().wants_drain()) { + co_await v.get().drain_pending(); + } + } + }; + + for (auto& chan : chans) { + std::uint64_t expected_seq = 0; + std::map pending; + while (auto msg_opt = co_await chan->receive()) { + auto& incoming = *msg_opt; + if (incoming.seq == expected_seq) { + co_await process_msg(incoming); + co_await drain_visitors(); + ++expected_seq; + auto it = pending.find(expected_seq); + while (it != pending.end()) { + co_await process_msg(it->second); + co_await drain_visitors(); + pending.erase(it); + ++expected_seq; + it = pending.find(expected_seq); + } + } else { + pending.emplace(incoming.seq, std::move(incoming)); + } + } + while (!pending.empty()) { + auto it = pending.begin(); + if (it->first != expected_seq) break; + co_await process_msg(it->second); + co_await drain_visitors(); + pending.erase(it); + ++expected_seq; + } + } + + if (has_visitors) { + for (auto& v : visitors) co_await v.get().flush(); + } + total_uc_size = global_uc; + tail_line_count = line_count_in_chunk; + co_return true; +} + +static dftracer::utils::coro::CoroTask process_chunks_parallel( + CoroScope* scope, int fd, std::uint64_t slice_c_end, + std::uint64_t file_size, std::vector members, + std::uint64_t ckpt_size, std::uint64_t checkpoint_idx_base, + bool strip_slice_leading_partial, std::uint64_t& total_lines, + std::uint64_t& total_uc_size, std::uint64_t& tail_line_count, + std::vector& checkpoints, + const Indexer::VisitorList& visitors) { + // Cap worker count at member count and a reasonable default. 
+ constexpr std::size_t DEFAULT_MAX_WORKERS = 16; + constexpr std::size_t CHAN_CAP = 4; + const std::size_t num_workers = + std::min(DEFAULT_MAX_WORKERS, members.size()); + + std::vector> chans; + chans.reserve(num_workers); + for (std::size_t i = 0; i < num_workers; ++i) { + chans.push_back( + dftracer::utils::coro::make_channel(CHAN_CAP)); + } + + // Partition members contiguously, remainder spread over the first few + // workers so range counts differ by at most 1. + std::vector> ranges(num_workers); + { + const std::size_t per = members.size() / num_workers; + const std::size_t rem = members.size() % num_workers; + std::size_t cursor = 0; + for (std::size_t w = 0; w < num_workers; ++w) { + const std::size_t count = per + (w < rem ? 1 : 0); + ranges[w] = {cursor, cursor + count}; + cursor += count; + } + } + + bool dispatcher_ok = true; + std::shared_ptr> members_shared = + std::make_shared>(std::move(members)); + + co_await scope->scope([&](CoroScope& child) + -> dftracer::utils::coro::CoroTask { + for (std::size_t w = 0; w < num_workers; ++w) { + const auto [rs, re] = ranges[w]; + const std::uint64_t c_start = (*members_shared)[rs].c_offset; + const std::uint64_t c_end = (re < members_shared->size()) + ? (*members_shared)[re].c_offset + : slice_c_end; + auto producer = chans[w]->producer(); + // Only the very first worker of a mid-file slice needs + // to strip the leading partial line; subsequent workers + // see contiguous (whole-line-aligned) data from their + // predecessor's stream. + const bool strip_this = strip_slice_leading_partial && (w == 0); + // The LAST worker of a NON-LAST slice extends past + // `slice_c_end` to capture the line that straddles the + // slice boundary. If this slice's end is file end, the + // slice IS the last one -- no extension needed. 
+ const bool extend_this = + (w + 1 == num_workers) && (slice_c_end < file_size); + child.spawn([fd, c_start, c_end, ckpt_size, strip_this, extend_this, + producer = std::move(producer)](CoroScope&) mutable + -> dftracer::utils::coro::CoroTask { + co_await parallel_worker(fd, c_start, c_end, ckpt_size, + strip_this, extend_this, + std::move(producer)); + }); + } + + child.spawn([&chans, checkpoint_idx_base, &total_lines, &total_uc_size, + &tail_line_count, &checkpoints, &visitors, &dispatcher_ok]( + CoroScope&) -> dftracer::utils::coro::CoroTask { + dispatcher_ok = co_await parallel_dispatcher( + chans, checkpoint_idx_base, total_lines, total_uc_size, + tail_line_count, checkpoints, visitors); + }); + + co_return; + }); + + co_return dispatcher_ok; +} + +static dftracer::utils::coro::CoroTask process_chunks( + CoroScope* scope, int fd, std::uint64_t ckpt_size, + const GzipMemberSlice* slice, std::uint64_t& total_lines, + std::uint64_t& total_uc_size, std::uint64_t& tail_line_count, + std::vector& checkpoints, + const Indexer::VisitorList& visitors) { + // Pre-scanned slice path: caller supplied the member map and a range. + // Used by the MPI/distributed indexer to split one file across ranks + // without re-scanning. `checkpoint_idx_base` disambiguates keys so + // multiple slices of the same file_id produce disjoint SST entries. + if (scope != nullptr && slice != nullptr && slice->members != nullptr && + slice->member_end > slice->member_begin) { + struct stat st; + if (::fstat(fd, &st) != 0) co_return false; + const std::uint64_t file_size = static_cast(st.st_size); + const auto& all = *slice->members; + const std::size_t mb = slice->member_begin; + const std::size_t me = slice->member_end; + if (me > all.size() || mb >= me) co_return false; + // Slice end: next-member offset if this isn't the last slice of + // the file, else EOF. Crucial: a non-last slice's workers must + // not inflate past this boundary into another slice's bytes. 
+ const std::uint64_t slice_c_end = + (me < all.size()) ? all[me].c_offset : file_size; + std::vector sliced(all.begin() + mb, all.begin() + me); + const bool strip_leading = (mb > 0); + co_return co_await process_chunks_parallel( + scope, fd, slice_c_end, file_size, std::move(sliced), ckpt_size, + slice->checkpoint_idx_base, strip_leading, total_lines, + total_uc_size, tail_line_count, checkpoints, visitors); + } + + // Try to discover member boundaries so we can parallelise. The scan is + // zero-copy, sequential, and fast relative to inflate; for single-member + // files we fall through to the scan-then-resume path which captures + // internal deflate-block checkpoints to fan out anyway. + if (scope != nullptr) { + struct stat st; + if (::fstat(fd, &st) == 0 && st.st_size >= 18) { + std::vector members; + const bool scan_ok = co_await enumerate_gzip_member_candidates( + fd, static_cast(st.st_size), members); + const std::uint64_t sz = static_cast(st.st_size); + if (scan_ok && members.size() >= 2) { + co_return co_await process_chunks_parallel( + scope, fd, sz, sz, std::move(members), ckpt_size, + /*checkpoint_idx_base=*/0, + /*strip_slice_leading_partial=*/false, total_lines, + total_uc_size, tail_line_count, checkpoints, visitors); + } + } + } + + co_return co_await process_chunks_serial(fd, ckpt_size, total_lines, + total_uc_size, tail_line_count, + checkpoints, visitors); +} + +} // namespace + +dftracer::utils::coro::CoroTask> +build_gzip_index_artifacts(const std::string& gz_path, std::uint64_t ckpt_size, + const Indexer::VisitorList& visitors, + CoroScope* scope, const GzipMemberSlice* slice) { int fd = ::open(gz_path.c_str(), O_RDONLY); if (fd < 0) { - co_return false; + co_return std::nullopt; } if (!visitors.empty()) { @@ -174,35 +600,34 @@ static dftracer::utils::coro::CoroTask build_index( std::uint64_t tail_line_count = 0; std::vector checkpoints; - const bool success = - co_await process_chunks(fd, ckpt_size, total_lines, total_uc_size, - 
tail_line_count, checkpoints, visitors); + const bool success = co_await process_chunks( + scope, fd, ckpt_size, slice, total_lines, total_uc_size, + tail_line_count, checkpoints, visitors); ::close(fd); if (!success) { - co_return false; + co_return std::nullopt; } finalize_checkpoints(checkpoints, total_uc_size, total_lines, tail_line_count); - auto* db_ptr = &db; - auto* checkpoints_ptr = &checkpoints; - co_await rocks::run([db_ptr, file_id, ckpt_size, total_lines, total_uc_size, - checkpoints_ptr] { - internal::TransactionScope txn(*db_ptr); - for (const auto& checkpoint : *checkpoints_ptr) { - db_ptr->insert_checkpoint(file_id, checkpoint); - } - db_ptr->insert_file_metadata(file_id, ckpt_size, total_lines, - total_uc_size); - txn.commit(); - }); - - co_return true; + GzipBuildArtifacts artifacts; + artifacts.checkpoint_size = ckpt_size; + artifacts.total_lines = total_lines; + artifacts.total_uc_size = total_uc_size; + artifacts.checkpoints = std::move(checkpoints); + co_return artifacts; } -} // namespace +void persist_gzip_index_artifacts(IndexDatabaseWriterContext& db, int file_id, + const GzipBuildArtifacts& artifacts) { + for (const auto& checkpoint : artifacts.checkpoints) { + db.insert_checkpoint(file_id, checkpoint); + } + db.insert_file_metadata(file_id, artifacts.checkpoint_size, + artifacts.total_lines, artifacts.total_uc_size); +} GzipIndexer::GzipIndexer(const std::string& gz_path_, const std::string& idx_path_, std::uint64_t ckpt_size_, @@ -299,42 +724,36 @@ dftracer::utils::coro::CoroTask GzipIndexer::build_async() const { const std::uint64_t final_ckpt_size = determine_checkpoint_size(ckpt_size, gz_path); const std::string logical = gz_path_logical_path; - const auto* logical_ptr = &logical; - const int file_id = co_await rocks::run([db_ptr = &db, logical_ptr, hash] { - return db_ptr->get_or_create_file_info(*logical_ptr, hash); - }); + auto writer = db.begin_write(); + const int file_id = writer->get_or_create_file_info(logical, hash); + 
writer->commit(); - if (!(co_await build_index(db, file_id, gz_path, final_ckpt_size, - visitors_))) { + auto artifacts = co_await build_gzip_index_artifacts( + gz_path, final_ckpt_size, visitors_, nullptr); + if (!artifacts) { throw IndexerError(IndexerError::Type::BUILD_ERROR, "Failed to build index for " + gz_path); } + { + auto w = db.begin_write(); + persist_gzip_index_artifacts(*w, file_id, *artifacts); + w->commit(); + } + (void)mtime; (void)bytes; - struct CacheSnapshot { - std::uint64_t num_lines = 0; - std::uint64_t max_bytes = 0; - std::vector checkpoints; - }; - auto snapshot = co_await rocks::run([db_ptr = &db, file_id] { - CacheSnapshot cache; - cache.num_lines = db_ptr->get_num_lines(file_id); - cache.max_bytes = db_ptr->get_max_bytes(file_id); - cache.checkpoints = db_ptr->query_checkpoints(file_id); - return cache; - }); cached_is_valid = true; cached_file_id = file_id; cached_checkpoint_size = final_ckpt_size; cached_checkpoint_size_ready = true; - cached_num_lines = snapshot.num_lines; + cached_num_lines = db.get_num_lines(file_id); cached_num_lines_ready = true; - cached_max_bytes = snapshot.max_bytes; + cached_max_bytes = db.get_max_bytes(file_id); cached_max_bytes_ready = true; std::lock_guard lock(cached_checkpoints_mutex); - cached_checkpoints = std::move(snapshot.checkpoints); + cached_checkpoints = db.query_checkpoints(file_id); co_return; } diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h b/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h index 94c0f04b..21f73e82 100644 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h +++ b/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h @@ -4,20 +4,64 @@ #include #include #include +#include #include +#include #include #include +#include #include #include #include #include #include +#include #include #include namespace dftracer::utils::utilities::indexer::internal::gzip { +struct GzipBuildArtifacts { + std::uint64_t 
checkpoint_size = 0; + std::uint64_t total_lines = 0; + std::uint64_t total_uc_size = 0; + std::vector checkpoints; +}; + +/// Optional slice of a multi-member gzip file. When set, the indexer +/// processes only members `[member_begin, member_end)` of the file +/// (byte range `[members[member_begin].c_offset, members[member_end-1] +/// .c_offset + members[member_end-1].c_size)`). Used for cross-rank +/// splitting of large files; uc_offsets/line numbers in emitted +/// checkpoints are slice-local and `checkpoint_idx` is offset by +/// `checkpoint_idx_base` so multiple ranks writing the same file_id +/// produce disjoint keys. +struct GzipMemberSlice { + const std::vector *members = nullptr; + std::size_t member_begin = 0; + std::size_t member_end = 0; // exclusive + std::uint64_t checkpoint_idx_base = 0; +}; + +/// Build gzip index artifacts (checkpoints, dispatched visitor events). +/// +/// When `scope` is non-null and the input is multi-member gzip (the +/// dftracer runtime format), the inflate pass is parallelised across the +/// scope's executor. On single-member files or when `scope` is null, +/// falls back to the serial inflate loop with identical semantics. +/// +/// When `slice` is non-null, only the specified member range is +/// processed. The caller is responsible for ensuring `slice->members` +/// outlives this coroutine. 
+coro::CoroTask> build_gzip_index_artifacts( + const std::string &gz_path, std::uint64_t ckpt_size, + const Indexer::VisitorList &visitors, CoroScope *scope = nullptr, + const GzipMemberSlice *slice = nullptr); + +void persist_gzip_index_artifacts(IndexDatabaseWriterContext &db, int file_id, + const GzipBuildArtifacts &artifacts); + class GzipIndexer : public Indexer { public: static constexpr std::uint64_t DEFAULT_CHECKPOINT_SIZE = diff --git a/src/dftracer/utils/utilities/indexer/internal/helpers.cpp b/src/dftracer/utils/utilities/indexer/internal/helpers.cpp index e16da3ee..bbd1dd4f 100644 --- a/src/dftracer/utils/utilities/indexer/internal/helpers.cpp +++ b/src/dftracer/utils/utilities/indexer/internal/helpers.cpp @@ -40,7 +40,6 @@ std::string normalize_index_root(std::string_view path) { time_t get_file_modification_time(const std::string &file_path) { #if defined(DFTRACER_UTILS_USE_STD_FS) - // Use std::filesystem when available and working auto ftime = fs::last_write_time(file_path); auto sctp = std::chrono::time_point_cast( diff --git a/src/dftracer/utils/utilities/indexer/internal/index_batch_writer.h b/src/dftracer/utils/utilities/indexer/internal/index_batch_writer.h new file mode 100644 index 00000000..8b36b804 --- /dev/null +++ b/src/dftracer/utils/utilities/indexer/internal/index_batch_writer.h @@ -0,0 +1,120 @@ +#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_INDEX_BATCH_WRITER_H +#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_INDEX_BATCH_WRITER_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::utilities::indexer::internal { + +using composites::dft::visitors::BloomVisitor; +using composites::dft::visitors::HashTableVisitor; +using composites::dft::visitors::ManifestVisitor; + +struct ParsedIndexJob { + int file_id = 0; + std::string file_path; + gzip::GzipBuildArtifacts artifacts; + std::unique_ptr bloom_visitor; + std::unique_ptr 
hash_table_visitor; + std::unique_ptr manifest_visitor; + bool success = true; + std::string error_message; +}; + +struct BatchWriterMetrics { + std::atomic write_ns{0}; + std::atomic files_written{0}; + std::atomic batches_committed{0}; +}; + +/// Drain `channel`, group `ParsedIndexJob`s into batches of `batch_size`, +/// and commit each batch through a fresh `IndexBatchSink` produced by +/// `make_sink()`. The caller-provided `commit_sink(sink)` finalises the +/// batch: for RocksDB-backed sinks it calls `.commit()`; for SST-backed +/// sinks it flushes to disk and routes `Artifacts` to a registry. +/// +/// `MakeSink` must be invocable as `() -> std::unique_ptr` +/// (or any subclass thereof). `CommitSink` must be invocable as +/// `(IndexBatchSink&) -> void`. +template +inline coro::CoroTask index_batch_write_worker( + coro::Channel* channel, std::size_t batch_size, + BatchWriterMetrics* metrics, MakeSink make_sink, CommitSink commit_sink) { + std::vector batch; + batch.reserve(batch_size); + + auto flush = [&]() { + if (batch.empty()) return; + auto start = std::chrono::steady_clock::now(); + + auto sink_owned = make_sink(); + IndexBatchSink& sink = *sink_owned; + for (auto& job : batch) { + if (!job.success) continue; + try { + for (const auto& checkpoint : job.artifacts.checkpoints) { + sink.insert_checkpoint(job.file_id, checkpoint); + } + sink.insert_file_metadata( + job.file_id, job.artifacts.checkpoint_size, + job.artifacts.total_lines, job.artifacts.total_uc_size); + if (job.bloom_visitor) { + job.bloom_visitor->finalize_sink_only(sink, job.file_id); + } + if (job.hash_table_visitor) { + job.hash_table_visitor->finalize(sink, job.file_id); + } + if (job.manifest_visitor) { + job.manifest_visitor->finalize(sink, job.file_id); + } + } catch (const std::exception& e) { + job.success = false; + job.error_message = e.what(); + } + } + commit_sink(sink); + + auto end = std::chrono::steady_clock::now(); + if (metrics) { + metrics->write_ns.fetch_add( + 
static_cast( + std::chrono::duration_cast(end - + start) + .count()), + std::memory_order_relaxed); + std::size_t written = 0; + for (const auto& job : batch) { + if (job.success) ++written; + } + metrics->files_written.fetch_add(written, + std::memory_order_relaxed); + metrics->batches_committed.fetch_add(1, std::memory_order_relaxed); + } + batch.clear(); + }; + + while (auto item = co_await channel->receive()) { + batch.push_back(std::move(*item)); + if (batch.size() >= batch_size) { + flush(); + } + } + flush(); + co_return; +} + +} // namespace dftracer::utils::utilities::indexer::internal + +#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_INDEX_BATCH_WRITER_H diff --git a/src/dftracer/utils/utilities/indexer/internal/index_encoding.cpp b/src/dftracer/utils/utilities/indexer/internal/index_encoding.cpp new file mode 100644 index 00000000..2b535767 --- /dev/null +++ b/src/dftracer/utils/utilities/indexer/internal/index_encoding.cpp @@ -0,0 +1,309 @@ +#include +#include +#include + +#include +#include + +namespace dftracer::utils::utilities::indexer::internal::encoding { + +namespace { +namespace rocks = dftracer::utils::rocksdb; +} // namespace + +std::string prefix_for_file(int file_id) { + return rocks::KeyCodec::encode_be32(static_cast(file_id)); +} + +std::string metadata_key(int file_id) { return prefix_for_file(file_id); } + +std::string checkpoint_key(int file_id, std::uint64_t uc_offset, + std::uint64_t checkpoint_idx) { + std::string key = prefix_for_file(file_id); + append_u64(key, uc_offset); + append_u64(key, checkpoint_idx); + return key; +} + +std::string manifest_event_key(int file_id, std::uint64_t checkpoint_idx, + std::string_view cat, std::string_view name) { + std::string key("E|"); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + append_u64(key, checkpoint_idx); + key.append(cat); + key.push_back('\0'); + key.append(name); + return key; +} + +std::string manifest_metadata_key(int file_id, std::uint64_t checkpoint_idx, + 
std::string_view meta_type) { + std::string key("M|"); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + append_u64(key, checkpoint_idx); + key.append(meta_type); + return key; +} + +std::string encode_metadata_record(std::uint64_t checkpoint_size, + std::uint64_t total_lines, + std::uint64_t total_uc_size) { + std::string value; + append_u64(value, checkpoint_size); + append_u64(value, total_lines); + append_u64(value, total_uc_size); + return value; +} + +std::string encode_checkpoint_value(const IndexerCheckpoint& checkpoint) { + std::string value; + append_u64(value, checkpoint.uc_size); + append_u64(value, checkpoint.c_offset); + append_u64(value, checkpoint.c_size); + append_i64(value, checkpoint.bits); + append_blob(value, checkpoint.dict_compressed); + append_u64(value, checkpoint.num_lines); + append_u64(value, checkpoint.first_line_num); + append_u64(value, checkpoint.last_line_num); + return value; +} + +namespace { + +// Packs lines directly into `out` as the `blob` payload of append_blob's +// wire format: u32 byte-length followed by raw little-endian uint32s. 
+void append_line_numbers_blob(std::string& out, + std::span lines) { + const auto bytes = + static_cast(lines.size() * sizeof(std::uint32_t)); + rocks::KeyCodec::append_be32(out, bytes); + if (!lines.empty()) { + out.append(reinterpret_cast(lines.data()), bytes); + } +} + +} // namespace + +std::string encode_event_range_value(std::span lines) { + std::string value; + value.reserve(sizeof(std::uint64_t) + sizeof(std::uint32_t) + + lines.size() * sizeof(std::uint32_t)); + append_u64(value, lines.size()); + append_line_numbers_blob(value, lines); + return value; +} + +std::string encode_metadata_value(std::span lines) { + std::string value; + value.reserve(sizeof(std::uint32_t) + lines.size() * sizeof(std::uint32_t)); + append_line_numbers_blob(value, lines); + return value; +} + +std::string file_pids_key(int file_id) { + std::string key("P|"); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + return key; +} + +std::string make_dimension_key(int file_id, std::string_view dimension) { + std::string key("d|"); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + key.append(dimension); + return key; +} + +std::string chunk_bloom_key(int file_id, std::string_view dimension, + std::uint64_t checkpoint_idx) { + std::string key = prefix_for_file(file_id); + key.append(dimension); + key.push_back('\0'); + append_u64(key, checkpoint_idx); + return key; +} + +std::string file_bloom_key(int file_id, std::string_view dimension) { + std::string key = prefix_for_file(file_id); + key.append(dimension); + return key; +} + +std::string chunk_stats_key(int file_id, std::uint64_t checkpoint_idx) { + std::string key = prefix_for_file(file_id); + append_u64(key, checkpoint_idx); + return key; +} + +std::string file_scalar_stats_key(int file_id) { + return prefix_for_file(file_id); +} + +std::string file_category_counts_key(int file_id) { + return prefix_for_file(file_id); +} + +std::string file_pid_tid_counts_key(int file_id) { + return prefix_for_file(file_id); +} + 
+std::string file_name_counts_key(int file_id) { + return prefix_for_file(file_id); +} + +std::string chunk_dim_stats_key(int file_id, std::uint64_t checkpoint_idx, + std::string_view dimension) { + std::string key = prefix_for_file(file_id); + append_u64(key, checkpoint_idx); + key.append(dimension); + return key; +} + +std::string encode_bloom_value(std::span blob, + std::uint64_t num_entries) { + std::string value; + append_u64(value, num_entries); + value.append(reinterpret_cast(blob.data()), blob.size()); + return value; +} + +std::string encode_chunk_statistics_value( + const composites::dft::indexing::ChunkStatistics& stats) { + std::string value; + append_u64(value, stats.total_events); + append_u64(value, stats.min_timestamp_us); + append_u64(value, stats.max_timestamp_us); + append_i64(value, stats.duration_sum_us); + append_u64(value, stats.duration_min_us); + append_u64(value, stats.duration_max_us); + append_u64(value, stats.duration_count); + append_double(value, stats.duration_m2); + + auto duration_sketch = stats.duration_sketch.serialize(); + append_blob(value, duration_sketch); + + auto duration_histogram = stats.duration_histogram.to_json(); + append_string(value, duration_histogram); + + auto name_sketches = stats.serialize_name_duration_sketches(); + append_blob(value, name_sketches); + append_string(value, stats.name_duration_histograms_json()); + append_string(value, stats.name_duration_sums_json()); + append_string(value, stats.name_duration_sum_sqs_json()); + append_string(value, stats.name_category_json()); + + auto ts_hist = stats.timestamp_histogram.serialize(); + append_blob(value, ts_hist); + + return value; +} + +std::string encode_chunk_dimension_stats_value( + const composites::dft::indexing::ChunkDimensionStats& stats, + std::size_t value_counts_cap) { + std::string value; + append_u64(value, stats.distinct_count); + append_string(value, stats.min_value); + append_string(value, stats.max_value); + append_string(value, 
stats.value_type); + auto compressed = stats.compress_value_counts(value_counts_cap); + append_u8(value, compressed.has_value() ? 1 : 0); + if (compressed) { + append_blob(value, *compressed); + } + return value; +} + +std::string name_lookup_key(std::string_view name) { + std::string key("s|"); + key.append(name); + return key; +} + +std::string name_reverse_key(std::uint64_t name_id) { + std::string key("i|"); + append_u64(key, name_id); + return key; +} + +std::string name_file_posting_key(std::uint64_t name_id, int file_id) { + std::string key("n|"); + append_u64(key, name_id); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + return key; +} + +std::string name_file_owner_key(int file_id, std::uint64_t name_id) { + std::string key("o|"); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + append_u64(key, name_id); + return key; +} + +std::string name_file_owner_prefix(int file_id) { + std::string key("o|"); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + return key; +} + +std::string name_chunk_posting_key(std::uint64_t name_id, int file_id, + std::uint64_t checkpoint_idx) { + std::string key("n|"); + append_u64(key, name_id); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + append_u64(key, checkpoint_idx); + return key; +} + +std::string name_chunk_owner_key(int file_id, std::uint64_t name_id, + std::uint64_t checkpoint_idx) { + std::string key("o|"); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + append_u64(key, name_id); + append_u64(key, checkpoint_idx); + return key; +} + +std::string name_chunk_owner_prefix(int file_id) { + std::string key("o|"); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + return key; +} + +std::string hash_table_forward_key(std::uint8_t type, std::string_view hash) { + std::string key; + key.reserve(1 + hash.size()); + key.push_back(static_cast(type)); + key.append(hash); + return key; +} + +std::string hash_table_reverse_key(std::uint8_t type, std::string_view 
name) { + std::string key; + key.reserve(1 + name.size()); + key.push_back(static_cast(type + 4)); + key.append(name); + return key; +} + +std::string encode_file_pids_value( + const std::unordered_set& pids) { + std::vector sorted_pids(pids.begin(), pids.end()); + std::sort(sorted_pids.begin(), sorted_pids.end()); + + std::string value; + auto encode_varint = [&value](std::uint64_t v) { + while (v >= 0x80) { + value.push_back(static_cast(v | 0x80)); + v >>= 7; + } + value.push_back(static_cast(v)); + }; + + encode_varint(sorted_pids.size()); + for (auto pid : sorted_pids) { + encode_varint(pid); + } + return value; +} + +} // namespace dftracer::utils::utilities::indexer::internal::encoding diff --git a/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.cpp b/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.cpp index 157553ed..3620c580 100644 --- a/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.cpp +++ b/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.cpp @@ -1,8 +1,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -18,7 +18,7 @@ namespace dftracer::utils::utilities::indexer::internal::tar { using dftracer::utils::utilities::indexer::IndexDatabase; -namespace rocks = dftracer::utils::rocksdb; +using dftracer::utils::utilities::indexer::IndexDatabaseWriterContext; namespace { @@ -37,8 +37,8 @@ std::string normalize_idx_path(const std::string& path) { } dftracer::utils::coro::CoroTask build_tar_index( - IndexDatabase& db, int file_id, const std::string& tar_gz_path, - std::uint64_t ckpt_size) { + IndexDatabaseWriterContext& writer, int file_id, + const std::string& tar_gz_path, std::uint64_t ckpt_size) { int fd = ::open(tar_gz_path.c_str(), O_RDONLY); if (fd < 0) { co_return false; @@ -73,8 +73,8 @@ dftracer::utils::coro::CoroTask build_tar_index( break; } - accumulated_data.insert(accumulated_data.end(), inflater.out_buffer, - inflater.out_buffer + result.bytes_read); + 
accumulated_data.insert(accumulated_data.end(), inflater.out_buffer(), + inflater.out_buffer() + result.bytes_read); current_uc_offset += result.bytes_read; total_lines += result.lines_found; } @@ -87,38 +87,29 @@ dftracer::utils::coro::CoroTask build_tar_index( total_uc_size = current_uc_offset; - auto* db_ptr = &db; - auto* tar_entries_ptr = &tar_entries; const std::string archive_name = fs::path(tar_gz_path).filename().string(); - const auto* archive_name_ptr = &archive_name; - co_await rocks::run([db_ptr, file_id, ckpt_size, total_lines, total_uc_size, - tar_entries_ptr, archive_name_ptr] { - internal::TransactionScope txn(*db_ptr); - std::uint64_t regular_files = 0; - for (const auto& entry : *tar_entries_ptr) { - if (!entry.is_regular_file()) { - continue; - } - - ++regular_files; - db_ptr->insert_tar_file( - file_id, IndexDatabase::TarFileRecord{ - .file_name = entry.name, - .file_size = entry.size, - .file_mtime = entry.mtime, - .typeflag = entry.typeflag, - .data_offset = entry.data_offset, - .uncompressed_offset = entry.uncompressed_offset, - }); + std::uint64_t regular_files = 0; + for (const auto& entry : tar_entries) { + if (!entry.is_regular_file()) { + continue; } - db_ptr->insert_file_metadata(file_id, ckpt_size, total_lines, - total_uc_size); - db_ptr->insert_tar_archive_metadata(file_id, *archive_name_ptr, - ckpt_size, total_lines, - total_uc_size, regular_files); - txn.commit(); - }); + ++regular_files; + writer.insert_tar_file( + file_id, IndexDatabaseWriterContext::TarFileRecord{ + .file_name = entry.name, + .file_size = entry.size, + .file_mtime = entry.mtime, + .typeflag = entry.typeflag, + .data_offset = entry.data_offset, + .uncompressed_offset = entry.uncompressed_offset, + }); + } + + writer.insert_file_metadata(file_id, ckpt_size, total_lines, total_uc_size); + writer.insert_tar_archive_metadata(file_id, archive_name, ckpt_size, + total_lines, total_uc_size, + regular_files); ::close(fd); co_return true; @@ -198,18 +189,18 @@ 
dftracer::utils::coro::CoroTask TarIndexer::build_async() const { } IndexDatabase db(index_path); + auto writer = db.begin_write(); const auto hash = calculate_file_hash(tar_gz_path); const std::string logical = tar_gz_path_logical_path; - const auto* logical_ptr = &logical; - const int file_id = co_await rocks::run([db_ptr = &db, logical_ptr, hash] { - return db_ptr->get_or_create_file_info(*logical_ptr, hash); - }); + const int file_id = writer->get_or_create_file_info(logical, hash); - if (!(co_await build_tar_index(db, file_id, tar_gz_path, ckpt_size))) { + if (!(co_await build_tar_index(*writer, file_id, tar_gz_path, ckpt_size))) { throw IndexerError(IndexerError::Type::BUILD_ERROR, "Failed to build TAR index for " + tar_gz_path); } + writer->commit(); + struct CacheSnapshot { std::uint64_t checkpoint_size = 0; std::uint64_t num_lines = 0; @@ -220,22 +211,18 @@ dftracer::utils::coro::CoroTask TarIndexer::build_async() const { }; const std::string fallback_archive_name = fs::path(tar_gz_path).filename().string(); - const auto* fallback_archive_name_ptr = &fallback_archive_name; - auto snapshot = - co_await rocks::run([db_ptr = &db, file_id, fallback_archive_name_ptr] { - CacheSnapshot cache; - cache.checkpoint_size = db_ptr->get_checkpoint_size(file_id); - cache.num_lines = db_ptr->get_num_lines(file_id); - cache.max_bytes = db_ptr->get_max_bytes(file_id); - if (auto metadata = db_ptr->query_tar_archive_metadata(file_id)) { - cache.num_files = metadata->total_files; - cache.archive_name = metadata->archive_name; - } else { - cache.archive_name = *fallback_archive_name_ptr; - } - cache.checkpoints = db_ptr->query_checkpoints(file_id); - return cache; - }); + + CacheSnapshot snapshot; + snapshot.checkpoint_size = db.get_checkpoint_size(file_id); + snapshot.num_lines = db.get_num_lines(file_id); + snapshot.max_bytes = db.get_max_bytes(file_id); + if (auto metadata = db.query_tar_archive_metadata(file_id)) { + snapshot.num_files = metadata->total_files; + 
snapshot.archive_name = metadata->archive_name; + } else { + snapshot.archive_name = fallback_archive_name; + } + snapshot.checkpoints = db.query_checkpoints(file_id); std::lock_guard lock(cache_mutex); cached_is_valid = true; @@ -508,7 +495,7 @@ bool TarIndexer::find_file(const std::string& file_name, IndexDatabase db( index_path, dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); - IndexDatabase::TarFileRecord record; + TarFileRecord record; if (!db.find_tar_file(archive_id, file_name, record)) { return false; } diff --git a/src/dftracer/utils/utilities/indexer/internal/transaction_scope.h b/src/dftracer/utils/utilities/indexer/internal/transaction_scope.h index b23a23e1..a3e56fdd 100644 --- a/src/dftracer/utils/utilities/indexer/internal/transaction_scope.h +++ b/src/dftracer/utils/utilities/indexer/internal/transaction_scope.h @@ -1,39 +1,10 @@ #ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_TRANSACTION_SCOPE_H #define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_TRANSACTION_SCOPE_H -namespace dftracer::utils::utilities::indexer::internal { - -template -class TransactionScope { - public: - explicit TransactionScope(Database& db) : db_(db) { - db_.begin_transaction(); - } - - TransactionScope(const TransactionScope&) = delete; - TransactionScope& operator=(const TransactionScope&) = delete; - - TransactionScope(TransactionScope&& other) noexcept - : db_(other.db_), committed_(other.committed_) { - other.committed_ = true; - } - - ~TransactionScope() { - if (!committed_) { - db_.rollback_transaction(); - } - } - - void commit() { - db_.commit_transaction(); - committed_ = true; - } - - private: - Database& db_; - bool committed_ = false; -}; - -} // namespace dftracer::utils::utilities::indexer::internal +// TransactionScope has been removed. +// IndexDatabase no longer has begin_transaction/commit_transaction. 
+// Use the IndexBatchSink API on IndexDatabaseWriterContext (obtained via +// IndexDatabase::begin_write()) or individual insert methods followed by +// commit(). #endif // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_TRANSACTION_SCOPE_H diff --git a/src/dftracer/utils/utilities/indexer/provenance_database.cpp b/src/dftracer/utils/utilities/indexer/provenance_database.cpp index 4896a54e..947cb0a0 100644 --- a/src/dftracer/utils/utilities/indexer/provenance_database.cpp +++ b/src/dftracer/utils/utilities/indexer/provenance_database.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -11,8 +12,9 @@ namespace dftracer::utils::utilities::indexer { namespace rocks = dftracer::utils::rocksdb; +namespace cf = rocks::cf; -using internal::IndexerError; +using namespace internal; namespace { @@ -82,60 +84,23 @@ std::string group_key(int file_info_id, std::string_view name) { return key; } -std::string segment_key(int file_info_id, int source_idx, - int source_checkpoint) { +std::string segment_key(int file_info_id, int source_idx, int source_checkpoint, + int segment_seq) { std::string key("px|"); rocks::KeyCodec::append_be32(key, static_cast(file_info_id)); rocks::KeyCodec::append_be32(key, static_cast(source_idx)); rocks::KeyCodec::append_be32(key, static_cast(source_checkpoint)); + rocks::KeyCodec::append_be32(key, static_cast(segment_seq)); return key; } -void append_string(std::string& out, std::string_view value) { - rocks::KeyCodec::append_be32(out, static_cast(value.size())); - out.append(value.data(), value.size()); -} - -void append_u32(std::string& out, std::uint32_t value) { - rocks::KeyCodec::append_be32(out, value); -} - -class Cursor { - public: - explicit Cursor(std::string_view data) : data_(data) {} - - std::uint32_t u32() { - auto part = take(4); - return rocks::KeyCodec::decode_be32(part); - } - - std::string str() { - const auto len = static_cast(u32()); - auto bytes = take(len); - return std::string(bytes.data(), bytes.size()); - 
} - - private: - std::string_view take(std::size_t len) { - if (offset_ + len > data_.size()) { - throw std::runtime_error("Corrupt provenance payload"); - } - auto part = data_.substr(offset_, len); - offset_ += len; - return part; - } - - std::string_view data_; - std::size_t offset_ = 0; -}; - template void scan_prefix(const rocks::RocksDatabase& db, std::string_view prefix, Fn&& fn) { internal::scan_prefix_iterator( "Failed to scan provenance prefix", prefix, - [&] { return db.new_iterator("provenance"); }, std::forward(fn)); + [&] { return db.new_iterator(cf::PROVENANCE); }, std::forward(fn)); } } // namespace @@ -156,22 +121,23 @@ int ProvenanceDatabase::get_or_create_file_info(const std::string& path, std::uint64_t file_hash) { const auto key = file_key(path); std::string value; - auto status = db_->get(key, &value, "provenance"); + auto status = db_->get(key, &value, cf::PROVENANCE); if (status.ok()) { const auto id = decode_file_id(value); if (decode_hash(value) == file_hash) { return id; } const auto encoded = encode_file_record(id, file_hash); - status = txn_batch_ ? db_->put(*txn_batch_, "provenance", key, encoded) - : db_->put(key, encoded, "provenance"); + status = txn_batch_ + ? db_->put(*txn_batch_, cf::PROVENANCE, key, encoded) + : db_->put(key, encoded, cf::PROVENANCE); if (!status.ok()) { throw_db_error("Failed to update provenance file info", status); } status = txn_batch_ - ? db_->put(*txn_batch_, "provenance", file_reverse_key(id), - path) - : db_->put(file_reverse_key(id), path, "provenance"); + ? 
db_->put(*txn_batch_, cf::PROVENANCE, + file_reverse_key(id), path) + : db_->put(file_reverse_key(id), path, cf::PROVENANCE); if (!status.ok()) { throw_db_error("Failed to update provenance reverse file info", status); @@ -184,7 +150,7 @@ int ProvenanceDatabase::get_or_create_file_info(const std::string& path, std::uint32_t next_id = 1; std::string next_value; - status = db_->get(next_file_id_key(), &next_value, "provenance"); + status = db_->get(next_file_id_key(), &next_value, cf::PROVENANCE); if (status.ok()) { next_id = rocks::KeyCodec::decode_be32(next_value); } else if (!status.IsNotFound()) { @@ -195,26 +161,26 @@ int ProvenanceDatabase::get_or_create_file_info(const std::string& path, encode_file_record(static_cast(next_id), file_hash); const auto next_encoded = rocks::KeyCodec::encode_be32(next_id + 1); if (txn_batch_) { - status = db_->put(*txn_batch_, "provenance", key, encoded); + status = db_->put(*txn_batch_, cf::PROVENANCE, key, encoded); if (!status.ok()) throw_db_error("Failed to insert file info", status); - status = db_->put(*txn_batch_, "provenance", file_reverse_key(next_id), - path); + status = db_->put(*txn_batch_, cf::PROVENANCE, + file_reverse_key(next_id), path); if (!status.ok()) { throw_db_error("Failed to insert reverse file info", status); } - status = db_->put(*txn_batch_, "provenance", next_file_id_key(), + status = db_->put(*txn_batch_, cf::PROVENANCE, next_file_id_key(), next_encoded); if (!status.ok()) { throw_db_error("Failed to update next provenance file id", status); } } else { - status = db_->put(key, encoded, "provenance"); + status = db_->put(key, encoded, cf::PROVENANCE); if (!status.ok()) throw_db_error("Failed to insert file info", status); - status = db_->put(file_reverse_key(next_id), path, "provenance"); + status = db_->put(file_reverse_key(next_id), path, cf::PROVENANCE); if (!status.ok()) { throw_db_error("Failed to insert reverse file info", status); } - status = db_->put(next_file_id_key(), next_encoded, 
"provenance"); + status = db_->put(next_file_id_key(), next_encoded, cf::PROVENANCE); if (!status.ok()) { throw_db_error("Failed to update next provenance file id", status); } @@ -224,7 +190,7 @@ int ProvenanceDatabase::get_or_create_file_info(const std::string& path, int ProvenanceDatabase::get_file_info_id(const std::string& path) const { std::string value; - auto status = db_->get(file_key(path), &value, "provenance"); + auto status = db_->get(file_key(path), &value, cf::PROVENANCE); if (status.IsNotFound()) { return -1; } @@ -263,8 +229,8 @@ void ProvenanceDatabase::insert_info(int file_info_id, std::string_view key, std::string_view value) { const auto db_key = info_key(file_info_id, key); auto status = txn_batch_ - ? db_->put(*txn_batch_, "provenance", db_key, value) - : db_->put(db_key, value, "provenance"); + ? db_->put(*txn_batch_, cf::PROVENANCE, db_key, value) + : db_->put(db_key, value, cf::PROVENANCE); if (!status.ok()) { throw_db_error("Failed to insert provenance info", status); } @@ -279,10 +245,10 @@ void ProvenanceDatabase::insert_source(int file_info_id, int source_idx, append_u32(value, static_cast(num_checkpoints)); append_string(value, event_hash); auto status = txn_batch_ - ? db_->put(*txn_batch_, "provenance", + ? db_->put(*txn_batch_, cf::PROVENANCE, source_key(file_info_id, source_idx), value) : db_->put(source_key(file_info_id, source_idx), value, - "provenance"); + cf::PROVENANCE); if (!status.ok()) { throw_db_error("Failed to insert provenance source", status); } @@ -291,30 +257,27 @@ void ProvenanceDatabase::insert_source(int file_info_id, int source_idx, void ProvenanceDatabase::insert_group(int file_info_id, std::string_view name, std::string_view predicate) { const auto db_key = group_key(file_info_id, name); - auto status = txn_batch_ - ? db_->put(*txn_batch_, "provenance", db_key, - std::string(predicate)) - : db_->put(db_key, std::string(predicate), "provenance"); + auto status = + txn_batch_ ? 
db_->put(*txn_batch_, cf::PROVENANCE, db_key, + std::string(predicate)) + : db_->put(db_key, std::string(predicate), cf::PROVENANCE); if (!status.ok()) { throw_db_error("Failed to insert provenance group", status); } } void ProvenanceDatabase::insert_segment(int file_info_id, int source_idx, - int source_checkpoint, + int source_checkpoint, int segment_seq, int output_line_start, int output_line_end, int event_count) { std::string value; append_u32(value, static_cast(output_line_start)); append_u32(value, static_cast(output_line_end)); append_u32(value, static_cast(event_count)); - auto status = - txn_batch_ - ? db_->put(*txn_batch_, "provenance", - segment_key(file_info_id, source_idx, source_checkpoint), - value) - : db_->put(segment_key(file_info_id, source_idx, source_checkpoint), - value, "provenance"); + auto key = + segment_key(file_info_id, source_idx, source_checkpoint, segment_seq); + auto status = txn_batch_ ? db_->put(*txn_batch_, cf::PROVENANCE, key, value) + : db_->put(key, value, cf::PROVENANCE); if (!status.ok()) { throw_db_error("Failed to insert provenance segment", status); } @@ -391,7 +354,7 @@ ProvenanceDatabase::query_all_segments(int file_info_id) const { std::string ProvenanceDatabase::query_info(int file_info_id, std::string_view key) const { std::string value; - auto status = db_->get(info_key(file_info_id, key), &value, "provenance"); + auto status = db_->get(info_key(file_info_id, key), &value, cf::PROVENANCE); if (status.IsNotFound()) { return {}; } diff --git a/src/dftracer/utils/utilities/indexer/visitors/bloom_visitor.cpp b/src/dftracer/utils/utilities/indexer/visitors/bloom_visitor.cpp deleted file mode 100644 index 931d8188..00000000 --- a/src/dftracer/utils/utilities/indexer/visitors/bloom_visitor.cpp +++ /dev/null @@ -1,240 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -using dftracer::utils::utilities::common::json::JsonValue; -using 
dftracer::utils::utilities::composites::dft::DFTracerEvent; -using dftracer::utils::utilities::composites::dft::indexing::BloomFilter; -namespace dftracer::utils::utilities::indexer { - -namespace { - -static const std::string DIM_NAME = "name"; -static const std::string DIM_CAT = "cat"; -static const std::string DIM_PID = "pid"; -static const std::string DIM_TID = "tid"; -static const std::string DIM_PID_TID = "pid_tid"; -static const std::string DIM_HHASH = "hhash"; -static const std::string DIM_FHASH = "fhash"; -static const std::string DIM_SHASH = "shash"; - -std::string json_value_to_string(const JsonValue& val) { - if (val.is_string()) return val.get(); - if (val.is_uint()) return std::to_string(val.get()); - if (val.is_int()) return std::to_string(val.get()); - if (val.is_number()) return std::to_string(val.get()); - if (val.is_bool()) return val.get() ? "true" : "false"; - return {}; -} - -} // namespace - -BloomVisitor::BloomVisitor(ChunkIndexerConfig config, - std::vector dimensions) - : config_(std::move(config)), dimensions_(std::move(dimensions)) {} - -void BloomVisitor::begin(std::size_t /*num_checkpoints*/) { chunks_.clear(); } - -void BloomVisitor::on_checkpoint(std::size_t /*checkpoint_idx*/) {} - -void BloomVisitor::ensure_chunk(std::size_t checkpoint_idx) { - if (checkpoint_idx < chunks_.size()) return; - auto old_size = chunks_.size(); - chunks_.resize(checkpoint_idx + 1); - for (std::size_t i = old_size; i < chunks_.size(); ++i) { - auto& chunk = chunks_[i]; - for (const auto& dim : dimensions_) { - chunk.bloom_filters.emplace( - dim, BloomFilter(config_.expected_entries_per_chunk, - config_.false_positive_rate)); - } - for (const auto& dim : dimensions_) { - auto& ds = chunk.dimension_stats[dim]; - ds.dimension = dim; - if (dim == DIM_PID || dim == DIM_TID) { - ds.value_type = "uint"; - } else { - ds.value_type = "string"; - } - } - auto& pt = chunk.dimension_stats[DIM_PID_TID]; - pt.dimension = DIM_PID_TID; - pt.value_type = "string"; - } -} 
- -void BloomVisitor::on_line(std::string_view line, std::size_t checkpoint_idx) { - if (line.empty()) return; - ensure_chunk(checkpoint_idx); - - ChunkState& chunk = chunks_[checkpoint_idx]; - - if (!yy_alc_initialized_) { - yyjson_alc_pool_init(&yy_alc_, yy_buf_.data(), yy_buf_.size()); - yy_alc_initialized_ = true; - } - - yyjson_doc* doc = - yyjson_read_opts(const_cast(line.data()), line.size(), - YYJSON_READ_NOFLAG, &yy_alc_, nullptr); - if (!doc) return; - - yyjson_val* root = yyjson_doc_get_root(doc); - if (!root || !yyjson_is_obj(root)) { - yyjson_doc_free(doc); - return; - } - - JsonValue json(root); - DFTracerEvent ev; - if (!DFTracerEvent::parse(json, ev)) { - yyjson_doc_free(doc); - return; - } - - if (ev.is_metadata()) { - if (ev.args.exists()) { - std::string hash_val = ev.args["value"].get(); - std::string resolved = ev.args["name"].get(); - - if (!hash_val.empty() && !resolved.empty()) { - if (ev.name == "HH") { - chunk.hash_resolutions[DIM_HHASH][hash_val] = resolved; - } else if (ev.name == "FH") { - chunk.hash_resolutions[DIM_FHASH][hash_val] = resolved; - } else if (ev.name == "SH") { - chunk.hash_resolutions[DIM_SHASH][hash_val] = resolved; - } - } - } - } else { - chunk.statistics.update_from_event(ev.name, ev.cat, ev.pid, ev.tid, - ev.ts, ev.dur); - - // Helper: add to bloom filter and observe dimension stats - auto observe = [&chunk](const std::string& dim, std::string_view val) { - if (val.empty()) return; - auto bf_it = chunk.bloom_filters.find(dim); - if (bf_it != chunk.bloom_filters.end()) { - bf_it->second.add(val); - } - auto ds_it = chunk.dimension_stats.find(dim); - if (ds_it != chunk.dimension_stats.end()) { - ds_it->second.observe(val); - } - }; - - observe(DIM_NAME, ev.name); - observe(DIM_CAT, ev.cat); - - char pid_buf[24], tid_buf[24], pt_buf[52]; - auto [pp, _1] = - std::to_chars(pid_buf, pid_buf + sizeof(pid_buf), ev.pid); - std::string_view pid_sv(pid_buf, pp - pid_buf); - auto [tp, _2] = - std::to_chars(tid_buf, tid_buf + 
sizeof(tid_buf), ev.tid); - std::string_view tid_sv(tid_buf, tp - tid_buf); - - observe(DIM_PID, pid_sv); - observe(DIM_TID, tid_sv); - - auto len = pp - pid_buf; - std::memcpy(pt_buf, pid_buf, len); - pt_buf[len] = ':'; - std::memcpy(pt_buf + len + 1, tid_buf, tp - tid_buf); - std::string_view pt_sv(pt_buf, len + 1 + (tp - tid_buf)); - observe(DIM_PID_TID, pt_sv); - - if (ev.args.exists()) { - std::string_view hhash = ev.args["hhash"].get(); - observe(DIM_HHASH, hhash); - - std::string_view fhash = ev.args["fhash"].get(); - observe(DIM_FHASH, fhash); - - std::string_view shash = - ev.args["cmd_hash"].get(); - if (shash.empty()) { - shash = ev.args["exec_hash"].get(); - } - observe(DIM_SHASH, shash); - - for (const auto& dim : config_.extra_dimensions) { - JsonValue val = ev.args.at(dim.c_str()); - if (val.exists()) { - std::string str_val = json_value_to_string(val); - observe(dim, str_val); - } - } - } - - chunk.events_processed++; - } - - yyjson_doc_free(doc); -} - -void BloomVisitor::finalize(IndexDatabase& db, int file_id) { - std::unordered_map file_blooms; - for (const auto& dim : dimensions_) { - file_blooms.emplace(dim, BloomFilter(config_.expected_entries_per_chunk, - config_.false_positive_rate)); - } - - std::vector blob; - - for (std::size_t i = 0; i < chunks_.size(); ++i) { - ChunkState& chunk = chunks_[i]; - auto checkpoint_idx = static_cast(i); - - for (const auto& dim : dimensions_) { - auto it = chunk.bloom_filters.find(dim); - if (it == chunk.bloom_filters.end()) continue; - - const BloomFilter& bf = it->second; - bf.serialize_into(blob); - db.insert_chunk_bloom_filter( - file_id, checkpoint_idx, dim, - std::span(blob.data(), blob.size()), - static_cast(bf.num_entries())); - - file_blooms.at(dim).merge_from(bf); - } - - db.insert_chunk_statistics(file_id, checkpoint_idx, chunk.statistics); - - for (const auto& [dim, ds] : chunk.dimension_stats) { - db.insert_chunk_dimension_stats(file_id, checkpoint_idx, ds, - config_.value_counts_cap); - } - - 
for (const auto& [dim, resolutions] : chunk.hash_resolutions) { - for (const auto& [hash_val, resolved] : resolutions) { - db.insert_hash_resolution(file_id, dim, hash_val, resolved); - } - } - } - - for (const auto& dim : dimensions_) { - const BloomFilter& bf = file_blooms.at(dim); - bf.serialize_into(blob); - db.insert_file_bloom_filter( - file_id, dim, - std::span(blob.data(), blob.size()), - static_cast(bf.num_entries())); - } - - for (const auto& dim : dimensions_) { - db.insert_index_dimension(file_id, dim); - } -} - -} // namespace dftracer::utils::utilities::indexer diff --git a/src/dftracer/utils/utilities/indexer/visitors/manifest_visitor.cpp b/src/dftracer/utils/utilities/indexer/visitors/manifest_visitor.cpp deleted file mode 100644 index ec388b51..00000000 --- a/src/dftracer/utils/utilities/indexer/visitors/manifest_visitor.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#include -#include -#include -#include -#include - -using dftracer::utils::utilities::common::json::JsonValue; -namespace queries = - dftracer::utils::utilities::composites::dft::indexing::queries; - -namespace dftracer::utils::utilities::indexer { - -void ManifestVisitor::begin(std::size_t /*num_checkpoints*/) { - event_lines_.clear(); - metadata_lines_.clear(); - chunk_line_ = 0; -} - -void ManifestVisitor::on_checkpoint(std::size_t /*checkpoint_idx*/) { - chunk_line_ = 0; -} - -void ManifestVisitor::ensure_chunk(std::size_t checkpoint_idx) { - if (checkpoint_idx < event_lines_.size()) return; - event_lines_.resize(checkpoint_idx + 1); - metadata_lines_.resize(checkpoint_idx + 1); -} - -void ManifestVisitor::on_line(std::string_view line, - std::size_t checkpoint_idx) { - std::uint32_t ln = chunk_line_++; - - if (line.empty()) return; - ensure_chunk(checkpoint_idx); - - yyjson_doc* doc = yyjson_read(line.data(), line.size(), YYJSON_READ_NOFLAG); - if (!doc) return; - - yyjson_val* root = yyjson_doc_get_root(doc); - if (root && yyjson_is_obj(root)) { - JsonValue json(root); - std::string_view ph 
= json["ph"].get(); - - if (ph == "M") { - std::string name = json["name"].get(); - if (!name.empty()) { - metadata_lines_[checkpoint_idx][name].push_back(ln); - } - } else { - std::string cat = json["cat"].get(); - std::string name = json["name"].get(); - event_lines_[checkpoint_idx][{cat, name}].push_back(ln); - } - } - - yyjson_doc_free(doc); -} - -void ManifestVisitor::finalize(IndexDatabase& db, int file_id) { - for (std::size_t ci = 0; ci < event_lines_.size(); ++ci) { - for (auto& [key, lines] : event_lines_[ci]) { - db.insert_event_range(file_id, static_cast(ci), - key.first, key.second, lines); - } - - for (auto& [meta_type, lines] : metadata_lines_[ci]) { - db.insert_metadata_lines(file_id, static_cast(ci), - meta_type, lines); - } - } -} - -} // namespace dftracer::utils::utilities::indexer diff --git a/src/dftracer/utils/utilities/reader/internal/gzip_reader.cpp b/src/dftracer/utils/utilities/reader/internal/gzip_reader.cpp index 86449551..d60aecd8 100644 --- a/src/dftracer/utils/utilities/reader/internal/gzip_reader.cpp +++ b/src/dftracer/utils/utilities/reader/internal/gzip_reader.cpp @@ -368,6 +368,7 @@ std::unique_ptr GzipReader::stream(const StreamConfig &config) { std::size_t start = config.start(); std::size_t end = config.end(); std::size_t buffer_size = config.buffer_size(); + bool extend_to_line_boundary = config.extend_to_line_boundary(); // Convert line range to byte range if needed std::size_t start_bytes = start; @@ -472,6 +473,8 @@ std::unique_ptr GzipReader::stream(const StreamConfig &config) { // Single line-aligned bytes at a time auto line_byte_stream = std::make_unique(buffer_size); + line_byte_stream->set_extend_to_line_boundary( + extend_to_line_boundary); line_byte_stream->initialize(gz_path, start_bytes, end_bytes, *indexer); @@ -488,6 +491,8 @@ std::unique_ptr GzipReader::stream(const StreamConfig &config) { // Multiple line-aligned bytes per read auto line_byte_stream = std::make_unique(buffer_size); + 
line_byte_stream->set_extend_to_line_boundary( + extend_to_line_boundary); line_byte_stream->initialize(gz_path, start_bytes, end_bytes, *indexer); return line_byte_stream; @@ -496,6 +501,8 @@ std::unique_ptr GzipReader::stream(const StreamConfig &config) { // Single parsed line per read auto line_byte_stream = std::make_unique(buffer_size); + line_byte_stream->set_extend_to_line_boundary( + extend_to_line_boundary); line_byte_stream->initialize(gz_path, start_bytes, end_bytes, *indexer); @@ -511,6 +518,8 @@ std::unique_ptr GzipReader::stream(const StreamConfig &config) { // Multiple parsed lines per read auto line_byte_stream = std::make_unique(buffer_size); + line_byte_stream->set_extend_to_line_boundary( + extend_to_line_boundary); line_byte_stream->initialize(gz_path, start_bytes, end_bytes, *indexer); diff --git a/src/dftracer/utils/utilities/reader/internal/inflater.h b/src/dftracer/utils/utilities/reader/internal/inflater.h index 997fa1a6..94d08ca3 100644 --- a/src/dftracer/utils/utilities/reader/internal/inflater.h +++ b/src/dftracer/utils/utilities/reader/internal/inflater.h @@ -232,7 +232,7 @@ class ReaderInflater : public Inflater { * Check if the stream has reached the end */ bool is_at_end() const { - return stream.avail_in == 0 && stream.avail_out == sizeof(out_buffer); + return stream.avail_in == 0 && stream.avail_out == BUFFER_SIZE; } }; diff --git a/src/dftracer/utils/utilities/reader/internal/streams/gzip_line_byte_stream.h b/src/dftracer/utils/utilities/reader/internal/streams/gzip_line_byte_stream.h index 63ab0047..92f7df0f 100644 --- a/src/dftracer/utils/utilities/reader/internal/streams/gzip_line_byte_stream.h +++ b/src/dftracer/utils/utilities/reader/internal/streams/gzip_line_byte_stream.h @@ -22,6 +22,7 @@ class GzipLineByteStream : public GzipStream { std::vector partial_line_buffer_; std::size_t actual_start_bytes_; std::size_t bytes_returned_; // Track how many bytes we've returned to user + bool extend_to_line_boundary_ = false; // 
Buffer for zero-copy reads std::vector buffer_; @@ -39,6 +40,8 @@ class GzipLineByteStream : public GzipStream { partial_line_buffer_.reserve(1 * 1024 * 1024); } + void set_extend_to_line_boundary(bool v) { extend_to_line_boundary_ = v; } + void initialize(const std::string &gz_path, std::size_t start_bytes, std::size_t end_bytes, dftracer::utils::utilities::indexer::internal::Indexer @@ -145,10 +148,42 @@ class GzipLineByteStream : public GzipStream { } if (is_at_target_end()) { - DFTRACER_UTILS_LOG_DEBUG( - "GzipLineByteStream: at target end, current_position=%zu, " - "target_end_bytes=%zu", - current_position_, target_end_bytes_); + if (extend_to_line_boundary_ && !partial_line_buffer_.empty() && + current_position_ < max_file_bytes_) { + std::size_t partial_size = partial_line_buffer_.size(); + if (partial_size <= buffer_.size()) { + std::memcpy(buffer_.data(), partial_line_buffer_.data(), + partial_size); + std::size_t avail = buffer_.size() - partial_size; + std::size_t cap = static_cast( + max_file_bytes_ - current_position_); + std::size_t to_read = std::min(avail, cap); + std::size_t got = 0; + bool ok = co_await inflater_.read( + fd_, file_offset_, + reinterpret_cast(buffer_.data() + + partial_size), + to_read, got); + if (ok && got > 0) { + current_position_ += got; + std::size_t total = partial_size + got; + std::size_t emit = 0; + for (std::size_t i = partial_size; i < total; ++i) { + if (buffer_[i] == '\n') { + emit = i + 1; + break; + } + } + partial_line_buffer_.clear(); + is_finished_ = true; + if (emit > 0) { + bytes_returned_ += emit; + co_return emit; + } + co_return 0; + } + } + } is_finished_ = true; co_return 0; } diff --git a/src/dftracer/utils/utilities/reader/internal/streams/line_stream.h b/src/dftracer/utils/utilities/reader/internal/streams/line_stream.h index 3ddcdbc7..2b8612d6 100644 --- a/src/dftracer/utils/utilities/reader/internal/streams/line_stream.h +++ b/src/dftracer/utils/utilities/reader/internal/streams/line_stream.h @@ 
-12,14 +12,6 @@ namespace dftracer::utils::utilities::reader::internal { -/** - * @brief Stream that returns one single line at a time from a LINE_BYTES - * stream. - * - * Wraps a LINE_BYTES stream and provides single-line reading. - * Each call to read() returns exactly one complete line (with newline). - * Can optionally filter by line range when line numbers are specified. - */ class LineStream : public ReaderStream { private: std::unique_ptr underlying_stream_; @@ -35,6 +27,14 @@ class LineStream : public ReaderStream { std::size_t output_position_; std::size_t span_pos_; + enum class ParseResult { HAS_LINE, NEED_MORE_DATA, FINISHED }; + + struct DirectOutputResult { + std::size_t bytes_written; + bool need_more_data; + bool finished; + }; + public: explicit LineStream(std::unique_ptr underlying_stream, std::size_t start_line = 0, std::size_t end_line = 0, @@ -56,22 +56,37 @@ class LineStream : public ReaderStream { ~LineStream() override { reset(); } coro::CoroTask> read_async() override { - if (!underlying_stream_) { + if (!underlying_stream_ || is_finished_) { co_return {}; } - if (is_finished_) { - co_return {}; - } + while (true) { + auto result = try_parse_next_line(); - // Parse next line into current_line_ - if (!co_await parse_next_line()) { - co_return {}; - } + if (result == ParseResult::HAS_LINE) { + co_return std::span(current_line_.data(), + current_line_.size()); + } + if (result == ParseResult::FINISHED) { + co_return {}; + } - // Return view to current_line_ - co_return std::span(current_line_.data(), - current_line_.size()); + if (underlying_stream_->done()) { + if (handle_eof_line()) { + co_return std::span(current_line_.data(), + current_line_.size()); + } + is_finished_ = true; + co_return {}; + } + + current_span_ = co_await underlying_stream_->read_async(); + span_pos_ = 0; + if (current_span_.empty()) { + is_finished_ = true; + co_return {}; + } + } } coro::CoroTask read_async(char* buffer, @@ -80,7 +95,6 @@ class LineStream : public 
ReaderStream { co_return 0; } - // Handle any pending line from previous call if (has_pending_line_) { co_return output_pending_line(buffer, buffer_size); } @@ -89,19 +103,40 @@ class LineStream : public ReaderStream { co_return 0; } - // Try fast path: direct output from read_buffer_ to output buffer - std::size_t written = co_await try_direct_output(buffer, buffer_size); + while (true) { + auto direct = try_direct_output(buffer, buffer_size); + if (direct.bytes_written > 0) { + co_return direct.bytes_written; + } + if (direct.finished) { + co_return 0; + } - if (written > 0) { - co_return written; - } + if (!direct.need_more_data) { + auto result = try_parse_next_line(); + if (result == ParseResult::HAS_LINE) { + co_return output_pending_line(buffer, buffer_size); + } + if (result == ParseResult::FINISHED) { + co_return 0; + } + } - // Slow path: need to use intermediate storage - if (!co_await parse_next_line()) { - co_return 0; - } + if (underlying_stream_->done()) { + if (handle_eof_line()) { + co_return output_pending_line(buffer, buffer_size); + } + is_finished_ = true; + co_return 0; + } - co_return output_pending_line(buffer, buffer_size); + current_span_ = co_await underlying_stream_->read_async(); + span_pos_ = 0; + if (current_span_.empty()) { + is_finished_ = true; + co_return 0; + } + } } bool done() const override { return is_finished_ && !has_pending_line_; } @@ -121,10 +156,6 @@ class LineStream : public ReaderStream { } private: - // ======================================================================== - // Range Checking Helpers - // ======================================================================== - bool is_beyond_range() const { return end_line_ > 0 && current_line_number_ > end_line_; } @@ -133,32 +164,13 @@ class LineStream : public ReaderStream { if (start_line_ == 0 && end_line_ == 0) { return true; } - bool after_start = (start_line_ == 0 || current_line_number_ >= start_line_); bool before_end = (end_line_ == 0 || 
current_line_number_ <= end_line_); - return after_start && before_end; } - // ======================================================================== - // Buffer Management - // ======================================================================== - - coro::CoroTask refill_span_if_needed() { - if (span_pos_ < current_span_.size()) { - co_return true; - } - - if (underlying_stream_->done()) { - co_return false; - } - - // Get new span from underlying stream (zero-copy) - current_span_ = co_await underlying_stream_->read_async(); - span_pos_ = 0; - co_return !current_span_.empty(); - } + bool has_data_in_span() const { return span_pos_ < current_span_.size(); } const char* find_next_newline() const { return static_cast( @@ -172,80 +184,91 @@ class LineStream : public ReaderStream { span_pos_ = current_span_.size(); } - // ======================================================================== - // Fast Path: Direct Output (No Intermediate Storage) - // ======================================================================== - - /** - * @brief Attempt to write a line directly from read_buffer_ to output - * buffer. - * - * This fast path avoids intermediate string copies when: - * - No accumulated data exists - * - A complete line fits in the output buffer - * - * Uses a loop to skip filtered lines efficiently. 
- * - * @return Number of bytes written, or 0 if fast path unavailable - */ - coro::CoroTask try_direct_output(char* buffer, - std::size_t buffer_size) { - // Fast path requires no accumulated data + ParseResult try_parse_next_line() { + if (is_beyond_range()) { + is_finished_ = true; + return ParseResult::FINISHED; + } + + while (has_data_in_span()) { + const char* newline_ptr = find_next_newline(); + + if (!newline_ptr) { + accumulate_remaining_span(); + return ParseResult::NEED_MORE_DATA; + } + + std::size_t newline_pos = newline_ptr - current_span_.data(); + if (process_complete_line(newline_pos)) { + return ParseResult::HAS_LINE; + } + } + + return ParseResult::NEED_MORE_DATA; + } + + DirectOutputResult try_direct_output(char* buffer, + std::size_t buffer_size) { if (!line_accumulator_.empty()) { - co_return 0; + return {0, false, false}; } - // Loop to skip filtered lines efficiently while (true) { if (is_beyond_range()) { is_finished_ = true; - co_return 0; + return {0, false, true}; } - if (!co_await refill_span_if_needed()) { - co_return 0; + if (!has_data_in_span()) { + return {0, true, false}; } const char* newline_ptr = find_next_newline(); if (!newline_ptr) { - // No complete line available, must use slow path - co_return 0; + return {0, false, false}; } std::size_t newline_pos = newline_ptr - current_span_.data(); std::size_t line_length = newline_pos - span_pos_ + 1; - // Line must fit in output buffer for fast path if (line_length > buffer_size) { - co_return 0; + return {0, false, false}; } bool should_output = should_output_current_line(); if (is_beyond_range()) { is_finished_ = true; - co_return 0; + return {0, false, true}; } current_line_number_++; if (should_output) { - // Direct copy: span -> output buffer (zero-copy from underlying - // stream!) 
std::memcpy(buffer, current_span_.data() + span_pos_, line_length); span_pos_ = newline_pos + 1; - co_return line_length; + return {line_length, false, false}; } - // Line filtered out, skip and continue to next span_pos_ = newline_pos + 1; } } - // ======================================================================== - // Slow Path: Parse and Store Line - // ======================================================================== + bool handle_eof_line() { + if (line_accumulator_.empty()) { + return false; + } + current_line_ = std::move(line_accumulator_); + line_accumulator_.clear(); + if (should_output_current_line() && !is_beyond_range()) { + has_pending_line_ = true; + output_position_ = 0; + current_line_number_++; + return true; + } + return false; + } void finalize_line_with_accumulator(std::size_t line_length) { line_accumulator_.append(current_span_.data() + span_pos_, line_length); @@ -262,7 +285,6 @@ class LineStream : public ReaderStream { bool process_complete_line(std::size_t newline_pos) { std::size_t line_length = newline_pos - span_pos_; - // Build complete line with or without accumulated data if (!line_accumulator_.empty()) { finalize_line_with_accumulator(line_length); } else { @@ -286,60 +308,10 @@ class LineStream : public ReaderStream { return true; } - // Line filtered out, continue parsing current_line_.clear(); return false; } - coro::CoroTask parse_next_line() { - if (is_beyond_range()) { - is_finished_ = true; - co_return false; - } - - while (true) { - if (!co_await refill_span_if_needed()) { - break; - } - - // Process all complete lines in current span - while (span_pos_ < current_span_.size()) { - const char* newline_ptr = find_next_newline(); - - if (!newline_ptr) { - accumulate_remaining_span(); - break; - } - - std::size_t newline_pos = newline_ptr - current_span_.data(); - - if (process_complete_line(newline_pos)) { - co_return true; - } - } - } - - // Handle final line at EOF without trailing newline - if 
(underlying_stream_->done() && !line_accumulator_.empty()) { - current_line_ = std::move(line_accumulator_); - line_accumulator_.clear(); - - if (should_output_current_line() && !is_beyond_range()) { - has_pending_line_ = true; - output_position_ = 0; - current_line_number_++; - co_return true; - } - } - - is_finished_ = true; - co_return false; - } - - // ======================================================================== - // Output Helpers - // ======================================================================== - std::size_t output_pending_line(char* buffer, std::size_t buffer_size) { if (output_position_ >= current_line_.size()) { has_pending_line_ = false; diff --git a/src/dftracer/utils/utilities/reader/trace_reader.cpp b/src/dftracer/utils/utilities/reader/trace_reader.cpp index d3ddef20..858581bf 100644 --- a/src/dftracer/utils/utilities/reader/trace_reader.cpp +++ b/src/dftracer/utils/utilities/reader/trace_reader.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -7,6 +8,8 @@ #include #include #include +#include +#include #include #include #include @@ -14,11 +17,17 @@ #include #include #include -#include +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW +#include +#endif +#include #include #include #include +#include +#include namespace dftracer::utils::utilities::reader { @@ -31,17 +40,675 @@ using indexer::internal::IndexerFactory; namespace { +thread_local simdjson::dom::parser tl_parser; + bool line_matches_query(const Query& q, std::string_view content) { - yyjson_doc* doc = yyjson_read(content.data(), content.size(), 0); - if (!doc) return false; - yyjson_val* root = yyjson_doc_get_root(doc); - bool result = false; - if (root && yyjson_is_obj(root)) { - JsonValue json(root); - result = q.evaluate(json); + auto result = tl_parser.parse(content.data(), content.size()); + if (result.error()) return false; + auto root = result.value_unsafe(); + if (!root.is_object()) return false; + JsonValue json(root); + return 
q.evaluate(json); +} + +struct LineRange { + std::size_t start_line; + std::size_t end_line; +}; + +// Cheap byte-level pre-filter derived from a query AST. +// +// The filter holds a list of literal substrings that MUST appear (verbatim) in +// any line matching the query. Currently populated only for ASTs of the form +// "AND of field == literal"; the common shape of dftindex equality queries. +// For unsupported shapes (range ops, OR, NOT, IN/NOT IN, non-equality compares) +// `required` is left empty and `may_match` trivially returns true. +// +// Semantically false-positive-safe: any line we accept still gets re-checked +// against the real query downstream. Lines we reject are guaranteed not to +// match because the literal representation of the comparison is missing. +struct LinePrefilter { + std::vector required; + + bool empty() const { return required.empty(); } + + bool may_match(std::string_view bytes) const { + for (const auto& lit : required) { + if (::memmem(bytes.data(), bytes.size(), lit.data(), lit.size()) == + nullptr) + return false; + } + return true; + } +}; + +bool collect_and_eq_literals(const common::query::QueryNode& node, + std::vector& out) { + return std::visit( + [&out](const auto& n) -> bool { + using T = std::decay_t; + if constexpr (std::is_same_v) { + if (n.op != common::query::CompareOp::EQ) return false; + std::string lit; + lit.reserve(n.field.path.size() + 16); + lit += '"'; + lit += n.field.path; + lit += "\":"; + const auto& val = n.value.value; + if (std::holds_alternative(val)) { + lit += '"'; + lit += std::get(val); + lit += '"'; + } else if (std::holds_alternative(val)) { + lit += std::to_string(std::get(val)); + } else if (std::holds_alternative(val)) { + lit += std::to_string(std::get(val)); + } else if (std::holds_alternative(val)) { + lit += std::get(val) ? 
"true" : "false"; + } else { + return false; // double or other: skip pre-filter + } + out.push_back(std::move(lit)); + return true; + } else if constexpr (std::is_same_v) { + return collect_and_eq_literals(*n.left, out) && + collect_and_eq_literals(*n.right, out); + } + return false; // OrNode, NotNode, InNode, NotInNode, CompareNode + // with non-EQ op: conservative skip + }, + node.data); +} + +// Strip a leading `[` and trailing `]` (plus surrounding whitespace) from a +// chunk buffer. These bookends appear in `.pfw.gz` files to keep them +// Perfetto-viewable as JSON arrays, but break simdjson iterate_many which +// expects whitespace-separated NDJSON. Safe to call on any chunk: if the +// bookends are absent the range is returned unchanged. +std::string_view strip_ndjson_bookends(std::string_view bytes) { + const char* s = bytes.data(); + const char* e = bytes.data() + bytes.size(); + auto is_ws = [](char c) { + return c == ' ' || c == '\t' || c == '\n' || c == '\r'; + }; + while (s < e && is_ws(*s)) ++s; + if (s < e && *s == '[') { + ++s; + while (s < e && is_ws(*s)) ++s; + } + while (e > s && is_ws(e[-1])) --e; + if (e > s && e[-1] == ']') { + --e; + while (e > s && is_ws(e[-1])) --e; + } + return std::string_view(s, static_cast(e - s)); +} + +// AND-of-EQ predicates with concrete typed literals can be evaluated +// directly against simdjson without going through ValueMap (which costs +// wyhash + per-field std::string allocation per row). Anything more +// complex (OR/NOT/IN/range) falls back to the generic visitor. +struct CompiledEqProbe { + std::string top_key; // "pid", "args", "name", etc. + std::string nested_key; // "" for top-level, else e.g. "fhash" + enum class Kind { String, Int64, UInt64, Double, Bool }; + Kind kind = Kind::String; + std::string s_val; + std::int64_t i64_val = 0; + std::uint64_t u64_val = 0; + double d_val = 0.0; + bool b_val = false; +}; + +// Top-level JSON keys in dftracer events. Anything else in the query DSL +// (e.g. 
`epoch == 0`, `fhash == "..."`) refers to a field nested under +// "args"; the same convention collect_query_fields relies on when it +// folds nested object keys into the flat ValueMap. +bool is_top_level_event_key(std::string_view k) { + return k == "id" || k == "name" || k == "cat" || k == "pid" || k == "tid" || + k == "ts" || k == "dur" || k == "ph"; +} + +// Walk a CompareNode-with-EQ leaf into a probe. Returns false on +// unsupported shapes (more than one '.' or a literal type the simdjson +// get_X path can't compare directly). +bool compile_eq_leaf(const common::query::CompareNode& n, + CompiledEqProbe& out) { + if (n.op != common::query::CompareOp::EQ) return false; + auto dot = n.field.path.find('.'); + if (dot == std::string::npos) { + if (is_top_level_event_key(n.field.path)) { + out.top_key = n.field.path; + out.nested_key.clear(); + } else { + // Bare arg-style key: foo -> args.foo. + out.top_key = "args"; + out.nested_key = n.field.path; + } + } else { + if (n.field.path.find('.', dot + 1) != std::string::npos) return false; + out.top_key = n.field.path.substr(0, dot); + out.nested_key = n.field.path.substr(dot + 1); + } + return std::visit( + [&out](auto&& v) -> bool { + using T = std::decay_t; + if constexpr (std::is_same_v) { + out.kind = CompiledEqProbe::Kind::String; + out.s_val = v; + return true; + } else if constexpr (std::is_same_v) { + out.kind = CompiledEqProbe::Kind::Int64; + out.i64_val = v; + return true; + } else if constexpr (std::is_same_v) { + out.kind = CompiledEqProbe::Kind::UInt64; + out.u64_val = v; + return true; + } else if constexpr (std::is_same_v) { + out.kind = CompiledEqProbe::Kind::Double; + out.d_val = v; + return true; + } else if constexpr (std::is_same_v) { + out.kind = CompiledEqProbe::Kind::Bool; + out.b_val = v; + return true; + } else { + return false; + } + }, + n.value.value); +} + +// Try to compile the query AST as an AND of EQ leaves. nullopt on +// unsupported shapes; the ValueMap path handles those. 
+std::optional> try_compile_eq_probes( + const common::query::QueryNode& node) { + using namespace common::query; + return std::visit( + [&](const auto& n) -> std::optional> { + using T = std::decay_t; + if constexpr (std::is_same_v) { + CompiledEqProbe p; + if (!compile_eq_leaf(n, p)) return std::nullopt; + return std::vector{std::move(p)}; + } else if constexpr (std::is_same_v) { + auto l = try_compile_eq_probes(*n.left); + if (!l) return std::nullopt; + auto r = try_compile_eq_probes(*n.right); + if (!r) return std::nullopt; + l->insert(l->end(), std::make_move_iterator(r->begin()), + std::make_move_iterator(r->end())); + return l; + } else { + return std::nullopt; + } + }, + node.data); +} + +bool probe_matches_value(const CompiledEqProbe& p, + simdjson::ondemand::value val) { + switch (p.kind) { + case CompiledEqProbe::Kind::String: { + auto r = val.get_string(); + if (r.error()) return false; + auto sv = r.value_unsafe(); + return sv.size() == p.s_val.size() && + std::memcmp(sv.data(), p.s_val.data(), sv.size()) == 0; + } + case CompiledEqProbe::Kind::Int64: { + auto t = val.type(); + if (t.error()) return false; + if (t.value_unsafe() == simdjson::ondemand::json_type::number) { + auto num = val.get_number(); + if (num.error()) return false; + auto n = num.value_unsafe(); + if (n.is_int64()) return n.get_int64() == p.i64_val; + if (n.is_uint64()) { + if (p.i64_val < 0) return false; + return n.get_uint64() == + static_cast(p.i64_val); + } + return n.get_double() == static_cast(p.i64_val); + } + return false; + } + case CompiledEqProbe::Kind::UInt64: { + auto num = val.get_number(); + if (num.error()) return false; + auto n = num.value_unsafe(); + if (n.is_uint64()) return n.get_uint64() == p.u64_val; + if (n.is_int64()) { + auto v = n.get_int64(); + if (v < 0) return false; + return static_cast(v) == p.u64_val; + } + return n.get_double() == static_cast(p.u64_val); + } + case CompiledEqProbe::Kind::Double: { + auto r = val.get_double(); + if (r.error()) 
return false; + return r.value_unsafe() == p.d_val; + } + case CompiledEqProbe::Kind::Bool: { + auto r = val.get_bool(); + if (r.error()) return false; + return r.value_unsafe() == p.b_val; + } + } + return false; +} + +// Evaluate compiled AND-of-EQ probes by directly probing simdjson fields. +bool eval_compiled_eq(const std::vector& probes, + simdjson::ondemand::document_reference doc) { + for (const auto& p : probes) { + doc.rewind(); + auto top_r = doc.find_field_unordered( + std::string_view(p.top_key.data(), p.top_key.size())); + if (top_r.error()) return false; + auto top_v = top_r.value(); + if (p.nested_key.empty()) { + if (!probe_matches_value(p, top_v)) return false; + } else { + auto obj_r = top_v.get_object(); + if (obj_r.error()) return false; + auto inner_r = obj_r.value().find_field_unordered( + std::string_view(p.nested_key.data(), p.nested_key.size())); + if (inner_r.error()) return false; + if (!probe_matches_value(p, inner_r.value())) return false; + } + } + return true; +} + +LinePrefilter build_prefilter(const Query& q) { + // Short literals like `"pid":1000` or `"epoch":0` are common enough in + // practice that memmem on every line costs more than it saves on the + // parse side. Only keep literals long enough that rarity is plausible + // (hashes, filenames, host names). 
+ constexpr std::size_t MIN_LITERAL_LEN = 16; + + LinePrefilter pf; + std::vector tmp; + if (collect_and_eq_literals(q.root(), tmp)) { + for (auto& lit : tmp) { + if (lit.size() >= MIN_LITERAL_LEN) { + pf.required.push_back(std::move(lit)); + } + } + } + return pf; +} + +coro::AsyncGenerator yield_lines_from_stream( + std::unique_ptr stream, std::size_t start_line_num, + const Query* query, bool chunk_prune_only = false, + const LinePrefilter* prefilter = nullptr) { + std::size_t line_num = start_line_num; + while (!stream->done()) { + auto chunk = co_await stream->read_async(); + if (chunk.empty()) break; + const char* data = chunk.data(); + std::size_t len = chunk.size(); + + // Chunk-level pre-filter: if any required literal is absent from this + // entire buffer, no line within it can match. Skip without splitting. + // Line numbers must stay correct for subsequent chunks. + if (prefilter && !prefilter->empty() && + !prefilter->may_match(std::string_view(data, len))) { + line_num += std::count(data, data + len, '\n'); + continue; + } + + std::size_t pos = 0; + while (pos < len) { + const void* nl_ptr = std::memchr(data + pos, '\n', len - pos); + std::size_t end_pos = + nl_ptr ? 
static_cast(nl_ptr) - data : len; + if (end_pos > pos) { + auto line_sv = std::string_view(data + pos, end_pos - pos); + bool accept = chunk_prune_only || !query || + line_matches_query(*query, line_sv); + if (accept && prefilter && !prefilter->empty() && + !prefilter->may_match(line_sv)) { + accept = false; + } + if (accept) { + co_yield Line(line_sv, line_num); + } + ++line_num; + } else { + ++line_num; + } + pos = end_pos + 1; + } + } +} + +coro::AsyncGenerator yield_lines_from_ranges( + std::shared_ptr reader, std::vector ranges, + std::size_t buffer_size, Query query, bool chunk_prune_only = false, + LinePrefilter prefilter = {}) { + for (const auto& range : ranges) { + auto stream = + reader->stream(internal::StreamConfig() + .stream_type(internal::StreamType::MULTI_LINES) + .range_type(internal::RangeType::LINE_RANGE) + .from(range.start_line) + .to(range.end_line) + .buffer_size(buffer_size)); + auto gen = + yield_lines_from_stream(std::move(stream), range.start_line, &query, + chunk_prune_only, &prefilter); + while (auto line = co_await gen.next()) { + co_yield *line; + } + } +} + +// Raw-chunk variants of the yield/read helpers. Same pruning logic as the +// line-yielding flavors but emit std::span buffers untouched +// (multi-line boundary respected by stream type). Used by read_json to run +// simdjson iterate_many over each chunk instead of parsing line by line. 
+coro::AsyncGenerator> yield_chunks_from_stream( + std::unique_ptr stream, + const LinePrefilter* prefilter = nullptr) { + while (!stream->done()) { + auto chunk = co_await stream->read_async(); + if (chunk.empty()) break; + if (prefilter && !prefilter->empty() && + !prefilter->may_match( + std::string_view(chunk.data(), chunk.size()))) { + continue; + } + co_yield chunk; + } +} + +coro::AsyncGenerator> yield_chunks_from_ranges( + std::shared_ptr reader, std::vector ranges, + std::size_t buffer_size, LinePrefilter prefilter = {}) { + for (const auto& range : ranges) { + auto stream = + reader->stream(internal::StreamConfig() + .stream_type(internal::StreamType::MULTI_LINES) + .range_type(internal::RangeType::LINE_RANGE) + .from(range.start_line) + .to(range.end_line) + .buffer_size(buffer_size)); + auto gen = yield_chunks_from_stream(std::move(stream), &prefilter); + while (auto chunk = co_await gen.next()) { + co_yield *chunk; + } + } +} + +coro::AsyncGenerator> read_chunks_indexed( + std::shared_ptr reader, std::string index_path, + std::string file_path, ReadConfig config, std::optional query, + bool extend_to_line_boundary = false) { + // Keep RocksDB alive for the generator's lifetime so per-method opens + // in GzipIndexer reuse DBManager's cached handle. + std::optional db_keep_alive; + if (!index_path.empty()) { + try { + db_keep_alive.emplace(index_path, + rocksdb::RocksDatabase::OpenMode::ReadOnly); + } catch (...) { + } + } + + LinePrefilter prefilter = query ? build_prefilter(*query) : LinePrefilter{}; + auto range_type = config.has_line_range() ? internal::RangeType::LINE_RANGE + : internal::RangeType::BYTE_RANGE; + std::size_t start = + config.has_line_range() ? config.start_line : config.start_byte; + std::size_t end = + config.has_line_range() ? 
config.end_line : config.end_byte; + + if (range_type == internal::RangeType::LINE_RANGE) { + auto total_lines = reader->get_num_lines(); + if (start == 0) start = 1; + if (end == 0 || end > total_lines) end = total_lines; + if (start > total_lines) co_return; + } else { + auto max_bytes = reader->get_max_bytes(); + if (end == 0 || end > max_bytes) end = max_bytes; + if (start >= max_bytes) co_return; + } + + if (query && !index_path.empty() && !config.skip_pruning) { + ChunkPrunerInput pruner_input{index_path, file_path, *query, nullptr}; + ChunkPrunerUtility pruner; + auto pruner_out = co_await pruner.process(pruner_input); + if (pruner_out.success && !pruner_out.file_may_match) { + co_return; + } + + if (pruner_out.success && !pruner_out.candidate_checkpoints.empty() && + pruner_out.candidate_checkpoints.size() < + pruner_out.total_checkpoints) { + indexer::IndexDatabase idx_db( + index_path, rocksdb::RocksDatabase::OpenMode::ReadOnly); + auto logical = indexer::internal::get_logical_path(file_path); + int fid = idx_db.get_file_info_id(logical); + + if (fid >= 0) { + auto all_ckpts = idx_db.query_checkpoints(fid); + std::unordered_map + ckpt_map; + for (auto& ckpt : all_ckpts) { + ckpt_map.emplace(ckpt.checkpoint_idx, std::move(ckpt)); + } + + std::vector ranges; + std::uint64_t prev_idx = UINT64_MAX; + for (auto ckpt_idx : pruner_out.candidate_checkpoints) { + auto it = ckpt_map.find(ckpt_idx); + if (it == ckpt_map.end()) continue; + const auto& ckpt = it->second; + // Intersect with the caller's window (byte or line) so + // checkpoint-level parallel work items stay disjoint. 
+ if (range_type == internal::RangeType::BYTE_RANGE) { + std::size_t ckpt_start = ckpt.uc_offset; + std::size_t ckpt_end = ckpt.uc_offset + ckpt.uc_size; + if (ckpt_end <= start) continue; + if (ckpt_start >= end) continue; + } else { + if (ckpt.last_line_num < start) continue; + if (ckpt.first_line_num > end) continue; + } + if (ranges.empty() || ckpt_idx != prev_idx + 1) { + ranges.push_back( + {ckpt.first_line_num, ckpt.last_line_num}); + } else { + ranges.back().end_line = ckpt.last_line_num; + } + prev_idx = ckpt_idx; + } + + if (ranges.empty()) { + co_return; + } + + auto gen = yield_chunks_from_ranges( + reader, std::move(ranges), config.buffer_size, prefilter); + while (auto chunk = co_await gen.next()) { + co_yield *chunk; + } + co_return; + } + } + } + + auto stream_type = (range_type == internal::RangeType::BYTE_RANGE) + ? internal::StreamType::MULTI_LINES_BYTES + : internal::StreamType::MULTI_LINES; + auto stream = + reader->stream(internal::StreamConfig() + .stream_type(stream_type) + .range_type(range_type) + .from(start) + .to(end) + .buffer_size(config.buffer_size) + .extend_to_line_boundary( + extend_to_line_boundary && + range_type == internal::RangeType::BYTE_RANGE)); + + auto gen = yield_chunks_from_stream(std::move(stream), &prefilter); + while (auto chunk = co_await gen.next()) { + co_yield *chunk; + } +} + +coro::AsyncGenerator read_lines_indexed( + std::shared_ptr reader, std::string index_path, + std::string file_path, ReadConfig config, std::optional query, + bool chunk_prune_only = false) { + // Keep RocksDB alive for the generator's lifetime so per-method opens + // in GzipIndexer reuse DBManager's cached handle. + std::optional db_keep_alive; + if (!index_path.empty()) { + try { + db_keep_alive.emplace(index_path, + rocksdb::RocksDatabase::OpenMode::ReadOnly); + } catch (...) { + } + } + + LinePrefilter prefilter = query ? build_prefilter(*query) : LinePrefilter{}; + auto range_type = config.has_line_range() ? 
internal::RangeType::LINE_RANGE + : internal::RangeType::BYTE_RANGE; + std::size_t start = + config.has_line_range() ? config.start_line : config.start_byte; + std::size_t end = + config.has_line_range() ? config.end_line : config.end_byte; + + if (range_type == internal::RangeType::LINE_RANGE) { + auto total_lines = reader->get_num_lines(); + if (start == 0) start = 1; + if (end == 0 || end > total_lines) end = total_lines; + if (start > total_lines) co_return; + } else { + auto max_bytes = reader->get_max_bytes(); + if (end == 0 || end > max_bytes) end = max_bytes; + if (start >= max_bytes) co_return; + } + + if (query && !index_path.empty() && + range_type == internal::RangeType::BYTE_RANGE) { + ChunkPrunerInput pruner_input{index_path, file_path, *query, nullptr}; + ChunkPrunerUtility pruner; + auto pruner_out = co_await pruner.process(pruner_input); + if (pruner_out.success && !pruner_out.file_may_match) { + co_return; + } + + if (pruner_out.success && !pruner_out.candidate_checkpoints.empty() && + pruner_out.candidate_checkpoints.size() < + pruner_out.total_checkpoints) { + indexer::IndexDatabase idx_db( + index_path, rocksdb::RocksDatabase::OpenMode::ReadOnly); + auto logical = indexer::internal::get_logical_path(file_path); + int fid = idx_db.get_file_info_id(logical); + + if (fid >= 0) { + auto all_ckpts = idx_db.query_checkpoints(fid); + std::unordered_map + ckpt_map; + for (auto& ckpt : all_ckpts) { + ckpt_map.emplace(ckpt.checkpoint_idx, std::move(ckpt)); + } + + std::vector ranges; + std::uint64_t prev_idx = UINT64_MAX; + + for (auto ckpt_idx : pruner_out.candidate_checkpoints) { + auto it = ckpt_map.find(ckpt_idx); + if (it == ckpt_map.end()) continue; + const auto& ckpt = it->second; + + if (ranges.empty() || ckpt_idx != prev_idx + 1) { + ranges.push_back( + {ckpt.first_line_num, ckpt.last_line_num}); + } else { + ranges.back().end_line = ckpt.last_line_num; + } + prev_idx = ckpt_idx; + } + + auto gen = yield_lines_from_ranges(reader, 
std::move(ranges), + config.buffer_size, *query, + chunk_prune_only, prefilter); + while (auto line = co_await gen.next()) { + co_yield *line; + } + co_return; + } + } + } + + auto stream = + reader->stream(internal::StreamConfig() + .stream_type(internal::StreamType::MULTI_LINES) + .range_type(range_type) + .from(start) + .to(end) + .buffer_size(config.buffer_size)); + + auto gen = yield_lines_from_stream(std::move(stream), start, + query ? &*query : nullptr, + chunk_prune_only, &prefilter); + while (auto line = co_await gen.next()) { + co_yield *line; + } +} + +coro::AsyncGenerator read_lines_gz(std::string file_path, + ReadConfig config, + std::optional query, + bool chunk_prune_only = false) { + std::size_t start = config.has_line_range() ? config.start_line : 0; + std::size_t end = config.has_line_range() ? config.end_line : 0; + auto gen = + fileio::lines::sources::async_streaming_gz_lines(file_path, start, end); + while (auto opt = co_await gen.next()) { + if (chunk_prune_only || !query || + line_matches_query(*query, opt->content)) { + co_yield *opt; + } + } +} + +coro::AsyncGenerator read_lines_plain_bytes( + std::string file_path, ReadConfig config, std::optional query, + bool chunk_prune_only = false) { + auto gen = fileio::lines::sources::async_plain_file_bytes( + file_path, config.start_byte, config.end_byte, config.buffer_size); + while (auto opt = co_await gen.next()) { + if (chunk_prune_only || !query || + line_matches_query(*query, opt->content)) { + co_yield *opt; + } + } +} + +coro::AsyncGenerator read_lines_plain(std::string file_path, + ReadConfig config, + std::optional query, + bool chunk_prune_only = false) { + std::size_t start = config.has_line_range() ? config.start_line : 0; + std::size_t end = config.has_line_range() ? 
config.end_line : 0; + auto gen = + fileio::lines::sources::async_plain_file_lines(file_path, start, end); + while (auto opt = co_await gen.next()) { + if (chunk_prune_only || !query || + line_matches_query(*query, opt->content)) { + co_yield *opt; + } } - yyjson_doc_free(doc); - return result; } } // namespace @@ -119,104 +786,223 @@ coro::AsyncGenerator TraceReader::read_lines(ReadConfig config) { query = std::move(*parsed); } + bool cpo = config.chunk_prune_only; + if (has_index_) { - auto reader = create_indexed_reader(); - auto range_type = resolve_range_type(config); - std::size_t start = - config.has_line_range() ? config.start_line : config.start_byte; - std::size_t end = - config.has_line_range() ? config.end_line : config.end_byte; + return read_lines_indexed(create_indexed_reader(), index_path_, + config_.file_path, std::move(config), + std::move(query), cpo); + } + if (format_ == ArchiveFormat::GZIP || format_ == ArchiveFormat::TAR_GZ) { + return read_lines_gz(config_.file_path, std::move(config), + std::move(query), cpo); + } + if (config.has_byte_range()) { + return read_lines_plain_bytes(config_.file_path, std::move(config), + std::move(query), cpo); + } + return read_lines_plain(config_.file_path, std::move(config), + std::move(query), cpo); +} - if (range_type == internal::RangeType::LINE_RANGE) { - auto total_lines = reader->get_num_lines(); - if (start == 0) start = 1; - if (end == 0 || end > total_lines) end = total_lines; - if (start > total_lines) co_return; - } else { - auto max_bytes = reader->get_max_bytes(); - if (end == 0 || end > max_bytes) end = max_bytes; - if (start >= max_bytes) co_return; - } +namespace { - if (has_index_ && query && !index_path_.empty() && - range_type == internal::RangeType::BYTE_RANGE) { - ChunkPrunerInput pruner_input{index_path_, config_.file_path, - *query, nullptr}; - ChunkPrunerUtility pruner; - auto pruner_out = co_await pruner.process(pruner_input); - if (pruner_out.success && !pruner_out.file_may_match) { 
- co_return; +common::query::LiteralValue ondemand_to_literal(simdjson::ondemand::value val) { + auto type = val.type().value_unsafe(); + switch (type) { + case simdjson::ondemand::json_type::string: { + auto r = val.get_string(); + if (!r.error()) return std::string(r.value_unsafe()); + break; + } + case simdjson::ondemand::json_type::number: { + auto num = val.get_number(); + if (!num.error()) { + auto n = num.value_unsafe(); + if (n.is_int64()) return n.get_int64(); + if (n.is_uint64()) return n.get_uint64(); + return n.get_double(); } + break; + } + case simdjson::ondemand::json_type::boolean: { + auto r = val.get_bool(); + if (!r.error()) return r.value_unsafe(); + break; } + default: + break; + } + return std::string{}; +} - auto stream = - reader->stream(internal::StreamConfig() - .stream_type(internal::StreamType::MULTI_LINES) - .range_type(range_type) - .from(start) - .to(end) - .buffer_size(config.buffer_size)); +} // namespace - std::size_t line_num = start; - while (!stream->done()) { - auto chunk = co_await stream->read_async(); - if (chunk.empty()) break; - const char* data = chunk.data(); - std::size_t len = chunk.size(); - std::size_t pos = 0; - while (pos < len) { - const void* nl_ptr = std::memchr(data + pos, '\n', len - pos); - std::size_t end_pos = - nl_ptr ? static_cast(nl_ptr) - data : len; - if (end_pos > pos) { - auto line_sv = std::string_view(data + pos, end_pos - pos); - if (!query || line_matches_query(*query, line_sv)) { - co_yield Line(line_sv, line_num); +coro::AsyncGenerator TraceReader::read_json(ReadConfig config) { + std::optional query; + if (!config.query.empty()) { + auto parsed = Query::from_string(config.query); + if (!parsed) throw common::query::QueryParseError(parsed.error()); + query = std::move(*parsed); + } + + // chunk_prune_only path: dim_stats already proved every event with the + // predicate field matches; we still need to skip events lacking the + // field (e.g., metadata "ph":"M" events). 
Field-presence probe is + // cheaper than full ValueMap eval. + std::vector presence_check_paths; + if (query && config.chunk_prune_only) { + const auto& fset = query->fields(); + presence_check_paths.assign(fset.begin(), fset.end()); + } + + // Fast path: indexed gz files go through a chunk generator with + // simdjson iterate_many. Query is evaluated on the ondemand document + // directly, so non-matching docs never hit the yield_parser. + if (has_index_) { + auto reader = create_indexed_reader(); + auto chunk_gen = read_chunks_indexed(reader, index_path_, + config_.file_path, config, query); + + simdjson::ondemand::parser bulk_parser; + common::json::JsonParser yield_parser; + + while (auto chunk_opt = co_await chunk_gen.next()) { + auto chunk = *chunk_opt; + if (chunk.empty()) continue; + auto trimmed = strip_ndjson_bookends( + std::string_view(chunk.data(), chunk.size())); + if (trimmed.empty()) continue; + simdjson::padded_string padded(trimmed); + + auto docs_r = bulk_parser.iterate_many( + padded, 1 << 20, /*allow_comma_separated=*/false); + if (docs_r.error()) continue; + auto& docs = docs_r.value(); + + for (auto it = docs.begin(); it != docs.end(); ++it) { + auto doc_result = *it; + if (doc_result.error()) continue; + auto& doc = doc_result.value(); + + std::string_view src(it.source().data(), it.source().size()); + + if (query && config.chunk_prune_only) { + bool all_present = true; + for (const auto& path : presence_check_paths) { + auto fld = doc.find_field_unordered(path); + if (fld.error()) { + all_present = false; + break; + } } - ++line_num; - } else { - ++line_num; + if (!all_present) continue; + doc.rewind(); + } else if (query) { + common::query::ValueMap fields; + auto obj = doc.get_object(); + if (obj.error()) continue; + for (auto field : obj.value()) { + if (field.error()) continue; + auto key_r = field.unescaped_key(); + if (key_r.error()) continue; + auto val_r = field.value(); + if (val_r.error()) continue; + auto key = key_r.value(); + 
auto val = val_r.value(); + auto type_r = val.type(); + if (type_r.error()) continue; + auto type = type_r.value(); + if (type == simdjson::ondemand::json_type::object) { + auto nested = val.get_object(); + if (nested.error()) continue; + for (auto nf : nested.value()) { + if (nf.error()) continue; + auto nk_r = nf.unescaped_key(); + if (nk_r.error()) continue; + auto nv_r = nf.value(); + if (nv_r.error()) continue; + auto nk = nk_r.value(); + if (!query->references(nk)) continue; + fields[std::string(nk)] = + ondemand_to_literal(nv_r.value()); + } + } else if (query->references(key)) { + fields[std::string(key)] = ondemand_to_literal(val); + } + } + if (!query->evaluate(fields)) continue; } - pos = end_pos + 1; - } - } - } else if (format_ == ArchiveFormat::GZIP || - format_ == ArchiveFormat::TAR_GZ) { - std::size_t start = config.has_line_range() ? config.start_line : 0; - std::size_t end = config.has_line_range() ? config.end_line : 0; - auto gen = fileio::lines::sources::async_streaming_gz_lines( - config_.file_path, start, end); - while (auto opt = co_await gen.next()) { - if (!query || line_matches_query(*query, opt->content)) { - co_yield *opt; - } - } - } else if (config.has_byte_range()) { - // Plain file with byte range - auto gen = fileio::lines::sources::async_plain_file_bytes( - config_.file_path, config.start_byte, config.end_byte, - config.buffer_size); - while (auto opt = co_await gen.next()) { - if (!query || line_matches_query(*query, opt->content)) { - co_yield *opt; + + // Matched (or no query): lend the iterate_many doc to + // yield_parser without re-parsing. Consumers like + // build_arrow_row call parser.for_each_field which now + // iterates the borrowed doc_reference. + doc.rewind(); + yield_parser.set_borrowed_document( + simdjson::ondemand::document_reference(doc)); + co_yield JsonLine{src, 0, &yield_parser}; } } - } else { - std::size_t start = config.has_line_range() ? 
config.start_line : 0; - std::size_t end = config.has_line_range() ? config.end_line : 0; - auto gen = fileio::lines::sources::async_plain_file_lines( - config_.file_path, start, end); - while (auto opt = co_await gen.next()) { - if (!query || line_matches_query(*query, opt->content)) { - co_yield *opt; + co_return; + } + + // Fallback: non-indexed paths use the per-line pipeline unchanged. + config.chunk_prune_only = true; + auto line_gen = read_lines(config); + + common::json::JsonParser parser; + + while (auto opt = co_await line_gen.next()) { + const char* trimmed; + std::size_t trimmed_len; + if (!dftracer::utils::json_trim_and_validate_with_comma( + opt->content.data(), opt->content.size(), trimmed, trimmed_len)) + continue; + if (!parser.parse(std::string_view(trimmed, trimmed_len))) continue; + + if (query) { + common::query::ValueMap fields; + std::vector nested_keys; + parser.for_each_field( + [&](std::string_view key, simdjson::ondemand::value val) { + auto type = val.type().value_unsafe(); + if (type == simdjson::ondemand::json_type::object) { + nested_keys.emplace_back(key); + } else if (query->references(key)) { + fields[std::string(key)] = ondemand_to_literal(val); + } + }); + for (auto& nk : nested_keys) { + parser.rewind(); + parser.for_each_field(nk, [&](std::string_view key, + simdjson::ondemand::value val) { + if (query->references(key)) { + fields[std::string(key)] = ondemand_to_literal(val); + } + }); } + if (!query->evaluate(fields)) continue; + parser.rewind(); } + + co_yield JsonLine{opt->content, opt->line_number, &parser}; } } coro::AsyncGenerator> TraceReader::read_raw( ReadConfig config) { if (has_index_) { + // Keep RocksDB alive for the generator's lifetime so per-method + // opens in GzipIndexer reuse DBManager's cached handle. + std::optional db_keep_alive; + if (!index_path_.empty()) { + try { + db_keep_alive.emplace( + index_path_, rocksdb::RocksDatabase::OpenMode::ReadOnly); + } catch (...) 
{ + } + } auto reader = create_indexed_reader(); auto stream_type = resolve_raw_stream_type(config); auto range_type = resolve_range_type(config); @@ -236,7 +1022,7 @@ coro::AsyncGenerator> TraceReader::read_raw( if (start >= max_bytes) co_return; } - if (has_index_ && !config.query.empty() && !index_path_.empty() && + if (!config.query.empty() && !index_path_.empty() && range_type == internal::RangeType::BYTE_RANGE) { auto parsed = Query::from_string(config.query); if (!parsed) throw common::query::QueryParseError(parsed.error()); @@ -293,4 +1079,510 @@ coro::AsyncGenerator> TraceReader::read_raw( } } +#ifdef DFTRACER_UTILS_ENABLE_ARROW + +namespace { + +using common::arrow::ArrowExportResult; +using common::arrow::ColumnType; +using common::arrow::RecordBatchBuilder; + +// Bump arena for string_views that must survive until builder.finish(). +struct ArrowStringArena { + static constexpr std::size_t BLOCK_SIZE = 64 * 1024; + std::vector> blocks; + std::size_t pos = 0; + + ArrowStringArena() { blocks.emplace_back(BLOCK_SIZE); } + + std::string_view push(const char* data, std::size_t len) { + if (pos + len > blocks.back().size()) { + blocks.emplace_back(std::max(BLOCK_SIZE, len)); + pos = 0; + } + char* dst = blocks.back().data() + pos; + std::memcpy(dst, data, len); + pos += len; + return {dst, len}; + } + + void clear() { + if (blocks.size() > 1) blocks.resize(1); + pos = 0; + } +}; + +struct ArrowKeyHint { + std::string key; + std::size_t col_idx = 0; + ColumnType type = ColumnType::INT64; + bool valid = false; +}; + +inline std::size_t resolve_col_idx(RecordBatchBuilder& builder, + std::vector& hints, + std::size_t pos, std::string_view key_sv, + ColumnType type) { + if (pos < hints.size()) { + auto& h = hints[pos]; + if (h.valid && h.type == type && h.key.size() == key_sv.size() && + std::memcmp(h.key.data(), key_sv.data(), key_sv.size()) == 0) { + return h.col_idx; + } + } + // Position-keyed miss. 
Variable-shape rows (e.g., open vs read events + // with different args fields) push fields to different positions, so + // the position cache misses constantly while the underlying schema is + // small (~15 keys). A linear scan over the hint vector with a SIMD + // memcmp beats RecordBatchBuilder's name_to_index_ hash lookup for this + // size. + for (std::size_t i = 0; i < hints.size(); ++i) { + if (i == pos) continue; + auto& h = hints[i]; + if (h.valid && h.type == type && h.key.size() == key_sv.size() && + std::memcmp(h.key.data(), key_sv.data(), key_sv.size()) == 0) { + if (pos < hints.size()) { + auto& slot = hints[pos]; + slot.key.assign(key_sv); + slot.type = type; + slot.col_idx = h.col_idx; + slot.valid = true; + } + return h.col_idx; + } + } + std::size_t idx = builder.add_or_get_column(key_sv, type); + if (pos >= hints.size()) hints.resize(pos + 1); + auto& h = hints[pos]; + h.key.assign(key_sv); + h.type = type; + h.col_idx = idx; + h.valid = true; + return idx; +} + +// Append a typed scalar value under `key_sv`. Nested objects/arrays are +// always round-tripped as JSON strings (flattening is one level only). 
+void append_scalar_or_json(RecordBatchBuilder& builder, + std::vector& hints, std::size_t& pos, + std::string_view key_sv, + simdjson::ondemand::value val, + simdjson::ondemand::json_type type) { + switch (type) { + case simdjson::ondemand::json_type::number: { + auto num_r = val.get_number(); + if (num_r.error()) break; + auto num = num_r.value(); + if (num.is_int64()) { + auto idx = resolve_col_idx(builder, hints, pos++, key_sv, + ColumnType::INT64); + builder.append_int64(idx, num.get_int64()); + } else if (num.is_uint64()) { + auto idx = resolve_col_idx(builder, hints, pos++, key_sv, + ColumnType::UINT64); + builder.append_uint64(idx, num.get_uint64()); + } else { + auto idx = resolve_col_idx(builder, hints, pos++, key_sv, + ColumnType::DOUBLE); + builder.append_double(idx, num.get_double()); + } + break; + } + case simdjson::ondemand::json_type::string: { + auto str_r = val.get_string(); + if (str_r.error()) break; + auto idx = resolve_col_idx(builder, hints, pos++, key_sv, + ColumnType::STRING); + builder.append_string(idx, str_r.value()); + break; + } + case simdjson::ondemand::json_type::boolean: { + auto b_r = val.get_bool(); + if (b_r.error()) break; + auto idx = resolve_col_idx(builder, hints, pos++, key_sv, + ColumnType::BOOL); + builder.append_bool(idx, b_r.value()); + break; + } + case simdjson::ondemand::json_type::null: { + auto existing = builder.find_column(key_sv); + if (existing) builder.append_null(*existing); + ++pos; + break; + } + case simdjson::ondemand::json_type::object: + case simdjson::ondemand::json_type::array: { + auto raw_r = val.raw_json(); + auto idx = resolve_col_idx(builder, hints, pos++, key_sv, + ColumnType::STRING); + if (!raw_r.error()) { + auto sv = raw_r.value(); + builder.append_string(idx, sv); + } else { + builder.append_null(idx); + } + break; + } + default: + ++pos; + break; + } +} + +// Append one Arrow row from an already-parsed simdjson document. +// Dynamic schema: new columns appended as they appear. 
When flatten_objects +// is true, top-level object values are expanded one level into `parent.child` +// columns; deeper nesting still lands as a JSON string under the flattened +// key. Returns false on error paths so callers can skip the row. +bool arrow_row_from_doc(RecordBatchBuilder& builder, + std::vector& hints, + simdjson::ondemand::document_reference doc, + bool flatten_objects = false) { + auto obj_result = doc.get_object(); + if (obj_result.error()) return false; + char key_buf[512]; + std::size_t pos = 0; + for (auto field : obj_result.value()) { + if (field.error()) continue; + auto key_r = field.unescaped_key(); + if (key_r.error()) continue; + auto key_sv = key_r.value(); + auto val_r = field.value(); + if (val_r.error()) continue; + auto val = val_r.value(); + auto type_r = val.type(); + if (type_r.error()) continue; + auto type = type_r.value(); + + if (flatten_objects && type == simdjson::ondemand::json_type::object) { + auto nested = val.get_object(); + if (nested.error()) continue; + for (auto nf : nested.value()) { + if (nf.error()) continue; + auto nk_r = nf.unescaped_key(); + if (nk_r.error()) continue; + auto nk = nk_r.value(); + auto nv_r = nf.value(); + if (nv_r.error()) continue; + auto nv = nv_r.value(); + auto nt_r = nv.type(); + if (nt_r.error()) continue; + std::size_t needed = key_sv.size() + 1 + nk.size(); + if (needed >= sizeof(key_buf)) continue; + std::memcpy(key_buf, key_sv.data(), key_sv.size()); + key_buf[key_sv.size()] = '.'; + std::memcpy(key_buf + key_sv.size() + 1, nk.data(), nk.size()); + append_scalar_or_json(builder, hints, pos, + std::string_view(key_buf, needed), nv, + nt_r.value()); + } + continue; + } + + append_scalar_or_json(builder, hints, pos, key_sv, val, type); + } + builder.end_row(); + return true; +} + +void collect_query_fields(simdjson::ondemand::document_reference doc, + const Query& query, common::query::ValueMap& out); + +// Run iterate_many over `padded`, build arrow rows, and emit completed +// 
batches via `yield_one`. Updates `carry` with the truncated tail (if any) +// for the caller to prepend to the next chunk. +template +void parse_padded_into_arrow(simdjson::ondemand::parser& bulk_parser, + simdjson::padded_string& padded, + const std::optional& query, bool flatten, + RecordBatchBuilder& builder, + ArrowStringArena& arena, + std::vector& hints, + std::size_t batch_size, std::string* carry, + Yield&& yield_one) { + auto docs_r = bulk_parser.iterate_many(padded, 1 << 20, false); + if (docs_r.error()) { + if (carry) carry->clear(); + return; + } + auto& docs = docs_r.value(); + for (auto it = docs.begin(); it != docs.end(); ++it) { + auto doc_result = *it; + if (doc_result.error()) continue; + auto& doc = doc_result.value(); + if (query) { + common::query::ValueMap fields; + collect_query_fields(doc, *query, fields); + if (!query->evaluate(fields)) continue; + doc.rewind(); + } + if (!arrow_row_from_doc(builder, hints, doc, flatten)) continue; + if (builder.num_rows() >= batch_size) { + auto result = builder.finish(); + arena.clear(); + if (!builder.is_schema_locked()) builder.lock_schema(); + builder.reset(true); + builder.reserve(batch_size); + yield_one(std::move(result)); + } + } + if (carry) { + std::size_t total = padded.size(); + std::size_t truncated = docs.truncated_bytes(); + if (truncated > 0 && truncated <= total) { + carry->assign(padded.data() + total - truncated, + padded.data() + total); + } else { + carry->clear(); + } + } +} + +// Build a simdjson-padded buffer containing only the lines in `chunk` that +// pass the line-level prefilter. For queries with no useful prefilter, the +// caller should skip this and feed the raw chunk directly. 
+std::string collect_matching_lines(std::span chunk, + const LinePrefilter& prefilter) { + std::string out; + out.reserve(chunk.size()); + const char* data = chunk.data(); + std::size_t len = chunk.size(); + std::size_t pos = 0; + while (pos < len) { + const void* nl = std::memchr(data + pos, '\n', len - pos); + std::size_t end_pos = nl ? static_cast(nl) - data : len; + if (end_pos > pos) { + std::string_view line(data + pos, end_pos - pos); + if (prefilter.may_match(line)) { + out.append(line); + out.push_back('\n'); + } + } + pos = end_pos + 1; + } + return out; +} + +// Extract fields referenced by the query into a ValueMap, walking one level +// of object nesting. Fields not referenced by the query are skipped. +void collect_query_fields(simdjson::ondemand::document_reference doc, + const Query& query, common::query::ValueMap& out) { + auto obj = doc.get_object(); + if (obj.error()) return; + for (auto field : obj.value()) { + if (field.error()) continue; + auto key_r = field.unescaped_key(); + if (key_r.error()) continue; + auto val_r = field.value(); + if (val_r.error()) continue; + auto key = key_r.value(); + auto val = val_r.value(); + auto type_r = val.type(); + if (type_r.error()) continue; + auto type = type_r.value(); + if (type == simdjson::ondemand::json_type::object) { + auto nested = val.get_object(); + if (nested.error()) continue; + for (auto nf : nested.value()) { + if (nf.error()) continue; + auto nk_r = nf.unescaped_key(); + if (nk_r.error()) continue; + auto nv_r = nf.value(); + if (nv_r.error()) continue; + if (!query.references(nk_r.value())) continue; + out[std::string(nk_r.value())] = + ondemand_to_literal(nv_r.value()); + } + } else if (query.references(key)) { + out[std::string(key)] = ondemand_to_literal(val); + } + } +} + +} // namespace + +coro::AsyncGenerator TraceReader::read_arrow( + ReadConfig config, std::size_t batch_size) { + std::optional query; + if (!config.query.empty()) { + auto parsed = Query::from_string(config.query); + 
if (!parsed) throw common::query::QueryParseError(parsed.error()); + query = std::move(*parsed); + } + + // When chunk_prune_only is set, dim_stats already proved every event in + // the chunk that has the predicate field matches the literal. We still + // need to skip events that lack the field (e.g., metadata "ph":"M" + // events lack pid), since the original predicate would reject them. + std::vector presence_check_paths; + if (query && config.chunk_prune_only) { + const auto& fset = query->fields(); + presence_check_paths.assign(fset.begin(), fset.end()); + } + + // For AND-of-EQ predicates, evaluate directly against simdjson without + // ValueMap (avoids wyhash + per-field std::string allocation per row). + // Falls back to the ValueMap path on unsupported AST shapes. + std::vector compiled_probes; + bool use_compiled = false; + if (query && !config.chunk_prune_only) { + if (auto p = try_compile_eq_probes(query->root())) { + compiled_probes = std::move(*p); + use_compiled = !compiled_probes.empty(); + } + } + + bool flatten = config.flatten_objects; + + if (!has_index_) { + // Fallback: drive the per-line read_json path and build rows. + auto json_gen = read_json(config); + RecordBatchBuilder builder; + ArrowStringArena arena; + std::vector hints; + builder.reserve(batch_size); + while (auto opt = co_await json_gen.next()) { + if (!arrow_row_from_doc(builder, hints, + simdjson::ondemand::document_reference( + opt->parser->raw_document()), + flatten)) + continue; + if (builder.num_rows() >= batch_size) { + co_yield builder.finish(); + arena.clear(); + if (!builder.is_schema_locked()) builder.lock_schema(); + builder.reset(true); + builder.reserve(batch_size); + } + } + if (builder.num_rows() > 0) { + co_yield builder.finish(); + } + co_return; + } + + // Keep RocksDB alive for the generator's lifetime so per-method opens + // in GzipIndexer reuse DBManager's cached handle. 
+ std::optional db_keep_alive; + if (has_index_ && !index_path_.empty()) { + try { + db_keep_alive.emplace(index_path_, + rocksdb::RocksDatabase::OpenMode::ReadOnly); + } catch (...) { + } + } + + auto reader = create_indexed_reader(); + auto chunk_gen = read_chunks_indexed( + reader, index_path_, config_.file_path, config, query, + /*extend_to_line_boundary=*/config.end_at_checkpoint); + + LinePrefilter prefilter = (query && !config.chunk_prune_only) + ? build_prefilter(*query) + : LinePrefilter{}; + bool have_line_prefilter = !prefilter.empty(); + + simdjson::ondemand::parser bulk_parser; + RecordBatchBuilder builder; + ArrowStringArena arena; + std::vector hints; + builder.reserve(batch_size); + + auto maybe_flush = [&builder, &arena, batch_size]( + bool final) -> std::optional { + if (builder.num_rows() == 0) return std::nullopt; + if (!final && builder.num_rows() < batch_size) return std::nullopt; + auto result = builder.finish(); + arena.clear(); + if (!builder.is_schema_locked()) builder.lock_schema(); + builder.reset(true); + builder.reserve(batch_size); + return result; + }; + + bool first_chunk = true; + while (auto chunk_opt = co_await chunk_gen.next()) { + auto chunk = *chunk_opt; + if (chunk.empty()) continue; + + // Work items with start_byte > 0 begin at a deflate-block boundary + // that is typically mid-line; the previous worker emitted that + // spanning line via its tail-flush, so drop bytes up to (and + // including) the first newline in our first chunk. 
+ if (first_chunk && config.start_byte > 0 && + config.start_at_checkpoint) { + const char* nl = static_cast( + std::memchr(chunk.data(), '\n', chunk.size())); + if (nl) { + std::size_t skip = + static_cast(nl - chunk.data()) + 1; + if (skip < chunk.size()) { + chunk = chunk.subspan(skip); + } else { + first_chunk = false; + continue; + } + } + } + first_chunk = false; + + simdjson::padded_string padded; + if (have_line_prefilter) { + auto collected = collect_matching_lines(chunk, prefilter); + if (collected.empty()) continue; + padded = simdjson::padded_string(std::move(collected)); + } else { + auto trimmed = strip_ndjson_bookends( + std::string_view(chunk.data(), chunk.size())); + if (trimmed.empty()) continue; + padded = simdjson::padded_string(trimmed); + } + + auto docs_r = bulk_parser.iterate_many(padded, 1 << 20, + /*allow_comma_separated=*/false); + if (docs_r.error()) continue; + auto& docs = docs_r.value(); + + for (auto it = docs.begin(); it != docs.end(); ++it) { + auto doc_result = *it; + if (doc_result.error()) continue; + auto& doc = doc_result.value(); + + if (query && !config.chunk_prune_only) { + if (use_compiled) { + if (!eval_compiled_eq(compiled_probes, doc)) continue; + } else { + common::query::ValueMap fields; + collect_query_fields(doc, *query, fields); + if (!query->evaluate(fields)) continue; + } + doc.rewind(); + } else if (!presence_check_paths.empty()) { + bool all_present = true; + for (const auto& path : presence_check_paths) { + auto fld = doc.find_field_unordered(path); + if (fld.error()) { + all_present = false; + break; + } + } + if (!all_present) continue; + doc.rewind(); + } + + if (!arrow_row_from_doc(builder, hints, doc, flatten)) continue; + + if (auto flushed = maybe_flush(/*final=*/false)) { + co_yield std::move(*flushed); + } + } + } + + if (auto flushed = maybe_flush(/*final=*/true)) { + co_yield std::move(*flushed); + } +} + +#endif // DFTRACER_UTILS_ENABLE_ARROW + } // namespace dftracer::utils::utilities::reader diff 
--git a/src/dftracer/utils/utilities/replay/replay.cpp b/src/dftracer/utils/utilities/replay/replay.cpp index 2b34e22b..753af3e4 100644 --- a/src/dftracer/utils/utilities/replay/replay.cpp +++ b/src/dftracer/utils/utilities/replay/replay.cpp @@ -1,22 +1,21 @@ #include #include #include +#include +#include +#include #include #include -#include +#include #include -#include #include #include #include -#include #include #include +#include #include -#include -#include -#include #include #include @@ -24,117 +23,16 @@ namespace dftracer::utils::utilities::replay { namespace { -/** - * Trim whitespace and validate JSON string - */ -bool json_trim_and_validate(const char* input, std::size_t input_length, - const char*& trimmed, std::size_t& trimmed_length) { - if (!input || input_length == 0) { - return false; - } - - // Trim leading whitespace - std::size_t start = 0; - while (start < input_length && - (input[start] == ' ' || input[start] == '\t' || - input[start] == '\n' || input[start] == '\r')) { - start++; - } - - // Trim trailing whitespace and comma - std::size_t end = input_length; - while (end > start && (input[end - 1] == ' ' || input[end - 1] == '\t' || - input[end - 1] == '\n' || input[end - 1] == '\r' || - input[end - 1] == ',')) { - end--; - } - - if (start >= end) { - return false; - } - - trimmed = input + start; - trimmed_length = end - start; - - // Basic validation: must start with '{' and end with '}' - if (trimmed[0] != '{' || trimmed[trimmed_length - 1] != '}') { - return false; - } - - return true; +// Process-wide intern pool for replay strings. Function names, categories, +// and per-file hashes (fhash/hhash) have either small bounded cardinality +// (~tens for cat/name) or stable identity per file (fhash). 
+dftracer::utils::StringIntern& replay_intern() { + static dftracer::utils::StringIntern instance; + return instance; } -/** - * Parse a JSON string value from yyjson - */ -std::string get_json_string(yyjson_val* val, const char* key, - const std::string& default_value = "") { - yyjson_val* field = yyjson_obj_get(val, key); - if (field && yyjson_is_str(field)) { - return yyjson_get_str(field); - } - return default_value; -} - -/** - * Parse a JSON uint64 value from yyjson - */ -std::uint64_t get_json_uint64(yyjson_val* val, const char* key, - std::uint64_t default_value = 0) { - yyjson_val* field = yyjson_obj_get(val, key); - if (field && yyjson_is_uint(field)) { - return yyjson_get_uint(field); - } else if (field && yyjson_is_int(field)) { - std::int64_t int_val = yyjson_get_int(field); - return int_val >= 0 ? static_cast(int_val) - : default_value; - } - return default_value; -} - -/** - * Parse a JSON double value from yyjson - */ -double get_json_double(yyjson_val* val, const char* key, - double default_value = 0.0) { - yyjson_val* field = yyjson_obj_get(val, key); - if (field && yyjson_is_real(field)) { - return yyjson_get_real(field); - } else if (field && yyjson_is_uint(field)) { - return static_cast(yyjson_get_uint(field)); - } else if (field && yyjson_is_int(field)) { - return static_cast(yyjson_get_int(field)); - } - return default_value; -} - -/** - * Get a string value from args object - */ -std::string get_args_string(yyjson_val* root, const char* key, - const std::string& default_value = "") { - yyjson_val* args = yyjson_obj_get(root, "args"); - if (args && yyjson_is_obj(args)) { - return get_json_string(args, key, default_value); - } - return default_value; -} - -/** - * Get an int64 value from args object - */ -std::int64_t get_args_int64(yyjson_val* root, const char* key, - std::int64_t default_value = 0) { - yyjson_val* args = yyjson_obj_get(root, "args"); - if (args && yyjson_is_obj(args)) { - yyjson_val* field = yyjson_obj_get(args, key); - if 
(field && yyjson_is_int(field)) { - return yyjson_get_int(field); - } else if (field && yyjson_is_uint(field)) { - return static_cast(yyjson_get_uint(field)); - } - } - return default_value; +std::string_view intern_sv(std::string_view sv) { + return replay_intern().intern(sv); } /** @@ -161,11 +59,12 @@ bool ensure_directory_exists(const std::string& path) { // ============================================================================= bool PosixExecutor::execute(const Trace& trace, const ReplayConfig& config) { - const std::string& func_name = trace.func_name; + std::string_view func_name = trace.func_name; if (config.dry_run) { - DFTRACER_UTILS_LOG_DEBUG("DRY RUN: Would execute POSIX %s", - func_name.c_str()); + DFTRACER_UTILS_LOG_DEBUG("DRY RUN: Would execute POSIX %.*s", + static_cast(func_name.size()), + func_name.data()); return true; } @@ -186,8 +85,9 @@ bool PosixExecutor::execute(const Trace& trace, const ReplayConfig& config) { return execute_stat(trace, config); } - DFTRACER_UTILS_LOG_DEBUG("Unsupported POSIX function: %s", - func_name.c_str()); + DFTRACER_UTILS_LOG_DEBUG("Unsupported POSIX function: %.*s", + static_cast(func_name.size()), + func_name.data()); return false; } @@ -200,10 +100,17 @@ bool PosixExecutor::execute_open(const Trace& trace, DFTRACER_UTILS_LOG_DEBUG("Executing POSIX open"); if (!trace.fhash.empty()) { - std::string file_path = - config.output_directory.empty() - ? 
("replay_file_" + trace.fhash) - : (config.output_directory + "/replay_file_" + trace.fhash); + std::string file_path; + if (config.output_directory.empty()) { + file_path.reserve(12 + trace.fhash.size()); + file_path = "replay_file_"; + } else { + file_path.reserve(config.output_directory.size() + 13 + + trace.fhash.size()); + file_path = config.output_directory; + file_path += "/replay_file_"; + } + file_path.append(trace.fhash.data(), trace.fhash.size()); ensure_directory_exists(file_path); @@ -231,13 +138,20 @@ bool PosixExecutor::execute_close(const Trace& trace, if (it != open_files_.end()) { close(it->second); open_files_.erase(it); - DFTRACER_UTILS_LOG_DEBUG("Closed file with hash %s", - trace.fhash.c_str()); + DFTRACER_UTILS_LOG_DEBUG("Closed file with hash %.*s", + static_cast(trace.fhash.size()), + trace.fhash.data()); } return true; } +void PosixExecutor::ensure_io_buffer(std::size_t size) { + if (io_buffer_.size() < size) { + io_buffer_.resize(size, 'A'); + } +} + bool PosixExecutor::execute_read(const Trace& trace, const ReplayConfig& config) { DFTRACER_UTILS_LOG_DEBUG("Executing POSIX read (size: %lld)", @@ -245,10 +159,11 @@ bool PosixExecutor::execute_read(const Trace& trace, auto it = open_files_.find(trace.fhash); if (it != open_files_.end() && trace.size > 0) { - std::vector buffer(std::min(static_cast(trace.size), - config.max_file_size)); + std::size_t n = std::min(static_cast(trace.size), + config.max_file_size); + ensure_io_buffer(n); [[maybe_unused]] ssize_t bytes_read = - read(it->second, buffer.data(), buffer.size()); + read(it->second, io_buffer_.data(), n); DFTRACER_UTILS_LOG_DEBUG("Read %zd bytes", bytes_read); } @@ -264,9 +179,9 @@ bool PosixExecutor::execute_write(const Trace& trace, if (it != open_files_.end() && trace.size > 0) { std::size_t write_size = std::min(static_cast(trace.size), config.max_file_size); - std::vector buffer(write_size, 'A'); + ensure_io_buffer(write_size); [[maybe_unused]] ssize_t bytes_written = - 
write(it->second, buffer.data(), buffer.size()); + write(it->second, io_buffer_.data(), write_size); DFTRACER_UTILS_LOG_DEBUG("Wrote %zd bytes", bytes_written); } @@ -295,8 +210,9 @@ bool PosixExecutor::execute_stat([[maybe_unused]] const Trace& trace, DFTRACER_UTILS_LOG_DEBUG("Executing POSIX stat"); if (!trace.fhash.empty()) { - DFTRACER_UTILS_LOG_DEBUG("Would stat file with hash %s", - trace.fhash.c_str()); + DFTRACER_UTILS_LOG_DEBUG("Would stat file with hash %.*s", + static_cast(trace.fhash.size()), + trace.fhash.data()); } return true; @@ -322,16 +238,17 @@ bool DFTracerExecutor::execute(const Trace& trace, const ReplayConfig& config) { if (config.no_sleep) { if (config.verbose && duration_us >= 100000.0) { - std::cout << "DFTracer would sleep for " << std::fixed - << std::setprecision(3) << duration_us / 1000.0 - << " ms for " << trace.func_name << " (skipped)" - << std::endl; + std::printf("DFTracer would sleep for %.3f ms for %.*s (skipped)\n", + duration_us / 1000.0, + static_cast(trace.func_name.size()), + trace.func_name.data()); } } else { if (config.verbose && duration_us >= 100.0) { - std::cout << "DFTracer sleeping for " << std::fixed - << std::setprecision(3) << duration_us / 1000.0 - << " ms for " << trace.func_name << std::endl; + std::printf("DFTracer sleeping for %.3f ms for %.*s\n", + duration_us / 1000.0, + static_cast(trace.func_name.size()), + trace.func_name.data()); } sleep_for_duration(duration_us); } @@ -385,69 +302,106 @@ void ReplayEngine::add_executor(std::unique_ptr executor) { executors_.push_back(std::move(executor)); } -ReplayResult ReplayEngine::replay(const std::string& trace_file, - const std::string& index_file) { - ReplayResult result; +coro::AsyncGenerator ReplayEngine::stream_traces( + const std::vector& files) { + using reader::ReadConfig; + using reader::TraceReader; + using reader::TraceReaderConfig; + + for (const auto& file : files) { + TraceReaderConfig cfg; + cfg.file_path = file; + cfg.auto_build_index = true; + 
TraceReader rdr(std::move(cfg)); + auto gen = rdr.read_json(ReadConfig{}); + while (auto opt = co_await gen.next()) { + if (!opt->parser) continue; + Trace trace; + if (parse_trace_json(*opt->parser, trace)) { + co_yield std::move(trace); + } + } + } +} - DFTRACER_UTILS_LOG_DEBUG("Starting replay of file: %s", trace_file.c_str()); +coro::CoroTask ReplayEngine::run_pipelined( + dftracer::utils::CoroScope& scope, const std::vector& files, + ReplayResult& result, std::size_t channel_capacity) { + coro::Channel ch_instance(channel_capacity); + auto* channel = &ch_instance; + + co_await scope.scope([this, channel, &files, + &result](dftracer::utils::CoroScope& child) + -> coro::CoroTask { + // Producer + child.spawn([this, channel, &files]( + dftracer::utils::CoroScope&) -> coro::CoroTask { + auto producer = channel->producer(); + auto guard = producer.guard(); + auto gen = stream_traces(files); + while (auto trace = co_await gen.next()) { + if (!co_await producer.send(std::move(*trace))) { + co_return; + } + } + co_return; + }); + + // Consumer + child.spawn([this, channel, &result]( + dftracer::utils::CoroScope&) -> coro::CoroTask { + auto consumer = channel->consumer(); + while (auto item = co_await consumer.receive()) { + dispatch_trace(*item, result); + } + co_return; + }); + co_return; + }); - auto start_time = std::chrono::steady_clock::now(); + co_return; +} - try { - // Check if the file is compressed - bool is_compressed = - (trace_file.size() >= 3 && - trace_file.substr(trace_file.size() - 3) == ".gz") || - (trace_file.size() >= 7 && - trace_file.substr(trace_file.size() - 7) == ".tar.gz"); - - if (is_compressed) { - // Handle compressed files with ReaderFactory - std::string index_path = - index_file.empty() ? 
utilities::composites::dft::internal:: - determine_index_path(trace_file, "") - : index_file; - - auto reader = - reader::internal::ReaderFactory::create(trace_file, index_path); - - if (!reader) { - result.error_messages.push_back( - "Failed to create reader for file: " + trace_file); - return result; - } +namespace { - // Create line processor for handling trace lines - ReplayLineProcessor processor(*this, result); +// Sync drive used by the existing replay(file)/replay(vector) entry points. +// Pipeline-driven callers use ReplayEngine::run_pipelined instead. +coro::CoroTask replay_file_async(ReplayEngine* engine, + std::string trace_file, + std::string index_file, + ReplayResult* result) { + using reader::ReadConfig; + using reader::TraceReader; + using reader::TraceReaderConfig; + + TraceReaderConfig cfg; + cfg.file_path = std::move(trace_file); + if (!index_file.empty()) { + cfg.index_dir = std::move(index_file); + } + cfg.auto_build_index = true; + + TraceReader rdr(std::move(cfg)); + auto gen = rdr.read_json(ReadConfig{}); + while (auto opt = co_await gen.next()) { + if (!opt->parser) continue; + engine->process_trace_line(*opt->parser, *result); + } + co_return; +} - // Read all lines using the line processor - reader->read_lines_with_processor(0, reader->get_num_lines(), - processor); - } else { - // Handle plain text files directly - std::ifstream file(trace_file); - if (!file.is_open()) { - result.error_messages.push_back( - "Failed to open plain text file: " + trace_file); - return result; - } +} // namespace - std::string line; - while (std::getline(file, line)) { - // Skip empty lines and bracket lines - if (line.empty() || line == "[" || line == "]") { - continue; - } +ReplayResult ReplayEngine::replay(const std::string& trace_file, + const std::string& index_file) { + ReplayResult result; - // Remove trailing comma if present - if (!line.empty() && line.back() == ',') { - line.pop_back(); - } + DFTRACER_UTILS_LOG_DEBUG("Starting replay of file: %s", 
trace_file.c_str()); - process_trace_line(line, result); - } - } + auto start_time = std::chrono::steady_clock::now(); + try { + replay_file_async(this, trace_file, index_file, &result).get(); } catch (const std::exception& e) { result.error_messages.push_back("Exception during replay: " + std::string(e.what())); @@ -497,14 +451,17 @@ ReplayResult ReplayEngine::replay(const std::vector& trace_files) { return aggregated_result; } -bool ReplayEngine::process_trace_line(const std::string& line, +bool ReplayEngine::process_trace_line(common::json::JsonParser& parser, ReplayResult& result) { Trace trace; - - if (!parse_trace_json(line, trace)) { + if (!parse_trace_json(parser, trace)) { return false; } + dispatch_trace(trace, result); + return true; +} +void ReplayEngine::dispatch_trace(const Trace& trace, ReplayResult& result) { result.total_events++; result.function_counts[trace.func_name]++; result.category_counts[trace.cat]++; @@ -536,12 +493,12 @@ bool ReplayEngine::process_trace_line(const std::string& line, if (config_.max_events > 0 && result.executed_events >= config_.max_events) { // Silently skip - limit already reached - return false; + return; } if (!should_execute_trace(trace)) { result.filtered_events++; - return true; + return; } // Apply timing logic (skip during dry-run or dftracer-mode) @@ -550,6 +507,13 @@ bool ReplayEngine::process_trace_line(const std::string& line, apply_timing(trace); } + // Fidelity-observation point: callers can hook here to capture the + // wall-clock time at which each event is about to be dispatched and + // compare it against the trace timeline. No production paths set this. 
+ if (config_.on_dispatch) { + config_.on_dispatch(trace, std::chrono::steady_clock::now()); + } + // Find and execute with appropriate executor TraceExecutor* executor = find_executor(trace); if (executor) { @@ -565,62 +529,57 @@ bool ReplayEngine::process_trace_line(const std::string& line, result.executed_events++; } else { result.failed_events++; - result.error_messages.push_back("Failed to execute " + - trace.func_name + " with " + - executor->get_name()); + std::string msg = "Failed to execute "; + msg.append(trace.func_name); + msg += " with "; + msg += executor->get_name(); + result.error_messages.push_back(std::move(msg)); } } else { result.failed_events++; if (config_.verbose) { DFTRACER_UTILS_LOG_DEBUG( - "No executor found for function: %s (category: %s)", - trace.func_name.c_str(), trace.cat.c_str()); + "No executor found for function: %.*s (category: %.*s)", + static_cast(trace.func_name.size()), + trace.func_name.data(), static_cast(trace.cat.size()), + trace.cat.data()); } } - - return true; } -bool ReplayEngine::parse_trace_json(const std::string& json_line, +bool ReplayEngine::parse_trace_json(common::json::JsonParser& parser, Trace& trace) { - const char* trimmed; - std::size_t trimmed_length; - if (!json_trim_and_validate(json_line.c_str(), json_line.length(), trimmed, - trimmed_length)) { - return false; - } - - yyjson_doc* doc = yyjson_read(trimmed, trimmed_length, 0); - if (!doc) { - return false; - } + composites::dft::DFTracerEvent ev; + // parse_ondemand returns false only when no "ph" was found; other fields + // are still populated. Match the legacy DOM-based behavior, which keyed + // validity on a non-empty name and treated missing ph as Regular. 
+ composites::dft::DFTracerEvent::parse_ondemand(parser, ev); - yyjson_val* root = yyjson_doc_get_root(doc); - if (!root || !yyjson_is_obj(root)) { - yyjson_doc_free(doc); + if (ev.name.empty()) { return false; } - // Parse basic fields - trace.func_name = get_json_string(root, "name"); - trace.cat = get_json_string(root, "cat"); - std::string phase = get_json_string(root, "ph"); - - trace.pid = get_json_uint64(root, "pid"); - trace.tid = get_json_uint64(root, "tid"); - trace.time_start = get_json_uint64(root, "ts"); - trace.duration = get_json_double(root, "dur"); - trace.time_end = - trace.time_start + static_cast(trace.duration); - - // Parse arguments - trace.fhash = get_args_string(root, "fhash"); - trace.hhash = get_args_string(root, "hhash"); - trace.size = get_args_int64(root, "size", -1); - trace.offset = get_args_int64(root, "offset", -1); - - // Determine trace type - if (phase == "M") { + trace.func_name = intern_sv(ev.name); + trace.cat = intern_sv(ev.cat); + trace.pid = ev.pid; + trace.tid = ev.tid; + trace.time_start = ev.ts; + trace.duration = static_cast(ev.dur); + trace.time_end = trace.time_start + ev.dur; + + // ArgsValueProxy::get returns a view directly into the + // variant's owned string without copying; we then intern so the view + // outlives ev/ArgsMap (which die at the end of this function). + auto fhash_sv = ev.args["fhash"].get(std::string_view{}); + auto hhash_sv = ev.args["hhash"].get(std::string_view{}); + trace.fhash = fhash_sv.empty() ? std::string_view{} : intern_sv(fhash_sv); + trace.hhash = hhash_sv.empty() ? 
std::string_view{} : intern_sv(hhash_sv); + trace.size = + ev.args["size"].get(static_cast(-1)); + trace.offset = + ev.args["offset"].get(static_cast(-1)); + + if (ev.ph == "M") { if (trace.func_name == "FH") { trace.type = TraceType::FileHash; } else if (trace.func_name == "HH") { @@ -632,10 +591,8 @@ bool ReplayEngine::parse_trace_json(const std::string& json_line, trace.type = TraceType::Regular; } - trace.is_valid = !trace.func_name.empty(); - - yyjson_doc_free(doc); - return trace.is_valid; + trace.is_valid = true; + return true; } void ReplayEngine::apply_timing(const Trace& trace) { @@ -644,7 +601,16 @@ void ReplayEngine::apply_timing(const Trace& trace) { } if (!first_timestamp_set_) { + // Anchor BOTH clocks on the first event. The wall-clock anchor was + // initialized at engine construction time, but for any consumer + // path with warmup (e.g. Pipeline producer fills, channel hops), + // that anchor is "behind" by the warmup gap. Without resetting it + // here, the next event sees replay_elapsed >> trace_elapsed and + // we never sleep, collapsing the timing model. The trace-time + // anchor is set on first event regardless, so co-locating the + // wall-clock reset here keeps the two in lockstep. 
first_trace_timestamp_ = trace.time_start; + replay_start_time_ = std::chrono::steady_clock::now(); first_timestamp_set_ = true; return; } @@ -669,17 +635,16 @@ void ReplayEngine::apply_timing(const Trace& trace) { const std::uint64_t MAX_SLEEP_US = 10 * 1000 * 1000; if (sleep_us > MAX_SLEEP_US) { if (config_.verbose) { - std::cout << "Warning: Capping sleep from " - << static_cast(sleep_us) / 1000.0 << " ms to " - << MAX_SLEEP_US / 1000.0 << " ms" << std::endl; + std::printf("Warning: Capping sleep from %.3f ms to %.3f ms\n", + static_cast(sleep_us) / 1000.0, + static_cast(MAX_SLEEP_US) / 1000.0); } sleep_us = MAX_SLEEP_US; } if (config_.verbose && sleep_us > 1000) { - std::cout << "Timing sleep: " - << static_cast(sleep_us) / 1000.0 << " ms" - << std::endl; + std::printf("Timing sleep: %.3f ms\n", + static_cast(sleep_us) / 1000.0); } std::this_thread::sleep_for(std::chrono::microseconds(sleep_us)); @@ -735,15 +700,16 @@ bool ReplayEngine::should_execute_trace(const Trace& trace) const { return false; } - // Check function filters if (!config_.filter_functions.empty()) { - if (config_.filter_functions.find(trace.func_name) == + std::string key(trace.func_name); + if (config_.filter_functions.find(key) == config_.filter_functions.end()) { return false; } } if (!config_.exclude_functions.empty()) { - if (config_.exclude_functions.find(trace.func_name) != + std::string key(trace.func_name); + if (config_.exclude_functions.find(key) != config_.exclude_functions.end()) { return false; } @@ -751,13 +717,15 @@ bool ReplayEngine::should_execute_trace(const Trace& trace) const { // Check category filters if (!config_.filter_categories.empty()) { - if (config_.filter_categories.find(trace.cat) == + std::string key(trace.cat); + if (config_.filter_categories.find(key) == config_.filter_categories.end()) { return false; } } if (!config_.exclude_categories.empty()) { - if (config_.exclude_categories.find(trace.cat) != + std::string key(trace.cat); + if 
(config_.exclude_categories.find(key) != config_.exclude_categories.end()) { return false; } @@ -945,8 +913,8 @@ void ReplayEngine::replay_call_tree_node( ReplayResult& result) { // Convert CallTreeNodeInfo to Trace structure Trace trace; - trace.func_name = node.name; - trace.cat = node.category; + trace.func_name = intern_sv(node.name); + trace.cat = intern_sv(node.category); trace.time_start = node.start_time_us; trace.duration = static_cast(node.duration_us); trace.time_end = trace.time_start + node.duration_us; @@ -975,13 +943,13 @@ void ReplayEngine::replay_call_tree_node( } auto fhash_it = args.find("fhash"); - if (fhash_it != args.end()) { - trace.fhash = fhash_it->second; + if (fhash_it != args.end() && !fhash_it->second.empty()) { + trace.fhash = intern_sv(fhash_it->second); } auto hhash_it = args.find("hhash"); - if (hhash_it != args.end()) { - trace.hhash = hhash_it->second; + if (hhash_it != args.end() && !hhash_it->second.empty()) { + trace.hhash = intern_sv(hhash_it->second); } auto size_it = args.find("size"); @@ -1064,99 +1032,85 @@ void ReplayEngine::replay_call_tree_node( result.executed_events++; } else { result.failed_events++; - result.error_messages.push_back("Failed to execute " + - trace.func_name + " with " + - executor->get_name()); + std::string msg = "Failed to execute "; + msg.append(trace.func_name); + msg += " with "; + msg += executor->get_name(); + result.error_messages.push_back(std::move(msg)); } } else { result.failed_events++; if (config_.verbose) { DFTRACER_UTILS_LOG_DEBUG( - "No executor found for function: %s (category: %s)", - trace.func_name.c_str(), trace.cat.c_str()); + "No executor found for function: %.*s (category: %.*s)", + static_cast(trace.func_name.size()), + trace.func_name.data(), static_cast(trace.cat.size()), + trace.cat.data()); } } } -// ============================================================================= -// ReplayLineProcessor Implementation -// 
============================================================================= - -ReplayLineProcessor::ReplayLineProcessor(ReplayEngine& engine, - ReplayResult& result) - : engine_(engine), result_(result) {} - -coro::CoroTask ReplayLineProcessor::process(const char* data, - std::size_t length) { - std::string line(data, length); - co_return engine_.process_trace_line(line, result_); -} - // ============================================================================= // ReplayResult::print_summary Implementation // ============================================================================= void ReplayResult::print_summary(bool verbose) const { - std::cout << "\n=== Replay Summary ===" << std::endl; - std::cout << "Total events: " << total_events << std::endl; - std::cout << "Executed: " << executed_events << std::endl; - std::cout << "Filtered: " << filtered_events << std::endl; - std::cout << "Failed: " << failed_events << std::endl; + std::printf("\n=== Replay Summary ===\n"); + std::printf("Total events: %zu\n", total_events); + std::printf("Executed: %zu\n", executed_events); + std::printf("Filtered: %zu\n", filtered_events); + std::printf("Failed: %zu\n", failed_events); double success_rate = total_events > 0 ? 
(static_cast(executed_events) / static_cast(total_events) * 100.0) : 0.0; - std::cout << "Success rate: " << std::fixed << std::setprecision(2) - << success_rate << "%" << std::endl; + std::printf("Success rate: %.2f%%\n", success_rate); - std::cout << "\nTiming:" << std::endl; - std::cout << " Total duration: " - << static_cast(total_duration.count()) / 1000.0 << " ms" - << std::endl; - std::cout << " Execution duration: " - << static_cast(execution_duration.count()) / 1000.0 - << " ms" << std::endl; + std::printf("\nTiming:\n"); + std::printf(" Total duration: %.3f ms\n", + static_cast(total_duration.count()) / 1000.0); + std::printf(" Execution duration: %.3f ms\n", + static_cast(execution_duration.count()) / 1000.0); if (first_timestamp != UINT64_MAX && last_timestamp > 0) { - std::cout << " Trace timespan: " - << static_cast(last_timestamp - first_timestamp) / - 1000000.0 - << " seconds" << std::endl; + std::printf( + " Trace timespan: %.6f seconds\n", + static_cast(last_timestamp - first_timestamp) / 1000000.0); } - std::cout << "\nI/O Statistics:" << std::endl; - std::cout << " Bytes read: " << total_bytes_read << " (" - << static_cast(total_bytes_read) / (1024.0 * 1024.0) - << " MB)" << std::endl; - std::cout << " Bytes written: " << total_bytes_written << " (" - << static_cast(total_bytes_written) / (1024.0 * 1024.0) - << " MB)" << std::endl; + std::printf("\nI/O Statistics:\n"); + std::printf(" Bytes read: %zu (%.2f MB)\n", total_bytes_read, + static_cast(total_bytes_read) / (1024.0 * 1024.0)); + std::printf(" Bytes written: %zu (%.2f MB)\n", total_bytes_written, + static_cast(total_bytes_written) / (1024.0 * 1024.0)); - std::cout << "\nProcess/Thread Statistics:" << std::endl; - std::cout << " Unique PIDs: " << pid_counts.size() << std::endl; - std::cout << " Unique TIDs: " << tid_counts.size() << std::endl; + std::printf("\nProcess/Thread Statistics:\n"); + std::printf(" Unique PIDs: %zu\n", pid_counts.size()); + std::printf(" Unique TIDs: %zu\n", 
tid_counts.size()); if (verbose) { if (!pid_counts.empty()) { - std::cout << "\n Events per PID:" << std::endl; + std::printf("\n Events per PID:\n"); for (const auto& [pid, count] : pid_counts) { - std::cout << " PID " << pid << ": " << count << " events" - << std::endl; + std::printf(" PID %u: %zu events\n", pid, count); } } if (!tid_counts.empty() && tid_counts.size() > 1) { - std::cout << "\n Events per TID:" << std::endl; + std::printf("\n Events per TID:\n"); for (const auto& [tid, count] : tid_counts) { - std::cout << " TID " << tid << ": " << count << " events" - << std::endl; + std::printf(" TID %u: %zu events\n", tid, count); } } if (!function_counts.empty()) { - std::cout << "\n Top functions by count:" << std::endl; - std::vector> sorted_funcs( + std::printf("\n Top functions by count:\n"); + // function_counts keys are string_views into the replay intern + // pool; sorting needs an indexable copy. Keep the views to avoid + // re-allocating strings for the dictionary entries (read, + // write, ...). 
+ std::vector> sorted_funcs( function_counts.begin(), function_counts.end()); std::sort(sorted_funcs.begin(), sorted_funcs.end(), [](const auto& a, const auto& b) { @@ -1166,36 +1120,36 @@ void ReplayResult::print_summary(bool verbose) const { std::size_t max_display = std::min(sorted_funcs.size(), std::size_t(10)); for (std::size_t i = 0; i < max_display; i++) { - std::cout << " " << std::setw(30) << std::left - << sorted_funcs[i].first << ": " - << sorted_funcs[i].second << std::endl; + std::printf(" %-30.*s: %zu\n", + static_cast(sorted_funcs[i].first.size()), + sorted_funcs[i].first.data(), + sorted_funcs[i].second); } } if (!category_counts.empty()) { - std::cout << "\n Events per category:" << std::endl; + std::printf("\n Events per category:\n"); for (const auto& [cat, count] : category_counts) { - std::cout << " " << std::setw(20) << std::left << cat << ": " - << count << std::endl; + std::printf(" %-20.*s: %zu\n", static_cast(cat.size()), + cat.data(), count); } } } if (!error_messages.empty()) { - std::cout << "\n=== Errors (" << error_messages.size() - << " total) ===" << std::endl; + std::printf("\n=== Errors (%zu total) ===\n", error_messages.size()); std::size_t max_errors = std::min(error_messages.size(), std::size_t(10)); for (std::size_t i = 0; i < max_errors; i++) { - std::cout << " " << error_messages[i] << std::endl; + std::printf(" %s\n", error_messages[i].c_str()); } if (error_messages.size() > 10) { - std::cout << " ... and " << (error_messages.size() - 10) - << " more errors" << std::endl; + std::printf(" ... 
and %zu more errors\n", + error_messages.size() - 10); } } - std::cout << "=====================" << std::endl; + std::printf("=====================\n"); } } // namespace dftracer::utils::utilities::replay diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e23f12fb..f6c2ea51 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -57,6 +57,7 @@ set(TEST_CPP_SOURCES # reader/test_reader_tar_comprehensive.cpp # Replay tests replay/test_replay.cpp + replay/test_replay_fidelity.cpp # Server unit tests server/test_http_parser.cpp server/test_http_response.cpp @@ -115,7 +116,8 @@ foreach(test_file ${TEST_CPP_SOURCES}) if(bin_exec STREQUAL "pipeline/test_task_scope" OR bin_exec STREQUAL "coro/test_channel") set(heavy_thread_timeout 180) - if(DFTRACER_UTILS_ENABLE_TSAN OR DFTRACER_UTILS_ENABLE_COVERAGE) + if(DFTRACER_UTILS_ENABLE_TSAN OR DFTRACER_UTILS_ENABLE_COVERAGE + OR DFTRACER_UTILS_ENABLE_ASAN) set(heavy_thread_timeout 600) endif() set_tests_properties(${bin_exec} PROPERTIES @@ -206,8 +208,15 @@ set(TEST_BINARY_SOURCES binaries/test_dftracer_tar.cpp binaries/test_dftracer_replay.cpp binaries/test_dftracer_comparator.cpp + binaries/test_dftracer_call_tree.cpp ) +if(DFTRACER_UTILS_ENABLE_MPI) + list(APPEND TEST_BINARY_SOURCES + binaries/test_dftracer_aggregator_mpi.cpp + binaries/test_dftracer_call_tree_mpi.cpp) +endif() + foreach(test_file ${TEST_BINARY_SOURCES}) string(REPLACE ".cpp" "" bin_exec ${test_file}) string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" bin_exec ${bin_exec}) @@ -244,8 +253,25 @@ foreach(test_file ${TEST_BINARY_SOURCES}) if(DFTRACER_UTILS_ENABLE_TSAN) set(integration_test_timeout 180) endif() + # The MPI aggregator spawns mpiexec several times per test case and + # each child runs under ASan, which is ~10x slower than release. The + # default 120s is not enough. 
+ if(bin_exec STREQUAL "binaries/test_dftracer_aggregator_mpi" OR + bin_exec STREQUAL "binaries/test_dftracer_call_tree_mpi") + set(integration_test_timeout 600) + endif() set_tests_properties(${bin_exec} PROPERTIES TIMEOUT ${integration_test_timeout}) + # dftracer_tar wraps the gzip indexer's parallel pipeline and has shown + # intermittent deadlocks on CI. Tar is a secondary code path (real + # workflows use directories of .pfw.gz). + if(bin_exec STREQUAL "binaries/test_dftracer_tar") + set_tests_properties(${bin_exec} PROPERTIES + TIMEOUT 240 + RUN_SERIAL TRUE + REPEAT "UNTIL_PASS:3") + endif() + # Pass binary paths so tests can find them if(bin_exec STREQUAL "binaries/test_dftracer_server") set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT @@ -291,10 +317,19 @@ foreach(test_file ${TEST_BINARY_SOURCES}) "DFTRACER_TAR_PATH=$") elseif(bin_exec STREQUAL "binaries/test_dftracer_replay") set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT - "DFTRACER_REPLAY_PATH=$") + "DFTRACER_REPLAY_PATH=$;ASAN_OPTIONS=detect_leaks=0;LSAN_OPTIONS=detect_leaks=0") elseif(bin_exec STREQUAL "binaries/test_dftracer_comparator") set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT "DFTRACER_COMPARATOR_PATH=$") + elseif(bin_exec STREQUAL "binaries/test_dftracer_call_tree") + set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT + "DFTRACER_CALL_TREE_PATH=$") + elseif(bin_exec STREQUAL "binaries/test_dftracer_call_tree_mpi") + set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT + "DFTRACER_CALL_TREE_PATH=$;DFTRACER_CALL_TREE_MPI_PATH=$;MPIEXEC_EXECUTABLE=${MPIEXEC_EXECUTABLE};ASAN_OPTIONS=detect_leaks=0;LSAN_OPTIONS=detect_leaks=0") + elseif(bin_exec STREQUAL "binaries/test_dftracer_aggregator_mpi") + set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT + "DFTRACER_AGGREGATOR_PATH=$;DFTRACER_AGGREGATOR_MPI_PATH=$;MPIEXEC_EXECUTABLE=${MPIEXEC_EXECUTABLE};ASAN_OPTIONS=detect_leaks=0;LSAN_OPTIONS=detect_leaks=0") endif() endforeach() diff --git 
a/tests/binaries/test_dftracer_aggregator_mpi.cpp b/tests/binaries/test_dftracer_aggregator_mpi.cpp new file mode 100644 index 00000000..cac9a571 --- /dev/null +++ b/tests/binaries/test_dftracer_aggregator_mpi.cpp @@ -0,0 +1,391 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +// ============================================================================ +// Helpers +// ============================================================================ + +namespace { + +std::string create_pfw_gz(dft_utils_test::TestEnvironment& env, int num_events, + int id) { + auto trace_gz = env.create_dft_test_gzip_file(num_events); + if (trace_gz.empty()) return ""; + + std::string pfw_path = + env.get_dir() + "/trace_" + std::to_string(id) + ".pfw.gz"; + fs::rename(trace_gz, pfw_path); + return pfw_path; +} + +// Prefer env-provided path (set by CMake), fall back to the common build +// output locations. +std::string find_binary(const char* env_name, + const std::vector& candidates) { + const char* env_path = std::getenv(env_name); + if (env_path != nullptr && ::access(env_path, X_OK) == 0) return env_path; + for (const auto& path : candidates) { + if (::access(path.c_str(), X_OK) == 0) return path; + } + return ""; +} + +std::string find_serial_binary() { + return find_binary("DFTRACER_AGGREGATOR_PATH", + { + "./dftracer_aggregator", + "../dftracer_aggregator", + "../../dftracer_aggregator", + "../bin/dftracer_aggregator", + "../../bin/dftracer_aggregator", + }); +} + +std::string find_mpi_binary() { + return find_binary("DFTRACER_AGGREGATOR_MPI_PATH", + { + "./dftracer_aggregator_mpi", + "../dftracer_aggregator_mpi", + "../../dftracer_aggregator_mpi", + "../bin/dftracer_aggregator_mpi", + "../../bin/dftracer_aggregator_mpi", + }); +} + +// Locate an MPI launcher on $PATH. 
We prefer mpiexec to line up with the +// CMake `MPIEXEC_EXECUTABLE` default; fall back to mpirun. +std::string find_mpi_launcher() { + const char* env_path = std::getenv("MPIEXEC_EXECUTABLE"); + if (env_path != nullptr && ::access(env_path, X_OK) == 0) return env_path; + for (const auto& name : {"mpiexec", "mpirun"}) { + std::string cmd = std::string("command -v ") + name + " 2>/dev/null"; + FILE* p = ::popen(cmd.c_str(), "r"); + if (!p) continue; + char buf[4096]; + std::string out; + while (std::fgets(buf, sizeof(buf), p)) out += buf; + ::pclose(p); + while (!out.empty() && (out.back() == '\n' || out.back() == ' ')) + out.pop_back(); + if (!out.empty() && ::access(out.c_str(), X_OK) == 0) return out; + } + return ""; +} + +int run_process(const std::string& binary, + const std::vector& args) { + pid_t pid = ::fork(); + if (pid < 0) return -1; + if (pid == 0) { + std::vector argv; + argv.push_back(binary.c_str()); + for (const auto& arg : args) argv.push_back(arg.c_str()); + argv.push_back(nullptr); + ::execv(binary.c_str(), const_cast(argv.data())); + ::_exit(127); + } + int status = 0; + ::waitpid(pid, &status, 0); + if (WIFEXITED(status)) return WEXITSTATUS(status); + return -1; +} + +int run_mpi(const std::string& launcher, int np, const std::string& binary, + const std::vector& binary_args) { + std::vector args = {"--allow-run-as-root", "-n", + std::to_string(np), binary}; + for (const auto& a : binary_args) args.push_back(a); + return run_process(launcher, args); +} + +// Read a gzip-compressed file fully into memory (as a string). +std::string read_gz_to_string(const std::string& path) { + gzFile gz = gzopen(path.c_str(), "rb"); + if (!gz) return {}; + std::string out; + char buf[1 << 16]; + int n; + while ((n = gzread(gz, buf, sizeof(buf))) > 0) { + out.append(buf, static_cast(n)); + } + gzclose(gz); + return out; +} + +// Sort lines in `content` and return the sorted blob. 
Used to make output +// comparison independent of row ordering, which isn't guaranteed across +// rank counts (parallel scan traverses shard ranges in different orders). +std::string sort_lines(const std::string& content) { + std::vector lines; + std::istringstream ss(content); + std::string line; + while (std::getline(ss, line)) lines.push_back(std::move(line)); + std::sort(lines.begin(), lines.end()); + std::string out; + out.reserve(content.size()); + for (auto& l : lines) { + out += l; + out += '\n'; + } + return out; +} + +// Helper: read output and return sorted lines. Handles three shapes: +// 1. `path` itself ends with ".gz" -> decompress `path` +// 2. `path` does not end in ".gz" but a +// sibling `path.gz` exists -> decompress `path.gz` +// 3. otherwise -> read `path` as plain text +// The aggregator writes gzip when `-o foo.json.gz` is used but plain text +// when `-o foo.json` is used (without `--compress`). The MPI binary +// always gzips its final output. +std::string read_output_sorted(const std::string& path) { + const bool ends_with_gz = + path.size() >= 3 && path.compare(path.size() - 3, 3, ".gz") == 0; + if (ends_with_gz && fs::exists(path)) { + return sort_lines(read_gz_to_string(path)); + } + if (fs::exists(path + ".gz")) { + return sort_lines(read_gz_to_string(path + ".gz")); + } + std::ifstream ifs(path, std::ios::binary); + if (!ifs.is_open()) return {}; + std::ostringstream ss; + ss << ifs.rdbuf(); + return sort_lines(ss.str()); +} + +struct Env { + std::string serial_bin; + std::string mpi_bin; + std::string launcher; + bool ready = false; + std::string skip_reason; + + Env() { + serial_bin = find_serial_binary(); + mpi_bin = find_mpi_binary(); + launcher = find_mpi_launcher(); + if (serial_bin.empty()) { + skip_reason = "dftracer_aggregator binary not found"; + return; + } + if (mpi_bin.empty()) { + skip_reason = + "dftracer_aggregator_mpi binary not found (set " + "DFTRACER_AGGREGATOR_MPI_PATH or build with " + 
"DFTRACER_UTILS_ENABLE_MPI=ON)"; + return; + } + if (launcher.empty()) { + skip_reason = "no mpiexec/mpirun on PATH"; + return; + } + ready = true; + } +}; + +// Byte-copy helper -- input files must be byte-identical between the +// serial and MPI runs or their outputs will diverge (TestEnvironment +// seeds randomness internally and does not promise cross-instance +// reproducibility). +bool copy_file(const std::string& src, const std::string& dst) { + std::ifstream in(src, std::ios::binary); + std::ofstream out(dst, std::ios::binary); + if (!in.is_open() || !out.is_open()) return false; + out << in.rdbuf(); + return out.good(); +} + +// Compare two sorted blobs without blowing up doctest output: CHECK +// gets a bare bool so failure dumps just "false", and we print a +// compact summary (sizes + first-mismatch line) via MESSAGE. +void check_outputs_equal(const std::string& ser, const std::string& mpi) { + const bool equal = ser == mpi; + if (!equal) { + std::size_t diff_pos = 0; + const std::size_t n = std::min(ser.size(), mpi.size()); + while (diff_pos < n && ser[diff_pos] == mpi[diff_pos]) ++diff_pos; + auto snippet = [&](const std::string& s) -> std::string { + // Show up to 120 chars around the first differing byte. + if (s.empty()) return ""; + std::size_t start = diff_pos > 60 ? diff_pos - 60 : 0; + std::size_t len = std::min(120, s.size() - start); + return s.substr(start, len); + }; + MESSAGE("serial bytes=" << ser.size() << " mpi bytes=" << mpi.size() + << " first_diff_offset=" << diff_pos); + MESSAGE("serial near diff: " << snippet(ser)); + MESSAGE("mpi near diff: " << snippet(mpi)); + } + CHECK(equal); +} + +// Drive one parity test: generate fixtures, clone into sibling dirs, +// run serial vs MPI, return both sorted outputs. Empty pair on setup +// failure. 
+std::pair run_and_compare( + const Env& e, int mpi_ranks, int num_events, int num_files, + bool use_shared_staging = false) { + dft_utils_test::TestEnvironment src_env(100); + if (!src_env.is_valid()) return {}; + std::vector src_files; + for (int i = 0; i < num_files; ++i) { + auto f = create_pfw_gz(src_env, num_events, i); + if (f.empty()) return {}; + src_files.push_back(f); + } + + std::string ser_in = src_env.get_dir() + "/_ser_in"; + std::string mpi_in = src_env.get_dir() + "/_mpi_in"; + fs::create_directories(ser_in); + fs::create_directories(mpi_in); + for (const auto& f : src_files) { + std::string name = fs::path(f).filename().string(); + if (!copy_file(f, ser_in + "/" + name)) return {}; + if (!copy_file(f, mpi_in + "/" + name)) return {}; + } + + std::string ser_out = src_env.get_dir() + "/ser.json"; + std::string ser_idx = src_env.get_dir() + "/ser_idx"; + int rser = run_process(e.serial_bin, {"-d", ser_in, "--index-dir", ser_idx, + "-o", ser_out, "--force"}); + if (rser != 0) return {}; + + std::string mpi_out = src_env.get_dir() + "/mpi.json.gz"; + std::string mpi_idx = src_env.get_dir() + "/mpi_idx"; + std::string mpi_stg = src_env.get_dir() + "/mpi_stg"; + std::vector mpi_args = { + "-d", mpi_in, "--index-dir", mpi_idx, "--staging-dir", + mpi_stg, "-o", mpi_out, "--force"}; + if (use_shared_staging) { + // Force a distinct shared dir so Artifacts::move_to actually runs + // (proves aggregation_sst / system_metrics_sst survive the move). 
+ mpi_args.push_back("--shared-staging"); + mpi_args.push_back(src_env.get_dir() + "/mpi_shared_stg"); + } + int rmpi = run_mpi(e.launcher, mpi_ranks, e.mpi_bin, mpi_args); + if (rmpi != 0) return {}; + + return {read_output_sorted(ser_out), read_output_sorted(mpi_out)}; +} + +} // namespace + +// ============================================================================ +// Integration tests +// ============================================================================ + +TEST_SUITE("DFTracerAggregatorMpi") { + TEST_CASE("binary exists") { + Env e; + if (!e.ready) { + MESSAGE("skipping: " << e.skip_reason); + return; + } + CHECK(!e.mpi_bin.empty()); + CHECK(!e.launcher.empty()); + } + + TEST_CASE("basic aggregation (n=1)") { + Env e; + if (!e.ready) { + MESSAGE("skipping: " << e.skip_reason); + return; + } + dft_utils_test::TestEnvironment env(100); + REQUIRE(env.is_valid()); + REQUIRE(!create_pfw_gz(env, 100, 0).empty()); + + std::string out = env.get_dir() + "/mpi_basic.json.gz"; + std::string idx = env.get_dir() + "/mpi_basic_idx"; + std::string stg = env.get_dir() + "/mpi_basic_stg"; + int rc = run_mpi(e.launcher, 1, e.mpi_bin, + {"-d", env.get_dir(), "--index-dir", idx, + "--staging-dir", stg, "-o", out, "--force"}); + CHECK(rc == 0); + CHECK(fs::exists(out)); + } + + // Serial vs MPI (n=1): bit-for-bit identical. No cross-rank splitting + // exercised; this is the canonical correctness guarantee for the + // MPI binary's single-rank mode. + TEST_CASE("serial parity (n=1)") { + Env e; + if (!e.ready) { + MESSAGE("skipping: " << e.skip_reason); + return; + } + auto [ser, mpi] = run_and_compare(e, /*mpi_ranks=*/1, + /*num_events=*/200, + /*num_files=*/1); + REQUIRE(!ser.empty()); + REQUIRE(!mpi.empty()); + check_outputs_equal(ser, mpi); + } + + // Serial vs MPI (n=4): bit-for-bit identical, including cross-rank + // splitting of a multi-member .pfw.gz. The power-sum MetricStats + // representation makes the merge order-independent. 
+ TEST_CASE("serial parity (n=4, cross-rank splitting)") { + Env e; + if (!e.ready) { + MESSAGE("skipping: " << e.skip_reason); + return; + } + auto [ser, mpi] = run_and_compare(e, /*mpi_ranks=*/4, + /*num_events=*/5000, + /*num_files=*/1); + REQUIRE(!ser.empty()); + REQUIRE(!mpi.empty()); + check_outputs_equal(ser, mpi); + } + + // Multi-file parity: events spread across several input files with + // per-file LPT (no cross-rank splitting needed). Serial and MPI + // should still produce byte-identical output. + TEST_CASE("serial parity (n=2, multiple files)") { + Env e; + if (!e.ready) { + MESSAGE("skipping: " << e.skip_reason); + return; + } + auto [ser, mpi] = run_and_compare(e, /*mpi_ranks=*/2, + /*num_events=*/500, + /*num_files=*/3); + REQUIRE(!ser.empty()); + REQUIRE(!mpi.empty()); + check_outputs_equal(ser, mpi); + } + + // Shared-staging parity: forces the node-local -> shared-FS relocation + // path via Artifacts::move_to. Regression guard for the bug where + // aggregation_sst / system_metrics_sst were not listed in move_to and + // silently dropped during the move, leaving only root_process records + // in the final output. 
+ TEST_CASE("serial parity (n=4, shared-staging move path)") { + Env e; + if (!e.ready) { + MESSAGE("skipping: " << e.skip_reason); + return; + } + auto [ser, mpi] = run_and_compare(e, /*mpi_ranks=*/4, + /*num_events=*/5000, + /*num_files=*/1, + /*use_shared_staging=*/true); + REQUIRE(!ser.empty()); + REQUIRE(!mpi.empty()); + check_outputs_equal(ser, mpi); + } +} diff --git a/tests/binaries/test_dftracer_call_tree.cpp b/tests/binaries/test_dftracer_call_tree.cpp new file mode 100644 index 00000000..7562acc9 --- /dev/null +++ b/tests/binaries/test_dftracer_call_tree.cpp @@ -0,0 +1,124 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace { + +std::string find_binary() { + const char* env_path = std::getenv("DFTRACER_CALL_TREE_PATH"); + if (env_path && ::access(env_path, X_OK) == 0) return env_path; + for (const auto& p : + {"./dftracer_call_tree", "../dftracer_call_tree", + "../../dftracer_call_tree", "../bin/dftracer_call_tree", + "../../bin/dftracer_call_tree"}) { + if (::access(p, X_OK) == 0) return p; + } + return ""; +} + +int run_process(const std::string& binary, + const std::vector& args) { + pid_t pid = ::fork(); + if (pid < 0) return -1; + if (pid == 0) { + std::vector argv; + argv.push_back(binary.c_str()); + for (const auto& a : args) argv.push_back(a.c_str()); + argv.push_back(nullptr); + ::execv(binary.c_str(), const_cast(argv.data())); + ::_exit(127); + } + int status = 0; + ::waitpid(pid, &status, 0); + return WIFEXITED(status) ? 
WEXITSTATUS(status) : -1; +} + +std::string create_pfw_gz(dft_utils_test::TestEnvironment& env, int num_events, + int id) { + auto trace_gz = env.create_dft_test_gzip_file(num_events); + if (trace_gz.empty()) return ""; + std::string path = + env.get_dir() + "/trace_" + std::to_string(id) + ".pfw.gz"; + fs::rename(trace_gz, path); + return path; +} + +// Counts non-bracket JSON lines and verifies the array opens with "[" and +// closes with "]". Returns -1 on shape error. +int count_events_basic(const std::string& path) { + std::ifstream f(path); + if (!f.is_open()) return -1; + std::string line; + bool saw_open = false, saw_close = false; + int events = 0; + while (std::getline(f, line)) { + if (line == "[") { + saw_open = true; + continue; + } + if (line == "]") { + saw_close = true; + continue; + } + if (!line.empty()) events++; + } + return (saw_open && saw_close) ? events : -1; +} + +} // namespace + +TEST_SUITE("DFTracerCallTree") { + TEST_CASE("binary exists") { + std::string bin = find_binary(); + if (bin.empty()) { + MESSAGE("skipping: dftracer_call_tree binary not found"); + return; + } + CHECK(!bin.empty()); + } + + TEST_CASE("basic run produces valid JSON") { + std::string bin = find_binary(); + if (bin.empty()) { + MESSAGE("skipping: binary not found"); + return; + } + dft_utils_test::TestEnvironment env(100); + REQUIRE(env.is_valid()); + REQUIRE(!create_pfw_gz(env, 100, 0).empty()); + + std::string out = env.get_dir() + "/ct.pfw"; + int rc = run_process(bin, {env.get_dir(), "-o", out}); + CHECK(rc == 0); + REQUIRE(fs::exists(out)); + CHECK(count_events_basic(out) > 0); + } + + TEST_CASE("multi-file input") { + std::string bin = find_binary(); + if (bin.empty()) { + MESSAGE("skipping: binary not found"); + return; + } + dft_utils_test::TestEnvironment env(100); + REQUIRE(env.is_valid()); + for (int i = 0; i < 3; ++i) { + REQUIRE(!create_pfw_gz(env, 200, i).empty()); + } + + std::string out = env.get_dir() + "/ct.pfw"; + int rc = run_process(bin, 
{env.get_dir(), "-o", out}); + CHECK(rc == 0); + REQUIRE(fs::exists(out)); + CHECK(count_events_basic(out) > 0); + } +} diff --git a/tests/binaries/test_dftracer_call_tree_mpi.cpp b/tests/binaries/test_dftracer_call_tree_mpi.cpp new file mode 100644 index 00000000..99ab7f8d --- /dev/null +++ b/tests/binaries/test_dftracer_call_tree_mpi.cpp @@ -0,0 +1,281 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +std::string find_binary(const char* env_name, + const std::vector& candidates) { + const char* env_path = std::getenv(env_name); + if (env_path && ::access(env_path, X_OK) == 0) return env_path; + for (const auto& p : candidates) { + if (::access(p.c_str(), X_OK) == 0) return p; + } + return ""; +} + +std::string find_serial_binary() { + return find_binary("DFTRACER_CALL_TREE_PATH", + {"./dftracer_call_tree", "../dftracer_call_tree", + "../../dftracer_call_tree", "../bin/dftracer_call_tree", + "../../bin/dftracer_call_tree"}); +} + +std::string find_mpi_binary() { + return find_binary( + "DFTRACER_CALL_TREE_MPI_PATH", + {"./dftracer_call_tree_mpi", "../dftracer_call_tree_mpi", + "../../dftracer_call_tree_mpi", "../bin/dftracer_call_tree_mpi", + "../../bin/dftracer_call_tree_mpi"}); +} + +std::string find_launcher() { + const char* env_path = std::getenv("MPIEXEC_EXECUTABLE"); + if (env_path && ::access(env_path, X_OK) == 0) return env_path; + for (const auto& name : {"mpiexec", "mpirun"}) { + std::string cmd = std::string("command -v ") + name + " 2>/dev/null"; + FILE* p = ::popen(cmd.c_str(), "r"); + if (!p) continue; + char buf[4096]; + std::string out; + while (std::fgets(buf, sizeof(buf), p)) out += buf; + ::pclose(p); + while (!out.empty() && (out.back() == '\n' || out.back() == ' ')) + out.pop_back(); + if (!out.empty() && ::access(out.c_str(), X_OK) == 0) return out; + } + return ""; +} + +int run_process(const 
std::string& binary, + const std::vector& args) { + pid_t pid = ::fork(); + if (pid < 0) return -1; + if (pid == 0) { + std::vector argv; + argv.push_back(binary.c_str()); + for (const auto& a : args) argv.push_back(a.c_str()); + argv.push_back(nullptr); + ::execv(binary.c_str(), const_cast(argv.data())); + ::_exit(127); + } + int status = 0; + ::waitpid(pid, &status, 0); + return WIFEXITED(status) ? WEXITSTATUS(status) : -1; +} + +int run_mpi(const std::string& launcher, int np, const std::string& binary, + const std::vector& binary_args) { + std::vector args = {"--allow-run-as-root", "-n", + std::to_string(np), binary}; + for (const auto& a : binary_args) args.push_back(a); + return run_process(launcher, args); +} + +std::string create_pfw_gz(dft_utils_test::TestEnvironment& env, int num_events, + int id) { + auto trace_gz = env.create_dft_test_gzip_file(num_events); + if (trace_gz.empty()) return ""; + std::string path = + env.get_dir() + "/trace_" + std::to_string(id) + ".pfw.gz"; + fs::rename(trace_gz, path); + return path; +} + +bool copy_file(const std::string& src, const std::string& dst) { + std::ifstream in(src, std::ios::binary); + std::ofstream out(dst, std::ios::binary); + if (!in.is_open() || !out.is_open()) return false; + out << in.rdbuf(); + return out.good(); +} + +// Strip the "id":, prefix from a Chrome Tracing event line. Event id +// differs between serial (sequential) and MPI (rank-base + slice stride) +// runs even when the underlying events are identical, so we compare the +// remaining fields. Non-event lines (header brackets) pass through. 
+std::string strip_event_id(const std::string& line) { + static const std::string prefix = "{\"id\":"; + if (line.compare(0, prefix.size(), prefix) != 0) return line; + std::size_t comma = line.find(',', prefix.size()); + if (comma == std::string::npos) return line; + return std::string("{") + line.substr(comma + 1); +} + +std::vector read_event_lines_sorted(const std::string& path) { + std::vector lines; + std::ifstream f(path); + if (!f.is_open()) return lines; + std::string line; + while (std::getline(f, line)) { + if (line.empty() || line == "[" || line == "]") continue; + // Drop trailing comma so equivalent events match. + if (!line.empty() && line.back() == ',') line.pop_back(); + // Metadata events ({"name":"M",...}) embed wall-clock timestamp and + // change between independent process invocations. Skip them; we + // only compare actual tree events. + static const std::string meta_prefix = "{\"name\":\"M\""; + if (line.compare(0, meta_prefix.size(), meta_prefix) == 0) continue; + lines.push_back(strip_event_id(line)); + } + std::sort(lines.begin(), lines.end()); + return lines; +} + +struct Env { + std::string serial_bin, mpi_bin, launcher; + bool ready = false; + std::string skip_reason; + + Env() { + serial_bin = find_serial_binary(); + mpi_bin = find_mpi_binary(); + launcher = find_launcher(); + if (serial_bin.empty()) { + skip_reason = "dftracer_call_tree binary not found"; + return; + } + if (mpi_bin.empty()) { + skip_reason = "dftracer_call_tree_mpi binary not found"; + return; + } + if (launcher.empty()) { + skip_reason = "no mpiexec/mpirun on PATH"; + return; + } + ready = true; + } +}; + +std::pair, std::vector> run_and_compare( + const Env& e, int mpi_ranks, int num_events, int num_files) { + dft_utils_test::TestEnvironment src(100); + if (!src.is_valid()) return {}; + std::vector srcs; + for (int i = 0; i < num_files; ++i) { + auto p = create_pfw_gz(src, num_events, i); + if (p.empty()) return {}; + srcs.push_back(p); + } + + // Identical input 
dirs for serial and MPI runs. + std::string ser_in = src.get_dir() + "/_ser_in"; + std::string mpi_in = src.get_dir() + "/_mpi_in"; + fs::create_directories(ser_in); + fs::create_directories(mpi_in); + for (const auto& f : srcs) { + auto name = fs::path(f).filename().string(); + if (!copy_file(f, ser_in + "/" + name)) return {}; + if (!copy_file(f, mpi_in + "/" + name)) return {}; + } + + std::string ser_out = src.get_dir() + "/ser.pfw"; + int rs = run_process(e.serial_bin, {ser_in, "-o", ser_out}); + if (rs != 0) return {}; + + std::string mpi_out = src.get_dir() + "/mpi.pfw"; + std::string mpi_stg = src.get_dir() + "/mpi_stg"; + int rm = run_mpi(e.launcher, mpi_ranks, e.mpi_bin, + {mpi_in, "-o", mpi_out, "--staging-dir", mpi_stg}); + if (rm != 0) return {}; + + return {read_event_lines_sorted(ser_out), read_event_lines_sorted(mpi_out)}; +} + +void check_parity(const std::vector& ser, + const std::vector& mpi) { + const bool equal = ser == mpi; + if (!equal) { + MESSAGE("serial events=" << ser.size() << " mpi events=" << mpi.size()); + std::size_t shown = 0; + for (std::size_t i = 0; + i < std::min(ser.size(), mpi.size()) && shown < 3; ++i) { + if (ser[i] != mpi[i]) { + MESSAGE("first diff at " << i); + MESSAGE(" ser: " << ser[i]); + MESSAGE(" mpi: " << mpi[i]); + ++shown; + } + } + } + CHECK(equal); +} + +} // namespace + +TEST_SUITE("DFTracerCallTreeMpi") { + TEST_CASE("binary exists") { + Env e; + if (!e.ready) { + MESSAGE("skipping: " << e.skip_reason); + return; + } + CHECK(!e.mpi_bin.empty()); + CHECK(!e.launcher.empty()); + } + + TEST_CASE("basic MPI run (n=1)") { + Env e; + if (!e.ready) { + MESSAGE("skipping: " << e.skip_reason); + return; + } + dft_utils_test::TestEnvironment env(100); + REQUIRE(env.is_valid()); + REQUIRE(!create_pfw_gz(env, 100, 0).empty()); + std::string out = env.get_dir() + "/mpi.pfw"; + std::string stg = env.get_dir() + "/stg"; + int rc = run_mpi(e.launcher, 1, e.mpi_bin, + {env.get_dir(), "-o", out, "--staging-dir", stg}); + 
CHECK(rc == 0); + CHECK(fs::exists(out)); + } + + TEST_CASE("serial parity (n=1)") { + Env e; + if (!e.ready) { + MESSAGE("skipping: " << e.skip_reason); + return; + } + auto [s, m] = run_and_compare(e, 1, 200, 1); + REQUIRE(!s.empty()); + REQUIRE(!m.empty()); + check_parity(s, m); + } + + TEST_CASE("serial parity (n=2 multi-file)") { + Env e; + if (!e.ready) { + MESSAGE("skipping: " << e.skip_reason); + return; + } + auto [s, m] = run_and_compare(e, 2, 200, 4); + REQUIRE(!s.empty()); + REQUIRE(!m.empty()); + check_parity(s, m); + } + + TEST_CASE("serial parity (n=4 multi-file)") { + Env e; + if (!e.ready) { + MESSAGE("skipping: " << e.skip_reason); + return; + } + auto [s, m] = run_and_compare(e, 4, 300, 8); + REQUIRE(!s.empty()); + REQUIRE(!m.empty()); + check_parity(s, m); + } +} diff --git a/tests/binaries/test_dftracer_comparator.cpp b/tests/binaries/test_dftracer_comparator.cpp index 0f599fc0..57a9c2b3 100644 --- a/tests/binaries/test_dftracer_comparator.cpp +++ b/tests/binaries/test_dftracer_comparator.cpp @@ -1,10 +1,10 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include +#include #include #include #include -#include #include #include @@ -264,72 +264,75 @@ TEST_SUITE("DFTracerComparator") { REQUIRE(!content.empty()); // Parse JSON - yyjson_doc* doc = yyjson_read(content.c_str(), content.size(), 0); - REQUIRE(doc != nullptr); - yyjson_val* root = yyjson_doc_get_root(doc); - REQUIRE(root != nullptr); - REQUIRE(yyjson_is_obj(root)); + simdjson::dom::parser parser; + auto result = parser.parse(content); + REQUIRE(!result.error()); + auto root = result.value_unsafe(); + REQUIRE(root.is_object()); // Top-level fields - CHECK(yyjson_is_str(yyjson_obj_get(root, "baseline"))); - CHECK(yyjson_is_str(yyjson_obj_get(root, "variant"))); - CHECK(yyjson_is_obj(yyjson_obj_get(root, "baseline_meta"))); - CHECK(yyjson_is_obj(yyjson_obj_get(root, "variant_meta"))); - CHECK(yyjson_is_num(yyjson_obj_get(root, "execution_time_ms"))); + 
CHECK(root["baseline"].is_string()); + CHECK(root["variant"].is_string()); + CHECK(root["baseline_meta"].is_object()); + CHECK(root["variant_meta"].is_object()); + CHECK(root["execution_time_ms"].is_number()); // Nodes array - yyjson_val* nodes = yyjson_obj_get(root, "nodes"); - REQUIRE(yyjson_is_arr(nodes)); - REQUIRE(yyjson_arr_size(nodes) > 0); + auto nodes = root["nodes"]; + REQUIRE(!nodes.error()); + REQUIRE(nodes.is_array()); + auto nodes_arr = nodes.get_array().value_unsafe(); + REQUIRE(nodes_arr.size() > 0); // First node structure - yyjson_val* node0 = yyjson_arr_get_first(nodes); - REQUIRE(yyjson_is_obj(node0)); - CHECK(yyjson_is_str(yyjson_obj_get(node0, "name"))); - CHECK(yyjson_is_str(yyjson_obj_get(node0, "query"))); + auto node0 = nodes_arr.at(0); + REQUIRE(node0.is_object()); + CHECK(node0["name"].is_string()); + CHECK(node0["query"].is_string()); // Summary - yyjson_val* summary = yyjson_obj_get(node0, "summary"); - REQUIRE(yyjson_is_obj(summary)); - yyjson_val* sum_metrics = yyjson_obj_get(summary, "metrics"); - REQUIRE(yyjson_is_arr(sum_metrics)); - REQUIRE(yyjson_arr_size(sum_metrics) > 0); + auto summary = node0["summary"]; + REQUIRE(!summary.error()); + REQUIRE(summary.is_object()); + auto sum_metrics = summary["metrics"]; + REQUIRE(!sum_metrics.error()); + REQUIRE(sum_metrics.is_array()); + auto sum_metrics_arr = sum_metrics.get_array().value_unsafe(); + REQUIRE(sum_metrics_arr.size() > 0); // First metric structure - yyjson_val* metric0 = yyjson_arr_get_first(sum_metrics); - REQUIRE(yyjson_is_obj(metric0)); - CHECK(yyjson_is_str(yyjson_obj_get(metric0, "name"))); - CHECK(yyjson_is_num(yyjson_obj_get(metric0, "baseline"))); - CHECK(yyjson_is_num(yyjson_obj_get(metric0, "variant"))); - CHECK(yyjson_is_num(yyjson_obj_get(metric0, "delta"))); - CHECK(yyjson_is_num(yyjson_obj_get(metric0, "pct_change"))); - CHECK(yyjson_is_num(yyjson_obj_get(metric0, "cohens_d"))); - CHECK(yyjson_is_str(yyjson_obj_get(metric0, "significance"))); - 
CHECK(yyjson_is_bool(yyjson_obj_get(metric0, "is_regression"))); + auto metric0 = sum_metrics_arr.at(0); + REQUIRE(metric0.is_object()); + CHECK(metric0["name"].is_string()); + CHECK(metric0["baseline"].is_number()); + CHECK(metric0["variant"].is_number()); + CHECK(metric0["delta"].is_number()); + CHECK(metric0["pct_change"].is_number()); + CHECK(metric0["cohens_d"].is_number()); + CHECK(metric0["significance"].is_string()); + CHECK(metric0["is_regression"].is_bool()); // Groups array exists - yyjson_val* groups = yyjson_obj_get(node0, "groups"); - CHECK(yyjson_is_arr(groups)); + CHECK(node0["groups"].is_array()); // Children array exists - yyjson_val* children = yyjson_obj_get(node0, "children"); - CHECK(yyjson_is_arr(children)); + CHECK(node0["children"].is_array()); // Metadata objects - yyjson_val* base_meta = yyjson_obj_get(root, "baseline_meta"); - REQUIRE(yyjson_is_obj(base_meta)); - CHECK(yyjson_is_int(yyjson_obj_get(base_meta, "files"))); - CHECK(yyjson_is_int(yyjson_obj_get(base_meta, "processes"))); - CHECK(yyjson_is_int(yyjson_obj_get(base_meta, "threads"))); - CHECK(yyjson_is_num(yyjson_obj_get(base_meta, "total_bytes"))); - CHECK(yyjson_is_num(yyjson_obj_get(base_meta, "total_io_time_us"))); - CHECK(yyjson_is_num(yyjson_obj_get(base_meta, "makespan_us"))); - - yyjson_val* var_meta = yyjson_obj_get(root, "variant_meta"); - REQUIRE(yyjson_is_obj(var_meta)); - CHECK(yyjson_is_int(yyjson_obj_get(var_meta, "files"))); - - yyjson_doc_free(doc); + auto base_meta = root["baseline_meta"]; + REQUIRE(!base_meta.error()); + REQUIRE(base_meta.is_object()); + CHECK(base_meta["files"].is_number()); + CHECK(base_meta["processes"].is_number()); + CHECK(base_meta["threads"].is_number()); + CHECK(base_meta["total_bytes"].is_number()); + CHECK(base_meta["total_io_time_us"].is_number()); + CHECK(base_meta["makespan_us"].is_number()); + + auto var_meta = root["variant_meta"]; + REQUIRE(!var_meta.error()); + REQUIRE(var_meta.is_object()); + 
CHECK(var_meta["files"].is_number()); } TEST_CASE("json output - same file deltas are zero") { @@ -355,24 +358,21 @@ TEST_SUITE("DFTracerComparator") { auto content = read_file(output); REQUIRE(!content.empty()); - yyjson_doc* doc = yyjson_read(content.c_str(), content.size(), 0); - REQUIRE(doc != nullptr); - yyjson_val* root = yyjson_doc_get_root(doc); - yyjson_val* nodes = yyjson_obj_get(root, "nodes"); - yyjson_val* node0 = yyjson_arr_get_first(nodes); - yyjson_val* summary = yyjson_obj_get(node0, "summary"); - yyjson_val* metrics = yyjson_obj_get(summary, "metrics"); + simdjson::dom::parser parser; + auto result = parser.parse(content); + REQUIRE(!result.error()); + auto root = result.value_unsafe(); + auto nodes_arr = root["nodes"].get_array().value_unsafe(); + auto node0 = nodes_arr.at(0); + auto metrics_arr = + node0["summary"]["metrics"].get_array().value_unsafe(); // All deltas should be ~0 when comparing same file - std::size_t idx, max; - yyjson_val* m; - yyjson_arr_foreach(metrics, idx, max, m) { - double baseline = yyjson_get_real(yyjson_obj_get(m, "baseline")); - double variant = yyjson_get_real(yyjson_obj_get(m, "variant")); + for (auto m : metrics_arr) { + double baseline = m["baseline"].get_double().value(); + double variant = m["variant"].get_double().value(); CHECK(baseline == doctest::Approx(variant).epsilon(0.01)); } - - yyjson_doc_free(doc); } TEST_CASE("custom time interval") { diff --git a/tests/binaries/test_dftracer_gen_fake_trace.cpp b/tests/binaries/test_dftracer_gen_fake_trace.cpp index b8aaa429..6eab51c0 100644 --- a/tests/binaries/test_dftracer_gen_fake_trace.cpp +++ b/tests/binaries/test_dftracer_gen_fake_trace.cpp @@ -151,10 +151,10 @@ TEST_SUITE("DFTracerGenFakeTrace") { std::string rank0 = out_dir + "/rank_0.pfw.gz"; REQUIRE(fs::exists(rank0)); - // DFTracer events are JSON objects; first line starts with '{'. + // First line is the opening JSON array bracket. 
auto first = gz_first_line(rank0); REQUIRE(!first.empty()); - CHECK(first.front() == '{'); + CHECK(first.front() == '['); } TEST_CASE("deterministic output with fixed seed") { diff --git a/tests/binaries/test_dftracer_organize.cpp b/tests/binaries/test_dftracer_organize.cpp index e45afdf6..642475b2 100644 --- a/tests/binaries/test_dftracer_organize.cpp +++ b/tests/binaries/test_dftracer_organize.cpp @@ -289,9 +289,10 @@ TEST_SUITE("DFTracerOrganize") { fs::create_directories(org_dir); fs::create_directories(rec_dir); - int rc_org = - run_binary(org_binary, {"-d", env.get_dir(), "-o", org_dir, - "--groups", R"(io:cat == "POSIX")"}); + // Route all events (POSIX and STDIO) to properly test round-trip + int rc_org = run_binary(org_binary, + {"-d", env.get_dir(), "-o", org_dir, "--groups", + R"(io:cat == "POSIX" || cat == "STDIO")"}); REQUIRE(rc_org == 0); int rc_rec = run_binary( diff --git a/tests/binaries/test_dftracer_server.cpp b/tests/binaries/test_dftracer_server.cpp index 292f4097..fe17c9dd 100644 --- a/tests/binaries/test_dftracer_server.cpp +++ b/tests/binaries/test_dftracer_server.cpp @@ -112,7 +112,7 @@ bool wait_for_port(int port, int timeout_s = 10) { /// Send a raw HTTP request and receive the response. 
std::string http_request(int port, const std::string& request, - int recv_timeout_s = 2) { + int recv_timeout_s = 15) { int sock = ::socket(AF_INET, SOCK_STREAM, 0); if (sock < 0) return ""; diff --git a/tests/pipeline/test_coro_scope.cpp b/tests/pipeline/test_coro_scope.cpp index 9f1ae876..f95f5e53 100644 --- a/tests/pipeline/test_coro_scope.cpp +++ b/tests/pipeline/test_coro_scope.cpp @@ -123,7 +123,7 @@ TEST_CASE("CoroScope - Producer-consumer with channel (shared_ptr)") { auto channel = coro::make_channel(16); co_await ctx.coro_scope( - [&sum, channel](CoroScope& scope) -> coro::CoroTask { + [&sum, &channel](CoroScope& scope) -> coro::CoroTask { scope.spawn_producer( channel, [](CoroScope&) -> coro::Generator { for (int i = 1; i <= 10; ++i) { @@ -164,8 +164,8 @@ TEST_CASE("CoroScope - Transform pipeline") { auto output = coro::make_channel(16); co_await ctx.coro_scope( - [&sum, input, - output](CoroScope& scope) -> coro::CoroTask { + [&sum, &input, + &output](CoroScope& scope) -> coro::CoroTask { // Producer: 1..5 scope.spawn_producer( input, [](CoroScope&) -> coro::Generator { @@ -242,12 +242,12 @@ TEST_CASE("CoroScope - spawn_producers (N producers, shared_ptr)") { auto channel = coro::make_channel(32); co_await ctx.coro_scope( - [&sum, channel](CoroScope& scope) -> coro::CoroTask { + [&sum, &channel](CoroScope& scope) -> coro::CoroTask { // 3 producers, each sends its index scope.spawn_producers( channel, 3, - [channel](CoroScope&, - std::size_t idx) -> coro::CoroTask { + [&channel](CoroScope&, + std::size_t idx) -> coro::CoroTask { int val = static_cast(idx + 1); co_await channel->send(val); co_return; diff --git a/tests/python/common.py b/tests/python/common.py index 82f4aa15..8f1dad55 100644 --- a/tests/python/common.py +++ b/tests/python/common.py @@ -11,7 +11,7 @@ import pytest -import dftracer.utils as dft_utils +from dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer def determine_index_path(file_path: str, index_dir: str = 
"") -> str: @@ -208,6 +208,106 @@ def create_test_gzip_file_with_nested_json(self): self.test_files.append(file_path) return file_path + def create_varying_schema_file(self, filename="varying_schema.pfw.gz", num_events=500): + """Create events with varying schemas to test elastic Arrow schema. + + Some events have extra fields (offset, whence, size) that others don't. + This tests that the Arrow writer handles schema evolution correctly. + """ + file_path = os.path.join(self.temp_dir, filename) + os.makedirs(os.path.dirname(file_path), exist_ok=True) + + with gzip.open(file_path, "wt", encoding="utf-8") as f: + f.write("[\n") + for i in range(num_events): + name = ["read", "write", "open", "close", "stat"][i % 5] + cat = "POSIX" + + event = { + "name": name, + "cat": cat, + "pid": 1000 + i % 4, + "tid": 2000 + i % 8, + "ts": 1000000 + i * 1000, + "dur": (i * 123) % 10000, + "ph": "X", + "args": {"ret": 1024 * i, "hhash": f"hash_{i}"}, + } + + if name == "read" or name == "write": + event["args"]["offset"] = i * 4096 + event["args"]["size"] = 4096 + + if name == "open": + event["args"]["flags"] = "O_RDONLY" + event["args"]["mode"] = 0o644 + + if name == "stat": + event["args"]["path"] = f"/tmp/file_{i}.txt" + + if i % 7 == 0: + event["args"]["extra_field"] = f"extra_{i}" + + if i % 11 == 0: + event["args"]["rare_field"] = i * 1000 + + import json + + f.write(json.dumps(event, separators=(",", ":")) + "\n") + f.write("]\n") + + self.test_files.append(file_path) + return file_path + + def create_dft_trace_file_with_pid(self, filename, pid, num_events=None): + """Create a DFTracer trace with a specific PID, hash metadata, and proper aggregation fields.""" + file_path = os.path.join(self.temp_dir, filename) + os.makedirs(os.path.dirname(file_path), exist_ok=True) + n = num_events if num_events is not None else self.lines + io_names = ["read", "write", "open", "close", "pread", "pwrite", "fread", "fwrite"] + cats = ["POSIX", "POSIX", "POSIX", "POSIX", "POSIX", "POSIX", 
"STDIO", "STDIO"] + hhash = f"h{pid}" + fhash = f"f{pid}" + with gzip.open(file_path, "wt", encoding="utf-8") as f: + f.write( + f'{{"name":"HH","ph":"M","pid":{pid},"tid":1,"args":{{"name":"host{pid}","value":"{hhash}"}}}}\n' + ) + f.write( + f'{{"name":"FH","ph":"M","pid":{pid},"tid":1,"args":{{"name":"/data/file{pid}.dat","value":"{fhash}"}}}}\n' + ) + for i in range(n): + name = io_names[i % len(io_names)] + cat = cats[i % len(cats)] + f.write( + f'{{"name":"{name}","cat":"{cat}","pid":{pid},"tid":{1 + i % 3},' + f'"ts":{1000000 + i * 1000},"dur":{100 + i * 10},' + f'"ph":"X","args":{{"ret":{1024 * (i + 1)},"hhash":"{hhash}","fhash":"{fhash}"}}}}\n' + ) + self.test_files.append(file_path) + return file_path + + def create_indexed_traces(self, pids=None, num_events=None): + """Create trace files and build full index with aggregation. + + Returns the temp directory path (use as directory= for Indexer). + """ + from dftracer.utils import AggregationConfig, Indexer + + if pids is None: + pids = [1] + files = [] + for pid in pids: + files.append( + self.create_dft_trace_file_with_pid(f"trace_p{pid}.pfw.gz", pid, num_events) + ) + indexer = Indexer( + files=files, + require_aggregation=AggregationConfig(time_interval_ms=5000), + force_rebuild=True, + ) + indexer.ensure_indexed() + return self.temp_dir + def get_index_path(self, gz_file_path): """Get the `.dftindex` path for a gzip file.""" return determine_index_path(gz_file_path, "") @@ -220,7 +320,7 @@ def build_index(self, gz_file_path, checkpoint_size_bytes=None): index_path = self.get_index_path(gz_file_path) try: - with dft_utils.Indexer(gz_file_path, index_path, checkpoint_size_bytes) as indexer: + with NativeIndexer(gz_file_path, index_path, checkpoint_size_bytes) as indexer: if indexer.need_rebuild(): indexer.build() @@ -236,7 +336,7 @@ def create_indexer(self, gz_file_path, checkpoint_size_bytes=None): checkpoint_size_bytes = 32 * 1024 * 1024 # 32MB default try: - indexer = dft_utils.Indexer(gz_file_path, 
checkpoint_size=checkpoint_size_bytes) + indexer = NativeIndexer(gz_file_path, checkpoint_size=checkpoint_size_bytes) if indexer.need_rebuild(): indexer.build() return indexer diff --git a/tests/python/test_aggregator.py b/tests/python/test_aggregator.py index 1bcd9c4b..3e3a957e 100644 --- a/tests/python/test_aggregator.py +++ b/tests/python/test_aggregator.py @@ -1,5 +1,6 @@ """Tests for AggregatorUtility.""" +import gzip from pathlib import Path from typing import Dict, Tuple @@ -38,8 +39,8 @@ class TestAggregatorUtility: @staticmethod def _write_mixed_counter_trace(env: Environment) -> str: - path = Path(env.temp_dir) / "mixed_trace.pfw" - path.write_text( + path = Path(env.temp_dir) / "mixed_trace.pfw.gz" + content = ( "\n".join( [ '{"name":"read","cat":"POSIX","pid":7,"tid":3,"ts":1000,"dur":50,"ph":"X","args":{"ret":64,"bytes":64,"hhash":"event_h","fhash":"event_f"}}', @@ -47,9 +48,10 @@ def _write_mixed_counter_trace(env: Environment) -> str: '{"name":"mem_bw","cat":"sys","pid":7,"tid":3,"ts":2500,"dur":0,"ph":"C","args":{"count":2,"dur_sum":40,"dur_min":15,"dur_max":25,"ret_sum":600,"ret_min":250,"ret_max":350,"bytes_sum":1200,"bytes_min":500,"bytes_max":700,"hhash":"system_h","fhash":"system_f"}}', ] ) - + "\n", - encoding="utf-8", + + "\n" ) + with gzip.open(path, "wt", encoding="utf-8") as f: + f.write(content) env.test_files.append(str(path)) return str(path) @@ -198,17 +200,18 @@ def test_iter_arrow_emits_separate_event_profile_and_system_batches(self): def test_process_unions_group_keys_and_custom_metric_columns(self): with Environment(lines=0) as env: - path = Path(env.temp_dir) / "mixed_schema_trace.pfw" - path.write_text( + path = Path(env.temp_dir) / "mixed_schema_trace.pfw.gz" + content = ( "\n".join( [ '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":1000,"dur":10,"ph":"X","args":{"ret":4,"epoch":"1","bytes":4,"hhash":"h1"}}', 
'{"name":"write","cat":"POSIX","pid":1,"tid":1,"ts":2000,"dur":20,"ph":"X","args":{"ret":8,"step":"2","ops":3,"hhash":"h2"}}', ] ) - + "\n", - encoding="utf-8", + + "\n" ) + with gzip.open(path, "wt", encoding="utf-8") as f: + f.write(content) env.test_files.append(str(path)) table = AggregatorUtility().process( @@ -235,3 +238,279 @@ def test_process_unions_group_keys_and_custom_metric_columns(self): assert by_name["write"]["step"] == "2" assert by_name["write"]["bytes_total"] is None assert by_name["write"]["ops_total"] == 3 + + def test_time_interval_expansion_adds_ci_columns(self): + """When querying with smaller interval than stored, CI columns appear.""" + with Environment(lines=0) as env: + # Create trace with events spread across time + path = Path(env.temp_dir) / "time_interval_trace.pfw.gz" + content = ( + "\n".join( + [ + '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":1000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}', + '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":2000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}', + '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":3000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}', + '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":4000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}', + ] + ) + + "\n" + ) + with gzip.open(path, "wt", encoding="utf-8") as f: + f.write(content) + env.test_files.append(str(path)) + + # First, build index with large time interval (5000ms) + _ = AggregatorUtility().process( + env.temp_dir, + index_dir=env.temp_dir, + force_rebuild=True, + time_interval_ms=5000.0, + ) + + # Query with smaller interval (1000ms) - should trigger expansion + result = AggregatorUtility().process( + env.temp_dir, + index_dir=env.temp_dir, + force_rebuild=False, + time_interval_ms=1000.0, + ) + + pa = pytest.importorskip("pyarrow") + batches = [pa.record_batch(batch) for batch in result.batches()] + assert len(batches) > 0 + + # CI 
columns should be present when expansion happened + schema_names = set(batches[0].schema.names) + assert "count_ci_lower" in schema_names, ( + f"Missing count_ci_lower. Schema: {schema_names}" + ) + assert "count_ci_upper" in schema_names, ( + f"Missing count_ci_upper. Schema: {schema_names}" + ) + + # Verify CI values are sensible (upper >= count >= lower) + rows = pa.Table.from_batches(batches).to_pylist() + for row in rows: + assert row["count_ci_lower"] <= row["count"] + assert row["count_ci_upper"] >= row["count"] + + def test_time_interval_shrink_no_ci_columns(self): + """When querying with larger interval than stored, no CI columns (lossless).""" + with Environment(lines=0) as env: + path = Path(env.temp_dir) / "shrink_trace.pfw.gz" + content = ( + "\n".join( + [ + '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":1000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}', + '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":6000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}', + ] + ) + + "\n" + ) + with gzip.open(path, "wt", encoding="utf-8") as f: + f.write(content) + env.test_files.append(str(path)) + + # Build with small interval (1000ms) + _ = AggregatorUtility().process( + env.temp_dir, + index_dir=env.temp_dir, + force_rebuild=True, + time_interval_ms=1000.0, + ) + + # Query with larger interval (5000ms) - shrinking is lossless + result = AggregatorUtility().process( + env.temp_dir, + index_dir=env.temp_dir, + force_rebuild=False, + time_interval_ms=5000.0, + ) + + pa = pytest.importorskip("pyarrow") + batches = [pa.record_batch(batch) for batch in result.batches()] + assert len(batches) > 0 + + # CI columns should NOT be present for shrinking (lossless) + schema_names = set(batches[0].schema.names) + assert "count_ci_lower" not in schema_names + assert "count_ci_upper" not in schema_names + + def test_process_with_query_filter(self): + """Query parameter filters aggregation results.""" + with Environment(lines=0) as env: + 
path = Path(env.temp_dir) / "query_trace.pfw.gz" + content = ( + "\n".join( + [ + '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":1000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}', + '{"name":"compute","cat":"APP","pid":1,"tid":1,"ts":2000,"dur":20,"ph":"X","args":{"ret":8,"hhash":"h2","fhash":"f2"}}', + '{"name":"write","cat":"POSIX","pid":1,"tid":1,"ts":3000,"dur":15,"ph":"X","args":{"ret":6,"hhash":"h3","fhash":"f3"}}', + ] + ) + + "\n" + ) + with gzip.open(path, "wt", encoding="utf-8") as f: + f.write(content) + env.test_files.append(str(path)) + + # Query for POSIX only + result = AggregatorUtility().process( + env.temp_dir, + index_dir=env.temp_dir, + force_rebuild=True, + query='cat == "POSIX"', + ) + + pa = pytest.importorskip("pyarrow") + batches = [pa.record_batch(batch) for batch in result.batches()] + rows = pa.Table.from_batches(batches).to_pylist() + + # Should only have POSIX entries + assert len(rows) == 2 + assert all(row["cat"] == "POSIX" for row in rows) + + def test_iter_arrow_with_query_filter(self): + """Query parameter filters streaming results.""" + with Environment(lines=0) as env: + path = Path(env.temp_dir) / "iter_query_trace.pfw.gz" + content = ( + "\n".join( + [ + '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":1000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}', + '{"name":"compute","cat":"APP","pid":1,"tid":1,"ts":2000,"dur":20,"ph":"X","args":{"ret":8,"hhash":"h2","fhash":"f2"}}', + ] + ) + + "\n" + ) + with gzip.open(path, "wt", encoding="utf-8") as f: + f.write(content) + env.test_files.append(str(path)) + + util = AggregatorUtility() + batches = list( + util.iter_arrow( + env.temp_dir, + index_dir=env.temp_dir, + force_rebuild=True, + query='cat == "APP"', + ) + ) + + pa = pytest.importorskip("pyarrow") + pa_batches = [pa.record_batch(b) for b in batches] + rows = pa.Table.from_batches(pa_batches).to_pylist() + + # Should only have APP entries + assert len(rows) == 1 + assert rows[0]["cat"] 
== "APP" + + def test_write_arrow_creates_files(self): + """write_arrow creates Arrow IPC files.""" + with Environment(lines=20) as env: + env.create_test_gzip_file() + output_dir = Path(env.temp_dir) / "arrow_output" + + result = AggregatorUtility().write_arrow( + env.temp_dir, + str(output_dir), + index_dir=env.temp_dir, + force_rebuild=True, + ) + + assert "views" in result + assert "all" in result["views"] + assert result["views"]["all"]["rows"] > 0 + assert len(result["views"]["all"]["files"]) > 0 + + # Verify files are readable + ipc = pytest.importorskip("pyarrow.ipc") + first_file = result["views"]["all"]["files"][0] + with ipc.open_file(first_file) as f: + assert f.num_record_batches > 0 + + def test_write_arrow_with_views(self): + """write_arrow with views creates filtered outputs.""" + with Environment(lines=0) as env: + path = Path(env.temp_dir) / "views_trace.pfw.gz" + content = ( + "\n".join( + [ + '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":1000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}', + '{"name":"compute","cat":"APP","pid":1,"tid":1,"ts":2000,"dur":20,"ph":"X","args":{"ret":8,"hhash":"h2","fhash":"f2"}}', + '{"name":"write","cat":"POSIX","pid":1,"tid":1,"ts":3000,"dur":15,"ph":"X","args":{"ret":6,"hhash":"h3","fhash":"f3"}}', + ] + ) + + "\n" + ) + with gzip.open(path, "wt", encoding="utf-8") as f: + f.write(content) + env.test_files.append(str(path)) + + output_dir = Path(env.temp_dir) / "views_output" + + result = AggregatorUtility().write_arrow( + env.temp_dir, + str(output_dir), + index_dir=env.temp_dir, + force_rebuild=True, + views=[ + {"name": "io", "query": 'cat == "POSIX"'}, + {"name": "compute", "query": 'cat == "APP"'}, + ], + ) + + assert "io" in result["views"] + assert "compute" in result["views"] + assert result["views"]["io"]["rows"] == 2 + assert result["views"]["compute"]["rows"] == 1 + + # Verify view directories exist + assert (output_dir / "io").exists() + assert (output_dir / "compute").exists() + 
+ # Verify content is filtered correctly + pa = pytest.importorskip("pyarrow") + ipc = pytest.importorskip("pyarrow.ipc") + + io_files = list((output_dir / "io").glob("*.arrow")) + assert len(io_files) > 0 + with ipc.open_file(str(io_files[0])) as f: + table = pa.Table.from_batches([f.get_batch(i) for i in range(f.num_record_batches)]) + assert all(row["cat"] == "POSIX" for row in table.to_pylist()) + + def test_write_arrow_compression(self): + """write_arrow respects compression setting.""" + with Environment(lines=20) as env: + env.create_test_gzip_file() + + # Write with no compression + output_none = Path(env.temp_dir) / "arrow_none" + result_none = AggregatorUtility().write_arrow( + env.temp_dir, + str(output_none), + index_dir=env.temp_dir, + force_rebuild=True, + compression="none", + ) + + # Write with zstd compression + output_zstd = Path(env.temp_dir) / "arrow_zstd" + result_zstd = AggregatorUtility().write_arrow( + env.temp_dir, + str(output_zstd), + index_dir=env.temp_dir, + force_rebuild=False, + compression="zstd", + ) + + # Both should have same row count + assert result_none["total_rows"] == result_zstd["total_rows"] + + # Compressed should be smaller (or at least both readable) + ipc = pytest.importorskip("pyarrow.ipc") + + for result in [result_none, result_zstd]: + first_file = result["views"]["all"]["files"][0] + with ipc.open_file(first_file) as f: + assert f.num_record_batches > 0 diff --git a/tests/python/test_dask.py b/tests/python/test_dask.py index 4ebd6a9f..9d3f0108 100644 --- a/tests/python/test_dask.py +++ b/tests/python/test_dask.py @@ -18,6 +18,7 @@ DASK_AVAILABLE = False import dftracer.utils as dft_utils +from dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer from .common import Environment @@ -44,7 +45,7 @@ def test_parallel_indexer_creation(self): def create_and_build_indexer(gz_file): """Helper function to create and build an indexer""" try: - with dft_utils.Indexer(gz_file, checkpoint_size=256 * 1024) 
as indexer: + with NativeIndexer(gz_file, checkpoint_size=256 * 1024) as indexer: if indexer.need_rebuild(): indexer.build() return { @@ -107,7 +108,7 @@ def read_chunk(gz_file_path, start_bytes, end_bytes, reader_type): return {"type": reader_type, "error": str(e), "success": False} # Get file info from a temporary indexer - with dft_utils.Indexer(gz_file, checkpoint_size=512 * 1024) as temp_indexer: + with NativeIndexer(gz_file, checkpoint_size=512 * 1024) as temp_indexer: max_bytes = temp_indexer.get_max_bytes() chunk_size = max_bytes // 4 @@ -184,7 +185,7 @@ def extract_json_data(gz_file_path, start_bytes, end_bytes): return [] # Get file info and create chunks - with dft_utils.Indexer(gz_file, checkpoint_size=512 * 1024) as temp_indexer: + with NativeIndexer(gz_file, checkpoint_size=512 * 1024) as temp_indexer: max_bytes = temp_indexer.get_max_bytes() chunk_size = max_bytes // 4 @@ -229,7 +230,7 @@ def test_multiple_batch_sizes_no_duplication(self): gz_file = env.create_test_gzip_file(bytes_per_line=512) env.build_index(gz_file, checkpoint_size_bytes=256 * 1024) - with dft_utils.Indexer(gz_file, checkpoint_size=256 * 1024) as temp_indexer: + with NativeIndexer(gz_file, checkpoint_size=256 * 1024) as temp_indexer: max_bytes = temp_indexer.get_max_bytes() # Test various batch sizes including boundary-critical ones @@ -366,7 +367,7 @@ def test_boundary_edge_cases(self): gz_file = env.create_test_gzip_file(bytes_per_line=512) env.build_index(gz_file, checkpoint_size_bytes=256 * 1024) - with dft_utils.Indexer(gz_file, checkpoint_size=256 * 1024) as temp_indexer: + with NativeIndexer(gz_file, checkpoint_size=256 * 1024) as temp_indexer: max_bytes = temp_indexer.get_max_bytes() def process_batch(batch_info): @@ -446,5 +447,82 @@ def process_batch(batch_info): print("Boundary edge case test passed: Complete data recovery, no duplicates") +@pytest.mark.skipif(not DASK_AVAILABLE, reason="Dask not available") +class TestDirectoryIndexerWithDask: + """Tests for the 
directory-level Indexer API with Dask.""" + + def test_directory_indexer_indexes_all_files(self): + """Test that directory-level Indexer indexes all files in a directory.""" + with Environment(lines=100) as env: + # Create multiple test files in the same directory + gz_files = [] + for i in range(3): + gz_file = env.create_test_gzip_file(f"test_{i}.pfw.gz", bytes_per_line=256) + gz_files.append(gz_file) + + # Use directory-level Indexer + indexer = dft_utils.Indexer(env.temp_dir) + + # Check status before build + before = indexer.resolve() + assert before.total_files == 3 + assert len(before.needs_work) == 3 + assert len(before.ready) == 0 + + # Build indexes + indexer.build() + + # Check status after build + after = indexer.resolve() + assert after.total_files == 3 + assert len(after.ready) == 3 + assert len(after.needs_work) == 0 + + def test_directory_indexer_with_dask_parallel_reading(self): + """Test directory-level Indexer followed by parallel reading with Dask.""" + with Environment(lines=500) as env: + # Create test files + gz_files = [] + for i in range(3): + gz_file = env.create_test_gzip_file(f"test_{i}.pfw.gz", bytes_per_line=512) + gz_files.append(gz_file) + + # Use directory-level Indexer to build all indexes at once + indexer = dft_utils.Indexer(env.temp_dir) + indexer.ensure_indexed() + + # Verify all files are indexed + status = indexer.resolve() + assert len(status.ready) == 3 + + # Now use Dask for parallel reading + def read_file_lines(gz_file): + with dft_utils.TraceReader(gz_file) as reader: + return len(reader.read_lines()) + + delayed_tasks = [dask.delayed(read_file_lines)(f) for f in gz_files] + results = dask.compute(*delayed_tasks) + + # Each file should have 500 events + 2 JSON wrapper lines ([ and ]) + for line_count in results: + assert line_count == 502 + + def test_directory_indexer_ensure_indexed_idempotent(self): + """Test that ensure_indexed is idempotent - calling multiple times is safe.""" + with Environment(lines=50) as env: + 
env.create_test_gzip_file() + + indexer = dft_utils.Indexer(env.temp_dir) + + # First call builds the index + status1 = indexer.ensure_indexed() + assert len(status1.ready) == 1 + + # Second call should find everything already indexed + status2 = indexer.ensure_indexed() + assert len(status2.ready) == 1 + assert len(status2.needs_work) == 0 + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/test_distributed_manifest.py b/tests/python/test_distributed_manifest.py new file mode 100644 index 00000000..c24d183d --- /dev/null +++ b/tests/python/test_distributed_manifest.py @@ -0,0 +1,204 @@ +"""Tests for the distributed-build path producing a unified-shape index. + +Covers per-file AGG markers, ensure_indexed no-op after build, and end-to-end +correctness vs a serial single-node build. +""" + +import os + +import pytest + +try: + import dask # noqa: F401 + + DASK_AVAILABLE = True +except ImportError: + DASK_AVAILABLE = False + +from dftracer.utils import AggregationConfig, Indexer +from dftracer.utils.dask import distributed_index + +from .common import Environment + +AGG_CFG = AggregationConfig(time_interval_ms=5000) + + +def _build_distributed(env, pids, num_events=100, rebuild_root=True): + files = [env.create_dft_trace_file_with_pid(f"trace_p{p}.pfw.gz", p, num_events) for p in pids] + index_dir = os.path.join(env.temp_dir, "idx") + os.makedirs(index_dir, exist_ok=True) + index_path = os.path.join(index_dir, ".dftindex") + staging = os.path.join(env.temp_dir, "stage") + os.makedirs(staging, exist_ok=True) + result = distributed_index( + files=files, + index_path=index_path, + local_staging=staging, + lustre_staging=staging, + client=None, + aggregation_config=AGG_CFG, + rebuild_root_summaries=rebuild_root, + ) + return files, index_path, result + + +@pytest.mark.skipif(not DASK_AVAILABLE, reason="Dask not available") +class TestDistributedIndexUnified: + def test_no_manifest_left_behind(self): + with Environment(lines=50) as env: + _, 
index_path, _ = _build_distributed(env, pids=[1, 2]) + manifest_path = os.path.join(index_path, "agg_manifest.json") + assert not os.path.exists(manifest_path), ( + "distributed_index should produce a unified-shape index" + ) + + def test_aggregation_matches_serial(self): + """Distributed build's aggregation data must equal a serial build.""" + with Environment(lines=200) as env: + files_dist, dist_index_path, _ = _build_distributed(env, pids=[1, 2]) + + uni_index_dir = os.path.join(env.temp_dir, "idx_uni") + os.makedirs(uni_index_dir, exist_ok=True) + uni_indexer = Indexer( + files=files_dist, + index_dir=uni_index_dir, + require_aggregation=AGG_CFG, + force_rebuild=True, + ) + uni_indexer.ensure_indexed() + uni_batches = uni_indexer.iter_arrow_dfanalyzer_all( + time_granularity=5.0, + time_resolution=1_000_000.0, + ) + + dist_indexer = Indexer( + files=files_dist, + index_dir=os.path.dirname(dist_index_path), + require_aggregation=AGG_CFG, + force_rebuild=False, + ) + dist_batches = dist_indexer.iter_arrow_dfanalyzer_all( + time_granularity=5.0, + time_resolution=1_000_000.0, + ) + + import pyarrow as pa + + def _total_count(batches_dict, key): + batches = [pa.record_batch(b) for b in batches_dict.get(key, [])] + if not batches: + return 0 + table = pa.Table.from_batches(batches) + if "count" in table.column_names: + return int(pa.compute.sum(table["count"]).as_py() or 0) + return table.num_rows + + uni_count = _total_count(uni_batches, "events") + dist_count = _total_count(dist_batches, "events") + assert uni_count == dist_count, ( + f"event count mismatch: unified={uni_count} distributed={dist_count}" + ) + assert uni_count > 0 + + def test_move_artifacts_preserves_per_file_agg_ssts(self): + """With cross-FS staging, per-file `aggregation.sst` must not collapse.""" + with Environment(lines=120) as env: + files = [ + env.create_dft_trace_file_with_pid(f"trace_p{p}.pfw.gz", p, 120) + for p in [1, 2, 3, 4] + ] + local_staging = os.path.join(env.temp_dir, 
"local_stage") + lustre_staging = os.path.join(env.temp_dir, "lustre_stage") + os.makedirs(local_staging, exist_ok=True) + os.makedirs(lustre_staging, exist_ok=True) + index_dir = os.path.join(env.temp_dir, "idx") + os.makedirs(index_dir, exist_ok=True) + index_path = os.path.join(index_dir, ".dftindex") + + distributed_index( + files=files, + index_path=index_path, + local_staging=local_staging, + lustre_staging=lustre_staging, + client=None, + aggregation_config=AGG_CFG, + ) + + indexer = Indexer( + files=files, + index_dir=os.path.dirname(index_path), + require_aggregation=AGG_CFG, + force_rebuild=False, + ) + batches = indexer.iter_arrow_dfanalyzer_all( + time_granularity=5.0, + time_resolution=1_000_000.0, + ) + import pyarrow as pa + + event_batches = [pa.record_batch(b) for b in batches.get("events", [])] + assert event_batches, "no events: per-file SSTs likely clobbered each other" + table = pa.Table.from_batches(event_batches) + total = int(pa.compute.sum(table["count"]).as_py() or 0) + assert total > 0 + + def test_ensure_indexed_is_noop_after_distributed_build(self): + import time as _time + + with Environment(lines=150) as env: + files, index_path, _ = _build_distributed(env, pids=[1, 2, 3]) + + t0 = _time.monotonic() + indexer = Indexer( + files=files, + index_dir=os.path.dirname(index_path), + require_checkpoint=True, + require_bloom=True, + require_manifest=True, + require_aggregation=AGG_CFG, + force_rebuild=False, + ) + status = indexer.ensure_indexed() + elapsed = _time.monotonic() - t0 + + assert status.total_files == len(files) + assert len(status.ready) == len(files), ( + f"post-distributed ensure_indexed wants to rebuild " + f"{len(status.needs_work)} files (markers missing?)" + ) + assert len(status.needs_work) == 0 + assert elapsed < 5.0, ( + f"ensure_indexed took {elapsed:.2f}s on a {len(files)}-file " + "distributed index; likely re-running the build" + ) + + +@pytest.mark.skipif(not DASK_AVAILABLE, reason="Dask not available") +class 
TestDistributedWithDask: + def test_multi_worker_with_local_cluster(self): + from dask.distributed import Client, LocalCluster + + with LocalCluster( + n_workers=2, threads_per_worker=1, dashboard_address=None, processes=True + ) as cluster, Client(cluster) as client: + with Environment(lines=40) as env: + pids = [1, 2, 3, 4] + files = [ + env.create_dft_trace_file_with_pid(f"trace_p{p}.pfw.gz", p, 40) for p in pids + ] + index_dir = os.path.join(env.temp_dir, "idx") + os.makedirs(index_dir, exist_ok=True) + index_path = os.path.join(index_dir, ".dftindex") + staging = os.path.join(env.temp_dir, "stage") + os.makedirs(staging, exist_ok=True) + result = distributed_index( + files=files, + index_path=index_path, + local_staging=staging, + lustre_staging=staging, + client=client, + aggregation_config=AGG_CFG, + ) + assert result["total_files"] == len(files) + assert result["artifact_batches"] > 0 + assert not os.path.exists(os.path.join(index_path, "agg_manifest.json")) diff --git a/tests/python/test_indexer.py b/tests/python/test_indexer.py index 8079c833..9951a636 100644 --- a/tests/python/test_indexer.py +++ b/tests/python/test_indexer.py @@ -8,129 +8,67 @@ import pytest import dftracer.utils as dft_utils +from dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer from .common import Environment -class TestIndexer: - """Test cases for Indexer""" +class TestCheckpointIndexer: + """Test cases for checkpoint-level indexer operations via get_checkpoint_indexer""" - def test_indexer_creation(self): - """Test indexer creation""" - with Environment() as env: - gz_file = env.create_test_gzip_file() - index_path = env.get_index_path(gz_file) - - # Test basic creation using context manager - with dft_utils.Indexer(gz_file, index_path) as indexer: - assert indexer.gz_path == gz_file - assert indexer.index_path == index_path - assert indexer.checkpoint_size > 0 - - def test_indexer_creation_with_defaults(self): - """Test indexer creation with default 
parameters""" - with Environment() as env: - gz_file = env.create_test_gzip_file() - - # Test creation with defaults using context manager - with dft_utils.Indexer(gz_file) as indexer: - assert indexer.gz_path == gz_file - assert indexer.index_path == env.get_index_path(gz_file) - assert indexer.checkpoint_size <= 33554432 # Should be <= 32MB default - - def test_indexer_custom_checkpoint_size(self): - """Test indexer with custom checkpoint size""" + def test_checkpoint_indexer_creation(self): + """Test checkpoint indexer creation via Indexer.get_checkpoint_indexer""" with Environment() as env: gz_file = env.create_test_gzip_file() - checkpoint_size = 1024 * 1024 # 1MB - with dft_utils.Indexer(gz_file, checkpoint_size=checkpoint_size) as indexer: - assert indexer.checkpoint_size <= checkpoint_size - - def test_indexer_nonexistent_file(self): - """Test indexer creation with non-existent file""" - # Indexer creation doesn't fail, but building should fail - with pytest.raises(RuntimeError): - dft_utils.Indexer("nonexistent_file.gz") - - def test_indexer_build_and_rebuild(self): - """Test indexer build and rebuild functionality""" - with Environment() as env: - gz_file = env.create_test_gzip_file() - index_path = env.get_index_path(gz_file) - - with dft_utils.Indexer(gz_file, index_path) as indexer: - # Should need rebuild initially - assert indexer.need_rebuild() - - # Build the index - indexer.build() - - # Index file should exist - assert os.path.exists(index_path) - - # Should not need rebuild after building - assert not indexer.need_rebuild() + with dft_utils.Indexer(files=[gz_file]) as indexer: + indexer.ensure_indexed() + cp_indexer = indexer.get_checkpoint_indexer(gz_file) - # Test force rebuild with a new indexer - # Note: force_rebuild affects the build process, not need_rebuild() check - # The need_rebuild() method checks file consistency, not force_rebuild flag - with dft_utils.Indexer(gz_file, index_path, force_rebuild=True) as indexer_force: - # Since the 
index already exists and file hasn't changed, need_rebuild should be False - # But force_rebuild will cause a rebuild when build() is called - assert not indexer_force.need_rebuild() - # The force_rebuild behavior is tested by calling build() which should succeed - indexer_force.build() # This should rebuild due to force_rebuild=True + assert cp_indexer.gz_path == gz_file + assert cp_indexer.checkpoint_size > 0 - def test_indexer_file_info(self): - """Test indexer file information methods""" + def test_checkpoint_indexer_file_info(self): + """Test checkpoint indexer file information methods""" with Environment() as env: gz_file = env.create_test_gzip_file() - with dft_utils.Indexer(gz_file) as indexer: - if indexer.need_rebuild(): - indexer.build() + with dft_utils.Indexer(files=[gz_file]) as indexer: + indexer.ensure_indexed() + cp_indexer = indexer.get_checkpoint_indexer(gz_file) - # Test file info methods - max_bytes = indexer.get_max_bytes() - num_lines = indexer.get_num_lines() + max_bytes = cp_indexer.get_max_bytes() + num_lines = cp_indexer.get_num_lines() assert isinstance(max_bytes, int) assert isinstance(num_lines, int) assert max_bytes > 0 assert num_lines > 0 - def test_indexer_checkpoints(self): - """Test indexer checkpoint functionality""" - with Environment(lines=100000) as env: # Larger file for checkpoints + def test_checkpoint_indexer_checkpoints(self): + """Test checkpoint indexer checkpoint functionality""" + with Environment(lines=100000) as env: gz_file = env.create_test_gzip_file() - checkpoint_size = 256 * 1024 # 256KB checkpoint size + checkpoint_size = 256 * 1024 # 256KB - with dft_utils.Indexer(gz_file, checkpoint_size=checkpoint_size) as indexer: - if indexer.need_rebuild(): - indexer.build() + with dft_utils.Indexer( + files=[gz_file], + checkpoint_size=checkpoint_size, + ) as indexer: + indexer.ensure_indexed() + cp_indexer = indexer.get_checkpoint_indexer(gz_file) - # Debug: Check file size and checkpoint configuration - max_bytes = 
indexer.get_max_bytes() - num_lines = indexer.get_num_lines() + max_bytes = cp_indexer.get_max_bytes() + num_lines = cp_indexer.get_num_lines() print( - f"File stats: {max_bytes} bytes, {num_lines} lines, checkpoint_size={checkpoint_size}" + f"File stats: {max_bytes} bytes, {num_lines} lines, " + f"checkpoint_size={checkpoint_size}" ) - # Test get_checkpoints - checkpoints = indexer.get_checkpoints() + checkpoints = cp_indexer.get_checkpoints() assert isinstance(checkpoints, list) print(f"Number of checkpoints created: {len(checkpoints)}") - # NOTE: Checkpoint creation depends on deflate block boundaries in the compressed stream, - # not just uncompressed file size. This is correct behavior for zlib-based random access. - # The indexer may create 0, 1, or multiple checkpoints depending on how gzip compressed - # the data and where deflate block boundaries fall relative to the checkpoint size. - - # Test that the API works correctly regardless of checkpoint count - assert isinstance(checkpoints, list) - - # Test checkpoint properties if any exist for checkpoint in checkpoints: assert hasattr(checkpoint, "checkpoint_idx") assert hasattr(checkpoint, "uc_offset") @@ -147,28 +85,28 @@ def test_indexer_checkpoints(self): assert checkpoint.uc_offset >= 0 assert checkpoint.num_lines >= 0 - def test_indexer_find_checkpoint(self): - """Test indexer single checkpoint search""" - with Environment(lines=2000) as env: # Large file for testing - gz_file = env.create_test_gzip_file(bytes_per_line=2048) # Larger lines - checkpoint_size = 512 * 1024 # 512KB checkpoint size + def test_checkpoint_indexer_find_checkpoint(self): + """Test checkpoint indexer single checkpoint search""" + with Environment(lines=2000) as env: + gz_file = env.create_test_gzip_file(bytes_per_line=2048) + checkpoint_size = 512 * 1024 # 512KB - with dft_utils.Indexer(gz_file, checkpoint_size=checkpoint_size) as indexer: - if indexer.need_rebuild(): - indexer.build() + with dft_utils.Indexer( + files=[gz_file], 
+ checkpoint_size=checkpoint_size, + ) as indexer: + indexer.ensure_indexed() + cp_indexer = indexer.get_checkpoint_indexer(gz_file) - max_bytes = indexer.get_max_bytes() - checkpoints = indexer.get_checkpoints() + max_bytes = cp_indexer.get_max_bytes() + checkpoints = cp_indexer.get_checkpoints() print(f"File has {max_bytes} bytes and {len(checkpoints)} checkpoints") - # Test find_checkpoint API regardless of whether checkpoints exist target_offset = max_bytes // 2 if max_bytes > 0 else 0 - checkpoint = indexer.find_checkpoint(target_offset) + checkpoint = cp_indexer.find_checkpoint(target_offset) - # The find_checkpoint method should always return either a CheckpointInfo or None if checkpoint is not None: - # If a checkpoint is found, verify its properties assert hasattr(checkpoint, "uc_offset") assert hasattr(checkpoint, "uc_size") assert hasattr(checkpoint, "num_lines") @@ -177,78 +115,106 @@ def test_indexer_find_checkpoint(self): assert isinstance(checkpoint.uc_size, int) assert isinstance(checkpoint.num_lines, int) - # Test with offset 0 (per the C++ code, this should return None as a special case) - checkpoint_0 = indexer.find_checkpoint(0) - # According to indexer.cpp line 1104-1106, target_offset 0 always returns false - assert checkpoint_0 is None, ( - "find_checkpoint(0) should return None per implementation" - ) + # find_checkpoint(0) should return None per implementation + checkpoint_0 = cp_indexer.find_checkpoint(0) + assert checkpoint_0 is None - # Test with offset beyond file size if max_bytes > 0: - checkpoint_beyond = indexer.find_checkpoint(max_bytes + 1000) - # This might return None or the last checkpoint, both are valid + checkpoint_beyond = cp_indexer.find_checkpoint(max_bytes + 1000) if checkpoint_beyond is not None: assert checkpoint_beyond.uc_offset <= max_bytes -class TestIndexerIntegration: - """Integration tests for indexer with reader""" +class TestNativeIndexerDirect: + """Test native Indexer class directly for low-level 
operations""" - def test_indexer_with_reader_creation(self): - """Test creating readers from indexer""" + def test_native_indexer_creation(self): + """Test native indexer creation""" with Environment() as env: gz_file = env.create_test_gzip_file() + index_path = env.get_index_path(gz_file) - # Create and build indexer using context manager - with dft_utils.Indexer(gz_file) as indexer: - if indexer.need_rebuild(): - indexer.build() + with NativeIndexer(gz_file, index_path) as indexer: + assert indexer.gz_path == gz_file + assert indexer.index_path == index_path + assert indexer.checkpoint_size > 0 - # Test creating reader after indexer builds the shared index store - reader = dft_utils.TraceReader(gz_file) - assert reader.get_max_bytes() > 0 - assert reader.file_path == gz_file + def test_native_indexer_build_and_rebuild(self): + """Test native indexer build and rebuild functionality""" + with Environment() as env: + gz_file = env.create_test_gzip_file() + index_path = env.get_index_path(gz_file) - def test_indexer_with_reader_creation_context_manager(self): - """Test using indexer with reader creation via context manager""" + with NativeIndexer(gz_file, index_path) as indexer: + assert indexer.need_rebuild() + indexer.build() + assert os.path.exists(index_path) + assert not indexer.need_rebuild() + + with NativeIndexer(gz_file, index_path, force_rebuild=True) as indexer_force: + assert not indexer_force.need_rebuild() + indexer_force.build() + + def test_native_indexer_nonexistent_file(self): + """Test native indexer creation with non-existent file""" + with pytest.raises(RuntimeError): + NativeIndexer("nonexistent_file.gz") + + def test_native_indexer_build_bloom(self): + """Test building with bloom=True""" with Environment() as env: gz_file = env.create_test_gzip_file() + index_path = env.get_index_path(gz_file) + with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer: + indexer.build() + assert indexer.has_bloom - # Create and build indexer using 
context manager - with dft_utils.Indexer(gz_file) as indexer: - if indexer.need_rebuild(): - indexer.build() + def test_native_indexer_build_manifest(self): + """Test building with manifest=True""" + with Environment() as env: + gz_file = env.create_test_gzip_file() + index_path = env.get_index_path(gz_file) + with NativeIndexer(gz_file, index_path, build_manifest=True) as indexer: + indexer.build() + assert indexer.has_manifest + + +class TestCheckpointIndexerIntegration: + """Integration tests for checkpoint indexer with reader""" + + def test_checkpoint_indexer_with_reader_creation(self): + """Test creating readers from checkpoint indexer""" + with Environment() as env: + gz_file = env.create_test_gzip_file() + + with dft_utils.Indexer(files=[gz_file]) as indexer: + indexer.ensure_indexed() - # Test creating reader after indexer builds the shared index store reader = dft_utils.TraceReader(gz_file) assert reader.get_max_bytes() > 0 + assert reader.path == gz_file - def test_multiple_readers_same_indexer(self): - """Test creating multiple readers from the same indexer""" + def test_multiple_readers_same_index(self): + """Test creating multiple readers from the same index""" with Environment() as env: gz_file = env.create_test_gzip_file() - # Create and build indexer using context manager - with dft_utils.Indexer(gz_file) as indexer: - if indexer.need_rebuild(): - indexer.build() + with dft_utils.Indexer(files=[gz_file]) as indexer: + indexer.ensure_indexed() - # Create multiple readers (all use the same shared index store) readers = [] for i in range(3): reader = dft_utils.TraceReader(gz_file) assert reader.get_max_bytes() > 0 readers.append(reader) - # All should have same file info max_bytes = readers[0].get_max_bytes() for reader in readers[1:]: assert reader.get_max_bytes() == max_bytes -class TestIndexerLifetime: - """Python wrapper lifetime should not own the shared index store.""" +class TestCheckpointIndexerLifetime: + """Test checkpoint indexer lifetime 
management""" def test_indexer_close_releases_wrapper_not_index_store(self): """close() should release the Python handle without deleting .dftindex.""" @@ -256,7 +222,7 @@ def test_indexer_close_releases_wrapper_not_index_store(self): gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - indexer = dft_utils.Indexer(gz_file, index_path) + indexer = NativeIndexer(gz_file, index_path) assert indexer.need_rebuild() indexer.build() assert os.path.exists(index_path) @@ -264,7 +230,7 @@ def test_indexer_close_releases_wrapper_not_index_store(self): indexer.close() assert os.path.exists(index_path) - with dft_utils.Indexer(gz_file, index_path) as reopened: + with NativeIndexer(gz_file, index_path) as reopened: assert not reopened.need_rebuild() assert reopened.get_num_lines() > 0 @@ -274,7 +240,7 @@ def test_indexer_context_exit_keeps_shared_index_store(self): gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer(gz_file, index_path) as indexer: + with NativeIndexer(gz_file, index_path) as indexer: if indexer.need_rebuild(): indexer.build() assert indexer.get_num_lines() > 0 @@ -285,212 +251,484 @@ def test_indexer_context_exit_keeps_shared_index_store(self): assert reader.get_num_lines() > 0 -class TestIndexerUnified: - """Test unified IndexBuilder features via Python Indexer""" +class TestDirectoryIndexer: + """Test cases for the directory-level Indexer API""" - def test_indexer_build_bloom(self): - """Test building with bloom=True""" + def test_indexer_creation(self): + """Test directory indexer creation""" + with Environment() as env: + env.create_test_gzip_file() + env.create_test_gzip_file() + + indexer = dft_utils.Indexer(env.temp_dir) + assert indexer is not None + + def test_indexer_context_manager(self): + """Test directory indexer as context manager""" + with Environment() as env: + env.create_test_gzip_file() + + with dft_utils.Indexer(env.temp_dir) as indexer: + assert indexer is not 
None + + def test_indexer_resolve(self): + """Test resolve() returns IndexStatus""" + with Environment() as env: + env.create_test_gzip_file() + + with dft_utils.Indexer(env.temp_dir) as indexer: + status = indexer.resolve() + assert isinstance(status, dft_utils.IndexStatus) + assert status.total_files >= 1 + assert len(status.needs_work) >= 1 + + def test_indexer_build(self): + """Test build() creates indexes""" with Environment() as env: gz_file = env.create_test_gzip_file() - index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( - gz_file, index_path, build_bloom=True, index_threshold=0 - ) as indexer: + print(f"\nCreated test file: {gz_file}") + print(f"Directory: {env.temp_dir}") + print(f"Files in dir: {os.listdir(env.temp_dir)}") + + with dft_utils.Indexer(env.temp_dir) as indexer: + status_before = indexer.resolve() + print(f"Before build: {status_before}") + assert len(status_before.needs_work) >= 1 + assert status_before.index_path != "" + indexer.build() - assert indexer.has_bloom - def test_indexer_build_manifest(self): - """Test building with manifest=True""" + assert os.path.isdir(status_before.index_path), ( + f"Index dir not created: {status_before.index_path}" + ) + print(f"Index dir contents: {os.listdir(status_before.index_path)}") + + status_after = indexer.resolve() + print(f"After build: {status_after}") + assert len(status_after.ready) >= 1, ( + f"Expected ready>=1, got {len(status_after.ready)}" + ) + + def test_indexer_ensure_indexed(self): + """Test ensure_indexed() builds if needed""" with Environment() as env: - gz_file = env.create_test_gzip_file() - index_path = env.get_index_path(gz_file) + env.create_test_gzip_file() + + with dft_utils.Indexer(env.temp_dir) as indexer: + status = indexer.ensure_indexed() + assert isinstance(status, dft_utils.IndexStatus) + assert len(status.ready) >= 1 + + def test_indexer_with_require_bloom(self): + """Test indexer with bloom filter requirement""" + with Environment() as env: + 
env.create_test_gzip_file() + + with dft_utils.Indexer(env.temp_dir, require_bloom=True) as indexer: + status = indexer.ensure_indexed() + assert len(status.ready) >= 1 + + def test_indexer_with_require_manifest(self): + """Test indexer with manifest requirement""" + with Environment() as env: + env.create_test_gzip_file() + + with dft_utils.Indexer(env.temp_dir, require_manifest=True) as indexer: + status = indexer.ensure_indexed() + assert len(status.ready) >= 1 + + def test_indexer_with_aggregation_config(self): + """Test indexer with aggregation config""" + with Environment() as env: + env.create_test_gzip_file() + + agg_config = dft_utils.AggregationConfig( + time_interval_ms=1000.0, + compute_percentiles=False, + ) with dft_utils.Indexer( - gz_file, index_path, build_manifest=True, index_threshold=0 + env.temp_dir, + require_aggregation=agg_config, ) as indexer: - indexer.build() - assert indexer.has_manifest + assert indexer.aggregation_config is not None + assert indexer.aggregation_config.time_interval_ms == 1000.0 - def test_indexer_build_bloom_and_manifest(self): - """Test building with both bloom and manifest""" + def test_indexer_aggregation_true(self): + """Test indexer with require_aggregation=True uses defaults""" with Environment() as env: - gz_file = env.create_test_gzip_file() - index_path = env.get_index_path(gz_file) + env.create_test_gzip_file() + with dft_utils.Indexer( - gz_file, - index_path, - build_bloom=True, - build_manifest=True, - index_threshold=0, + env.temp_dir, + require_aggregation=True, ) as indexer: - indexer.build() - assert indexer.has_bloom - assert indexer.has_manifest + assert indexer.aggregation_config is not None + assert indexer.aggregation_config.time_interval_ms == 5000.0 + + def test_index_status_dataclass(self): + """Test IndexStatus dataclass""" + status = dft_utils.IndexStatus( + total_files=5, + ready=["a.pfw.gz", "b.pfw.gz"], + needs_work=["c.pfw.gz"], + index_path="/tmp/index", + ) + assert status.total_files 
== 5 + assert len(status.ready) == 2 + assert len(status.needs_work) == 1 + assert status.index_path == "/tmp/index" + + def test_aggregation_config_dataclass(self): + """Test AggregationConfig dataclass""" + config = dft_utils.AggregationConfig( + time_interval_ms=2000.0, + group_keys=["host", "rank"], + custom_metric_fields=["bytes"], + compute_percentiles=True, + ) + assert config.time_interval_ms == 2000.0 + assert config.group_keys == ["host", "rank"] + assert config.custom_metric_fields == ["bytes"] + assert config.compute_percentiles is True + + def test_indexer_with_files_list(self): + """Test indexer with explicit file list instead of directory""" + with Environment() as env: + file_path = env.create_test_gzip_file() + + with dft_utils.Indexer( + files=[file_path], + index_dir=env.temp_dir, + ) as indexer: + status = indexer.resolve() + assert status.total_files == 1 - def test_indexer_no_bloom_by_default(self): - """Test that bloom is not built when build_bloom is omitted""" + def test_indexer_files_and_directory(self): + """Test indexer with both files and directory (files take precedence)""" with Environment() as env: - gz_file = env.create_test_gzip_file() - index_path = env.get_index_path(gz_file) - with dft_utils.Indexer(gz_file, index_path, index_threshold=0) as indexer: - indexer.build() - assert not indexer.has_bloom + file_path = env.create_test_gzip_file() - def test_indexer_no_manifest_by_default(self): - """Test that manifest is not built when build_manifest is omitted""" + with dft_utils.Indexer( + directory=env.temp_dir, + files=[file_path], + ) as indexer: + status = indexer.resolve() + assert status.total_files >= 1 + + def test_indexer_requires_directory_or_files(self): + """Test that indexer requires at least directory or files""" + with pytest.raises(ValueError, match="directory.*files"): + dft_utils.Indexer() + + def test_indexer_get_checkpoint_indexer(self): + """Test get_checkpoint_indexer returns working checkpoint indexer""" with 
Environment() as env: gz_file = env.create_test_gzip_file() - index_path = env.get_index_path(gz_file) - with dft_utils.Indexer(gz_file, index_path, index_threshold=0) as indexer: - indexer.build() - assert not indexer.has_manifest - def test_indexer_has_bloom_is_bool(self): - """Test that has_bloom returns a bool""" + with dft_utils.Indexer(env.temp_dir) as indexer: + indexer.ensure_indexed() + + cp_indexer = indexer.get_checkpoint_indexer(gz_file) + + assert cp_indexer.gz_path == gz_file + assert cp_indexer.get_max_bytes() > 0 + assert cp_indexer.get_num_lines() > 0 + checkpoints = cp_indexer.get_checkpoints() + assert isinstance(checkpoints, list) + + def test_indexer_get_checkpoint_indexer_uses_index_dir(self): + """Test that get_checkpoint_indexer uses the same index_dir""" with Environment() as env: gz_file = env.create_test_gzip_file() - index_path = env.get_index_path(gz_file) + custom_index_dir = os.path.join(env.temp_dir, "custom_index") + os.makedirs(custom_index_dir, exist_ok=True) + with dft_utils.Indexer( - gz_file, index_path, build_bloom=True, index_threshold=0 + env.temp_dir, + index_dir=custom_index_dir, ) as indexer: - indexer.build() - assert isinstance(indexer.has_bloom, bool) + indexer.ensure_indexed() + + cp_indexer = indexer.get_checkpoint_indexer(gz_file) + assert custom_index_dir in cp_indexer.index_path - def test_indexer_has_manifest_is_bool(self): - """Test that has_manifest returns a bool""" + +class TestIndexerDfanalyzerAPIs: + """Test cases for dfanalyzer integration APIs (hash tables, PID manifest)""" + + def test_get_hash_table_file(self): + """Test get_hash_table returns file hash mappings""" with Environment() as env: - gz_file = env.create_test_gzip_file() - index_path = env.get_index_path(gz_file) + gz_file = env.create_dft_trace_file() + with dft_utils.Indexer( - gz_file, index_path, build_manifest=True, index_threshold=0 + files=[gz_file], + require_bloom=True, + require_manifest=True, ) as indexer: - indexer.build() - assert 
isinstance(indexer.has_manifest, bool) + indexer.ensure_indexed() + + file_hashes = indexer.get_hash_table("file") + assert isinstance(file_hashes, dict) - def test_indexer_custom_index_threshold(self): - """Test that index_threshold is accepted without error""" + def test_get_hash_table_host(self): + """Test get_hash_table returns host hash mappings""" with Environment() as env: - gz_file = env.create_test_gzip_file() - index_path = env.get_index_path(gz_file) - # A very large threshold skips bloom for small files + gz_file = env.create_dft_trace_file() + with dft_utils.Indexer( - gz_file, - index_path, - build_bloom=True, - index_threshold=1024 * 1024 * 1024, + files=[gz_file], + require_bloom=True, + require_manifest=True, ) as indexer: - indexer.build() - assert isinstance(indexer.has_bloom, bool) - assert not indexer.has_bloom + indexer.ensure_indexed() - def test_indexer_bloom_persists_across_instances(self): - """Bloom data written to the index store is visible from a new Indexer""" + host_hashes = indexer.get_hash_table("host") + assert isinstance(host_hashes, dict) + + def test_get_hash_table_string(self): + """Test get_hash_table returns string hash mappings""" with Environment() as env: - gz_file = env.create_test_gzip_file() - index_path = env.get_index_path(gz_file) + gz_file = env.create_dft_trace_file() + with dft_utils.Indexer( - gz_file, index_path, build_bloom=True, index_threshold=0 + files=[gz_file], + require_bloom=True, + require_manifest=True, ) as indexer: - indexer.build() + indexer.ensure_indexed() - with dft_utils.Indexer(gz_file, index_path) as indexer2: - assert indexer2.has_bloom + string_hashes = indexer.get_hash_table("string") + assert isinstance(string_hashes, dict) - def test_indexer_manifest_persists_across_instances(self): - """Manifest data written to the index store is visible from a new Indexer""" + def test_get_hash_table_invalid_type(self): + """Test get_hash_table raises error for invalid type""" with Environment() as env: 
- gz_file = env.create_test_gzip_file() - index_path = env.get_index_path(gz_file) + gz_file = env.create_dft_trace_file() + with dft_utils.Indexer( - gz_file, index_path, build_manifest=True, index_threshold=0 + files=[gz_file], + require_bloom=True, + require_manifest=True, ) as indexer: - indexer.build() - - with dft_utils.Indexer(gz_file, index_path) as indexer2: - assert indexer2.has_manifest + indexer.ensure_indexed() + with pytest.raises((ValueError, RuntimeError)): + indexer.get_hash_table("invalid_type") -class TestIndexerThreshold: - """Test that index_threshold skips bloom/manifest for small files""" + def test_query_file_pids(self): + """Test query_file_pids returns set of PIDs for a file""" + with Environment() as env: + gz_file = env.create_dft_trace_file() - def test_threshold_skips_bloom_for_small_file(self): - """Explicit large threshold should skip bloom for small files""" - with Environment(lines=5) as env: - gz_file = env.create_test_gzip_file(bytes_per_line=128) - index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, - index_path, - build_bloom=True, - index_threshold=10 * 1024 * 1024, + files=[gz_file], + require_manifest=True, ) as indexer: - indexer.build() - assert not indexer.has_bloom + indexer.ensure_indexed() - def test_threshold_skips_manifest_for_small_file(self): - """Explicit large threshold should skip manifest for small files""" - with Environment(lines=5) as env: - gz_file = env.create_test_gzip_file(bytes_per_line=128) - index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( - gz_file, - index_path, - build_manifest=True, - index_threshold=10 * 1024 * 1024, - ) as indexer: - indexer.build() - assert not indexer.has_manifest + # File ID 1 is typically the first indexed file + pids = indexer.query_file_pids(1) + assert isinstance(pids, set) + # PIDs should be integers + for pid in pids: + assert isinstance(pid, int) + + def test_query_file_pids_nonexistent(self): + """Test query_file_pids returns 
empty set for nonexistent file""" + with Environment() as env: + gz_file = env.create_dft_trace_file() - def test_threshold_skips_bloom_and_manifest_for_small_file(self): - """Explicit large threshold should skip bloom and manifest for small files""" - with Environment(lines=5) as env: - gz_file = env.create_test_gzip_file(bytes_per_line=128) - index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, - index_path, - build_bloom=True, - build_manifest=True, - index_threshold=10 * 1024 * 1024, + files=[gz_file], + require_manifest=True, ) as indexer: - indexer.build() - assert not indexer.has_bloom - assert not indexer.has_manifest + indexer.ensure_indexed() - def test_explicit_large_threshold_skips_bloom(self): - """Explicit large threshold should skip bloom for small files""" + pids = indexer.query_file_pids(9999) + assert isinstance(pids, set) + assert len(pids) == 0 + + def test_query_all_file_pids(self): + """Test query_all_file_pids returns dict mapping file_id to PID sets""" with Environment() as env: - gz_file = env.create_test_gzip_file() - index_path = env.get_index_path(gz_file) + gz_file1 = env.create_dft_trace_file(filename="trace1.pfw.gz") + gz_file2 = env.create_dft_trace_file(filename="trace2.pfw.gz") + with dft_utils.Indexer( - gz_file, - index_path, - build_bloom=True, - index_threshold=1024 * 1024 * 1024, + files=[gz_file1, gz_file2], + require_manifest=True, ) as indexer: - indexer.build() - assert not indexer.has_bloom + indexer.ensure_indexed() + + all_pids = indexer.query_all_file_pids() + assert isinstance(all_pids, dict) + + for file_id, pid_set in all_pids.items(): + assert isinstance(file_id, int) + assert isinstance(pid_set, set) + for pid in pid_set: + assert isinstance(pid, int) - def test_zero_threshold_forces_bloom(self): - """index_threshold=0 disables threshold, bloom should be built""" + def test_query_all_file_pids_empty_index(self): + """Test query_all_file_pids returns empty dict for unindexed files""" with 
Environment() as env: gz_file = env.create_test_gzip_file() - index_path = env.get_index_path(gz_file) + with dft_utils.Indexer( - gz_file, index_path, build_bloom=True, index_threshold=0 + files=[gz_file], + require_manifest=False, ) as indexer: - indexer.build() - assert indexer.has_bloom + # Only checkpoint tier, no manifest + indexer.ensure_indexed() + + all_pids = indexer.query_all_file_pids() + assert isinstance(all_pids, dict) - def test_zero_threshold_forces_manifest(self): - """index_threshold=0 disables threshold, manifest should be built""" + def test_integration_hash_tables_and_pids(self): + """Integration test: hash tables and PIDs work together""" with Environment() as env: - gz_file = env.create_test_gzip_file() - index_path = env.get_index_path(gz_file) + gz_file = env.create_dft_trace_file() + with dft_utils.Indexer( - gz_file, index_path, build_manifest=True, index_threshold=0 + files=[gz_file], + require_bloom=True, + require_manifest=True, ) as indexer: - indexer.build() - assert indexer.has_manifest + indexer.ensure_indexed() + + # Get hash tables + file_hashes = indexer.get_hash_table("file") + host_hashes = indexer.get_hash_table("host") + + # Get PIDs + all_pids = indexer.query_all_file_pids() + + # Both should be populated for a valid DFT trace + assert isinstance(file_hashes, dict) + assert isinstance(host_hashes, dict) + assert isinstance(all_pids, dict) + + +class TestQueryFilter: + """Test cases for query filter parameter in iter_arrow_dfanalyzer APIs""" + + def _make_indexer(self, directory): + return dft_utils.Indexer( + directory=directory, + require_aggregation=dft_utils.AggregationConfig(time_interval_ms=5000), + ) + + def test_iter_arrow_dfanalyzer_all_no_query(self): + pa = pytest.importorskip("pyarrow") + with Environment() as env: + directory = env.create_indexed_traces(pids=[1]) + with self._make_indexer(directory) as indexer: + indexer.ensure_indexed() + result = indexer.iter_arrow_dfanalyzer_all() + rows = 
sum(pa.record_batch(b).num_rows for b in result.get("events", [])) + assert rows > 0 + + def test_iter_arrow_dfanalyzer_all_pid_filter(self): + pa = pytest.importorskip("pyarrow") + with Environment() as env: + directory = env.create_indexed_traces(pids=[1]) + with self._make_indexer(directory) as indexer: + indexer.ensure_indexed() + result = indexer.iter_arrow_dfanalyzer_all(query="pid == 1") + rows = sum(pa.record_batch(b).num_rows for b in result.get("events", [])) + assert rows > 0 + + def test_iter_arrow_dfanalyzer_all_pid_filter_reduces_rows(self): + pa = pytest.importorskip("pyarrow") + with Environment() as env: + directory = env.create_indexed_traces(pids=[1, 2]) + with self._make_indexer(directory) as indexer: + indexer.ensure_indexed() + + all_rows = sum( + pa.record_batch(b).num_rows + for b in indexer.iter_arrow_dfanalyzer_all().get("events", []) + ) + filtered_rows = sum( + pa.record_batch(b).num_rows + for b in indexer.iter_arrow_dfanalyzer_all(query="pid == 1").get("events", []) + ) + assert 0 < filtered_rows < all_rows + + def test_iter_arrow_dfanalyzer_all_invalid_query(self): + with Environment() as env: + directory = env.create_indexed_traces(pids=[1]) + with self._make_indexer(directory) as indexer: + indexer.ensure_indexed() + with pytest.raises((ValueError, RuntimeError)): + indexer.iter_arrow_dfanalyzer_all(query="invalid ==") + + def test_iter_arrow_dfanalyzer_query_param(self): + pa = pytest.importorskip("pyarrow") + with Environment() as env: + directory = env.create_indexed_traces(pids=[1]) + with self._make_indexer(directory) as indexer: + indexer.ensure_indexed() + batches = list(indexer.iter_arrow_dfanalyzer("events", query="pid == 1")) + rows = sum(pa.record_batch(b).num_rows for b in batches) + assert rows > 0 + + def test_iter_arrow_dfanalyzer_query_matches_all(self): + pa = pytest.importorskip("pyarrow") + with Environment() as env: + directory = env.create_indexed_traces(pids=[1]) + with self._make_indexer(directory) as indexer: 
+ indexer.ensure_indexed() + + single_rows = sum( + pa.record_batch(b).num_rows + for b in indexer.iter_arrow_dfanalyzer("events", query="pid == 1") + ) + all_rows = sum( + pa.record_batch(b).num_rows + for b in indexer.iter_arrow_dfanalyzer_all(query="pid == 1").get("events", []) + ) + assert single_rows == all_rows + + def test_iter_arrow_dfanalyzer_all_multi_pid_filter(self): + pa = pytest.importorskip("pyarrow") + with Environment() as env: + directory = env.create_indexed_traces(pids=[10, 20, 30]) + with self._make_indexer(directory) as indexer: + indexer.ensure_indexed() + + filtered_rows = sum( + pa.record_batch(b).num_rows + for b in indexer.iter_arrow_dfanalyzer_all(query="pid == 10 or pid == 20").get( + "events", [] + ) + ) + all_rows = sum( + pa.record_batch(b).num_rows + for b in indexer.iter_arrow_dfanalyzer_all().get("events", []) + ) + assert 0 < filtered_rows < all_rows + + def test_iter_arrow_dfanalyzer_all_string_filter(self): + pa = pytest.importorskip("pyarrow") + with Environment() as env: + directory = env.create_indexed_traces(pids=[1]) + with self._make_indexer(directory) as indexer: + indexer.ensure_indexed() + result = indexer.iter_arrow_dfanalyzer_all(query='cat == "POSIX"') + rows = sum(pa.record_batch(b).num_rows for b in result.get("events", [])) + assert rows > 0 + + def test_iter_arrow_dfanalyzer_all_no_match(self): + pa = pytest.importorskip("pyarrow") + with Environment() as env: + directory = env.create_indexed_traces(pids=[1]) + with self._make_indexer(directory) as indexer: + indexer.ensure_indexed() + result = indexer.iter_arrow_dfanalyzer_all(query="pid == 999999") + rows = sum(pa.record_batch(b).num_rows for b in result.get("events", [])) + assert rows == 0 if __name__ == "__main__": diff --git a/tests/python/test_reorganization_planner.py b/tests/python/test_reorganization_planner.py index b5518fe6..c67ee297 100644 --- a/tests/python/test_reorganization_planner.py +++ b/tests/python/test_reorganization_planner.py @@ -1,8 
+1,6 @@ """Tests for ReorganizationPlannerUtility.""" -import sys - -import dftracer.utils as dft_utils +from dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer from dftracer.utils.dftracer_utils_ext import ReorganizationPlannerUtility from .common import Environment @@ -10,7 +8,6 @@ # Threshold large enough to guarantee bloom/manifest are skipped for any # test fixture, making WithoutIndex tests deterministic regardless of # fixture size. -_SKIP_INDEX_THRESHOLD = sys.maxsize class TestReorganizationPlannerUtility: @@ -18,12 +15,11 @@ def test_plan_returns_dict(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( + with NativeIndexer( gz_file, index_path, build_bloom=True, build_manifest=True, - index_threshold=0, ) as indexer: indexer.build() groups = [{"name": "posix", "query": 'cat == "POSIX"'}] @@ -38,12 +34,11 @@ def test_call_delegates_to_process(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( + with NativeIndexer( gz_file, index_path, build_bloom=True, build_manifest=True, - index_threshold=0, ) as indexer: indexer.build() util = ReorganizationPlannerUtility() @@ -61,12 +56,11 @@ def test_plan_succeeds_without_manifest(self): with Environment(lines=5) as env: gz_file = env.create_test_gzip_file(bytes_per_line=128) index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( + with NativeIndexer( gz_file, index_path, build_bloom=True, - build_manifest=True, - index_threshold=_SKIP_INDEX_THRESHOLD, + build_manifest=False, ) as indexer: indexer.build() assert not indexer.has_manifest @@ -82,12 +76,11 @@ def test_plan_has_tasks_without_manifest(self): with Environment(lines=5) as env: gz_file = env.create_test_gzip_file(bytes_per_line=128) index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( + with NativeIndexer( gz_file, 
index_path, build_bloom=True, - build_manifest=True, - index_threshold=_SKIP_INDEX_THRESHOLD, + build_manifest=False, ) as indexer: indexer.build() assert not indexer.has_manifest diff --git a/tests/python/test_statistics_aggregator.py b/tests/python/test_statistics_aggregator.py index 3be995d5..44fb3ad6 100644 --- a/tests/python/test_statistics_aggregator.py +++ b/tests/python/test_statistics_aggregator.py @@ -1,8 +1,6 @@ """Tests for StatisticsAggregatorUtility.""" -import sys - -import dftracer.utils as dft_utils +from dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer from dftracer.utils.dftracer_utils_ext import StatisticsAggregatorUtility from .common import Environment @@ -10,7 +8,6 @@ # Threshold large enough to guarantee bloom/manifest are skipped for any # test fixture, making WithoutIndex tests deterministic regardless of # fixture size. -_SKIP_INDEX_THRESHOLD = sys.maxsize class TestStatisticsAggregatorUtility: @@ -18,9 +15,7 @@ def test_compute_returns_dict(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( - gz_file, index_path, build_bloom=True, index_threshold=0 - ) as indexer: + with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer: indexer.build() result = StatisticsAggregatorUtility().process(gz_file) assert isinstance(result, dict) @@ -31,9 +26,7 @@ def test_compute_correct_event_count(self): with Environment(lines=30) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( - gz_file, index_path, build_bloom=True, index_threshold=0 - ) as indexer: + with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer: indexer.build() result = StatisticsAggregatorUtility().process(gz_file) assert result["success"] is True @@ -43,9 +36,7 @@ def test_compute_has_statistics_fields(self): with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() 
index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( - gz_file, index_path, build_bloom=True, index_threshold=0 - ) as indexer: + with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer: indexer.build() result = StatisticsAggregatorUtility().process(gz_file) assert "num_categories" in result @@ -58,9 +49,7 @@ def test_call_delegates_to_process(self): with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( - gz_file, index_path, build_bloom=True, index_threshold=0 - ) as indexer: + with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer: indexer.build() util = StatisticsAggregatorUtility() result = util(gz_file) @@ -76,11 +65,10 @@ def test_returns_dict_without_bloom(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( + with NativeIndexer( gz_file, index_path, - build_bloom=True, - index_threshold=_SKIP_INDEX_THRESHOLD, + build_bloom=False, ) as indexer: indexer.build() assert not indexer.has_bloom @@ -95,11 +83,10 @@ def test_correct_event_count_without_bloom(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( + with NativeIndexer( gz_file, index_path, - build_bloom=True, - index_threshold=_SKIP_INDEX_THRESHOLD, + build_bloom=False, ) as indexer: indexer.build() assert not indexer.has_bloom @@ -112,11 +99,10 @@ def test_has_statistics_fields_without_bloom(self): with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( + with NativeIndexer( gz_file, index_path, - build_bloom=True, - index_threshold=_SKIP_INDEX_THRESHOLD, + build_bloom=False, ) as indexer: indexer.build() assert not indexer.has_bloom diff --git a/tests/python/test_statistics_query.py 
b/tests/python/test_statistics_query.py index fa2cb123..55d6d00d 100644 --- a/tests/python/test_statistics_query.py +++ b/tests/python/test_statistics_query.py @@ -1,8 +1,6 @@ """Tests for StatisticsQueryUtility.""" -import sys - -import dftracer.utils as dft_utils +from dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer from dftracer.utils.dftracer_utils_ext import StatisticsQueryUtility from .common import Environment @@ -10,7 +8,6 @@ # Threshold large enough to guarantee bloom/manifest are skipped for any # test fixture, making WithoutIndex tests deterministic regardless of # fixture size. -_SKIP_INDEX_THRESHOLD = sys.maxsize class TestStatisticsQueryUtility: @@ -18,9 +15,7 @@ def test_query_summary(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( - gz_file, index_path, build_bloom=True, index_threshold=0 - ) as indexer: + with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer: indexer.build() result = StatisticsQueryUtility().process(gz_file, query_type="summary") assert isinstance(result, dict) @@ -31,9 +26,7 @@ def test_query_categories(self): with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( - gz_file, index_path, build_bloom=True, index_threshold=0 - ) as indexer: + with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer: indexer.build() result = StatisticsQueryUtility().process(gz_file, query_type="categories") assert "results" in result @@ -43,9 +36,7 @@ def test_query_names(self): with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( - gz_file, index_path, build_bloom=True, index_threshold=0 - ) as indexer: + with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer: indexer.build() result = 
StatisticsQueryUtility().process(gz_file, query_type="names") assert "results" in result @@ -54,9 +45,7 @@ def test_query_top_n_names(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( - gz_file, index_path, build_bloom=True, index_threshold=0 - ) as indexer: + with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer: indexer.build() result = StatisticsQueryUtility().process(gz_file, query_type="top_n_names", top_n=5) assert "results" in result @@ -66,9 +55,7 @@ def test_query_duration_stats(self): with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( - gz_file, index_path, build_bloom=True, index_threshold=0 - ) as indexer: + with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer: indexer.build() result = StatisticsQueryUtility().process(gz_file, query_type="duration_stats") assert "duration_mean_us" in result @@ -78,9 +65,7 @@ def test_call_delegates_to_process(self): with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( - gz_file, index_path, build_bloom=True, index_threshold=0 - ) as indexer: + with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer: indexer.build() util = StatisticsQueryUtility() result = util(gz_file, query_type="summary") @@ -96,11 +81,10 @@ def test_summary_correct_events_without_bloom(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( + with NativeIndexer( gz_file, index_path, - build_bloom=True, - index_threshold=_SKIP_INDEX_THRESHOLD, + build_bloom=False, ) as indexer: indexer.build() assert not indexer.has_bloom @@ -113,11 +97,10 @@ def test_categories_populated_without_bloom(self): with Environment(lines=10) as env: gz_file = 
env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer( + with NativeIndexer( gz_file, index_path, - build_bloom=True, - index_threshold=_SKIP_INDEX_THRESHOLD, + build_bloom=False, ) as indexer: indexer.build() assert not indexer.has_bloom diff --git a/tests/python/test_trace_reader.py b/tests/python/test_trace_reader.py index 586fbd7a..8f627886 100644 --- a/tests/python/test_trace_reader.py +++ b/tests/python/test_trace_reader.py @@ -4,6 +4,7 @@ import pytest import dftracer.utils as dft_utils +from dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer from .common import Environment @@ -12,11 +13,11 @@ class TestTraceReaderCreation: """Construction and property tests.""" def test_creation_basic(self): - """TraceReader accepts a valid file path and exposes file_path.""" + """TraceReader accepts a valid file path and exposes path.""" with Environment() as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file) - assert reader.file_path == gz_file + assert reader.path == gz_file def test_creation_nonexistent_file(self): """TraceReader with nonexistent file creates but read_lines fails.""" @@ -37,7 +38,7 @@ def test_has_index_true_after_indexer_build(self): with Environment() as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer(gz_file, index_path) as indexer: + with NativeIndexer(gz_file, index_path) as indexer: indexer.build() # TraceReader probes for the index store at __init__ time reader = dft_utils.TraceReader(gz_file) @@ -65,12 +66,12 @@ def test_has_index_is_bool(self): reader = dft_utils.TraceReader(gz_file) assert isinstance(reader.has_index, bool) - def test_file_path_is_str(self): - """file_path property returns a str.""" + def test_path_is_str(self): + """path property returns a str.""" with Environment() as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file) - assert 
isinstance(reader.file_path, str) + assert isinstance(reader.path, str) class TestTraceReaderReadLines: @@ -85,13 +86,13 @@ def test_read_all_lines_default_args(self): assert isinstance(lines, list) assert len(lines) == 22 - def test_read_lines_returns_strings(self): - """Every element returned by read_lines() is a str.""" + def test_read_lines_returns_memoryviews(self): + """Every element returned by read_lines() is a memoryview.""" with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file) lines = reader.read_lines() - assert all(isinstance(line, str) for line in lines) + assert all(isinstance(line, memoryview) for line in lines) def test_read_lines_content_is_json(self): """Lines contain the JSON fields written by Environment.""" @@ -100,17 +101,19 @@ def test_read_lines_content_is_json(self): reader = dft_utils.TraceReader(gz_file) lines = reader.read_lines() for line in lines: - stripped = line.strip() - if stripped in ("[", "]"): + text = bytes(line).decode("utf-8").strip() + if text in ("[", "]"): continue - assert '"name"' in line + assert b'"name"' in bytes(line) def test_read_lines_explicit_zero_zero(self): """read_lines(0, 0) is equivalent to read_lines().""" with Environment(lines=15) as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file) - assert reader.read_lines(0, 0) == reader.read_lines() + a = [bytes(m) for m in reader.read_lines(0, 0)] + b = [bytes(m) for m in reader.read_lines()] + assert a == b def test_read_lines_with_range(self): """read_lines(start, end) returns a subset of lines.""" @@ -127,11 +130,10 @@ def test_read_lines_range_is_subset_of_all(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file) - all_lines = reader.read_lines() + all_bytes = [bytes(m) for m in reader.read_lines()] partial = reader.read_lines(start_line=3, end_line=8) - # Every line in the partial result must appear in 
all_lines for line in partial: - assert line in all_lines + assert bytes(line) in all_bytes def test_read_lines_negative_start_raises(self): """read_lines raises ValueError for a negative start_line.""" @@ -154,7 +156,7 @@ def test_read_lines_with_index(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() index_path = env.get_index_path(gz_file) - with dft_utils.Indexer(gz_file, index_path) as indexer: + with NativeIndexer(gz_file, index_path) as indexer: indexer.build() reader = dft_utils.TraceReader(gz_file) assert reader.has_index @@ -166,13 +168,13 @@ def test_read_lines_indexed_matches_sequential(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() # Sequential (no index) - sequential = dft_utils.TraceReader(gz_file).read_lines() + sequential = [bytes(m) for m in dft_utils.TraceReader(gz_file).read_lines()] # Build index, then read again index_path = env.get_index_path(gz_file) - with dft_utils.Indexer(gz_file, index_path) as indexer: + with NativeIndexer(gz_file, index_path) as indexer: indexer.build() - indexed = dft_utils.TraceReader(gz_file).read_lines() + indexed = [bytes(m) for m in dft_utils.TraceReader(gz_file).read_lines()] assert sequential == indexed @@ -226,7 +228,7 @@ def test_with_statement_properties_accessible(self): with Environment() as env: gz_file = env.create_test_gzip_file() with dft_utils.TraceReader(gz_file) as reader: - assert reader.file_path == gz_file + assert reader.path == gz_file assert isinstance(reader.has_index, bool) def test_with_statement_exit_does_not_raise(self): @@ -245,21 +247,14 @@ def test_custom_checkpoint_size_accepted(self): with Environment() as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file, checkpoint_size=1024 * 1024) - assert reader.file_path == gz_file + assert reader.path == gz_file def test_auto_build_index_accepted(self): """auto_build_index kwarg is accepted without error.""" with Environment() as env: gz_file = 
env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file, auto_build_index=False) - assert reader.file_path == gz_file - - def test_index_threshold_accepted(self): - """index_threshold kwarg is accepted without error.""" - with Environment() as env: - gz_file = env.create_test_gzip_file() - reader = dft_utils.TraceReader(gz_file, index_threshold=16 * 1024 * 1024) - assert reader.file_path == gz_file + assert reader.path == gz_file def test_all_optional_params_together(self): """All optional constructor params can be supplied simultaneously.""" @@ -271,9 +266,8 @@ def test_all_optional_params_together(self): index_dir=env.temp_dir, checkpoint_size=512 * 1024, auto_build_index=False, - index_threshold=4 * 1024 * 1024, ) - assert reader.file_path == gz_file + assert reader.path == gz_file assert reader.index_dir == env.temp_dir @@ -288,12 +282,12 @@ def test_iter_lines_returns_iterator(self): assert hasattr(it, "__iter__") assert hasattr(it, "__next__") - def test_iter_lines_yields_strings(self): + def test_iter_lines_yields_memoryviews(self): with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file) for line in reader.iter_lines(): - assert isinstance(line, str) + assert isinstance(line, memoryview) def test_iter_lines_count(self): with Environment(lines=20) as env: @@ -306,8 +300,8 @@ def test_iter_lines_matches_read_lines(self): with Environment(lines=15) as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file) - from_iter = list(reader.iter_lines()) - from_read = reader.read_lines() + from_iter = [bytes(m) for m in reader.iter_lines()] + from_read = [bytes(m) for m in reader.read_lines()] assert from_iter == from_read def test_iter_lines_with_range(self): @@ -356,12 +350,12 @@ def test_iter_raw_returns_iterator(self): assert hasattr(it, "__iter__") assert hasattr(it, "__next__") - def test_iter_raw_yields_bytes(self): + def test_iter_raw_yields_memoryviews(self): with 
Environment(lines=10) as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file) for chunk in reader.iter_raw(): - assert isinstance(chunk, bytes) + assert isinstance(chunk, memoryview) def test_iter_raw_single_line_mode(self): """multi_line=False yields one chunk per line.""" @@ -402,20 +396,20 @@ def test_iter_raw_negative_raises(self): class TestTraceReaderReadRaw: """read_raw() materialized list tests.""" - def test_read_raw_returns_list_of_bytes(self): + def test_read_raw_returns_list_of_memoryviews(self): with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file) chunks = reader.read_raw() assert isinstance(chunks, list) - assert all(isinstance(c, bytes) for c in chunks) + assert all(isinstance(c, memoryview) for c in chunks) def test_read_raw_matches_iter_raw(self): with Environment(lines=15) as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file) - from_read = reader.read_raw() - from_iter = list(reader.iter_raw()) + from_read = [bytes(m) for m in reader.read_raw()] + from_iter = [bytes(m) for m in reader.iter_raw()] assert from_read == from_iter def test_read_raw_single_line_count(self): @@ -457,10 +451,10 @@ def test_default_runtime_works(self): class TestTraceReaderJSON: - """JSON reading tests.""" + """JSON reading tests (shimmed via Arrow).""" def test_read_lines_json_returns_list(self): - """read_lines_json returns a list of JSON objects.""" + """read_lines_json returns a list of dicts.""" with Environment(lines=32) as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file) @@ -469,25 +463,25 @@ def test_read_lines_json_returns_list(self): assert len(result) == 32 def test_read_lines_json_objects_have_keys(self): - """Each JSON object has expected keys.""" + """Each dict has expected keys.""" with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file) result = 
reader.read_lines_json() for obj in result: + assert isinstance(obj, dict) assert "name" in obj assert "cat" in obj assert "dur" in obj def test_read_lines_json_values_correct(self): - """JSON values match what was written.""" + """Dict values match what was written.""" with Environment(lines=5) as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file) result = reader.read_lines_json() assert result[0]["name"] == "write" assert result[0]["cat"] == "POSIX" - assert result[0]["ph"] == "X" def test_iter_lines_json_is_lazy(self): """iter_lines_json returns an iterator, not a list.""" @@ -508,7 +502,6 @@ def test_iter_lines_json_partial_iteration(self): it = reader.iter_lines_json() first = next(it) assert "name" in first - # Don't exhaust the iterator def test_read_lines_json_with_line_range(self): """read_lines_json respects start_line/end_line.""" @@ -516,8 +509,6 @@ def test_read_lines_json_with_line_range(self): gz_file = env.create_test_gzip_file() env.build_index(gz_file) reader = dft_utils.TraceReader(gz_file) - # Lines are 1-indexed; line 1 is "[", line 2 is first JSON, etc. 
- # But iter_lines_json skips non-JSON lines, so we get JSON objects all_json = reader.read_lines_json() subset = reader.read_lines_json(start_line=1, end_line=10) assert len(subset) <= len(all_json) @@ -579,7 +570,6 @@ def test_num_lines_property_still_works(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file) - # Property should still work (falls back to reading all lines) assert reader.num_lines > 0 @@ -592,7 +582,6 @@ def test_end_line_beyond_total_clamped(self): gz_file = env.create_test_gzip_file() env.build_index(gz_file) reader = dft_utils.TraceReader(gz_file) - # Request way more lines than exist result = reader.read_lines(start_line=1, end_line=99999) assert len(result) > 0 @@ -690,7 +679,7 @@ def test_query_by_name(self): filtered = reader.read_lines(query='name == "read"') assert len(filtered) > 0 for line in filtered: - assert '"name":"read"' in line + assert b'"name":"read"' in bytes(line) def test_query_and(self): with Environment() as env: @@ -732,7 +721,7 @@ def test_iter_lines_with_query(self): lines = list(reader.iter_lines(query='name == "write"')) assert len(lines) > 0 for line in lines: - assert '"name":"write"' in line + assert b'"name":"write"' in bytes(line) def test_iter_lines_json_with_query(self): with Environment() as env: @@ -756,8 +745,8 @@ def test_query_with_field_class(self): filtered = reader.read_lines(query=str(q)) assert len(filtered) > 0 for line in filtered: - assert '"cat":"IO"' in line - assert '"name":"read"' in line + assert b'"cat":"IO"' in bytes(line) + assert b'"name":"read"' in bytes(line) if __name__ == "__main__": diff --git a/tests/python/test_trace_reader_arrow.py b/tests/python/test_trace_reader_arrow.py index 822554c5..b6bcb21a 100644 --- a/tests/python/test_trace_reader_arrow.py +++ b/tests/python/test_trace_reader_arrow.py @@ -3,6 +3,7 @@ import dftracer.utils as dft_utils from dftracer.utils.arrow import ArrowBatch, ArrowTable +from 
dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer from .common import Environment @@ -69,7 +70,7 @@ def test_iter_arrow_with_line_range(self): gz_file = env.create_test_gzip_file() # Build index for line-based access index_path = env.get_index_path(gz_file) - with dft_utils.Indexer(gz_file, index_path) as indexer: + with NativeIndexer(gz_file, index_path) as indexer: indexer.build() with dft_utils.TraceReader(gz_file) as reader: batches = list(reader.iter_arrow(start_line=10, end_line=20, batch_size=100)) @@ -120,6 +121,265 @@ def test_read_arrow_properties(self): assert not table.empty +class TestIterArrowStream: + """Tests for TraceReader.iter_arrow_stream().""" + + def test_iter_arrow_stream_exposes_c_stream(self): + """iter_arrow_stream returns an object with __arrow_c_stream__.""" + with Environment(lines=20) as env: + gz_file = env.create_test_gzip_file() + with dft_utils.TraceReader(gz_file) as reader: + stream = reader.iter_arrow_stream(batch_size=100) + assert hasattr(stream, "__arrow_c_stream__") + + def test_iter_arrow_stream_row_count_matches_iter_arrow(self): + """Stream drains the same row count as the per-batch iterator.""" + import pyarrow as pa + + with Environment(lines=50) as env: + gz_file = env.create_test_gzip_file() + with dft_utils.TraceReader(gz_file) as reader: + batches_expected = list(reader.iter_arrow(batch_size=20)) + expected_rows = sum(b.num_rows for b in batches_expected) + + with dft_utils.TraceReader(gz_file) as reader: + stream = reader.iter_arrow_stream(batch_size=20) + rbr = pa.RecordBatchReader.from_stream(stream) + total = sum(b.num_rows for b in rbr) + assert total == expected_rows + assert total == 50 + + def test_iter_arrow_stream_schema_matches_iter_arrow(self): + """Stream schema equals iter_arrow's schema plus the _extra catch-all column.""" + import pyarrow as pa + + with Environment(lines=20) as env: + gz_file = env.create_test_gzip_file() + with dft_utils.TraceReader(gz_file) as reader: + 
expected_batch = pa.record_batch(next(iter(reader.iter_arrow(batch_size=100)))) + expected_names = list(expected_batch.schema.names) + + with dft_utils.TraceReader(gz_file) as reader: + stream = reader.iter_arrow_stream(batch_size=100) + rbr = pa.RecordBatchReader.from_stream(stream) + stream_names = list(rbr.schema.names) + assert stream_names[-1] == "_extra" + assert set(stream_names[:-1]) == set(expected_names) + + def test_iter_arrow_stream_pa_table(self): + """pa.table(stream) materializes a full Table.""" + import pyarrow as pa + + with Environment(lines=40) as env: + gz_file = env.create_test_gzip_file() + with dft_utils.TraceReader(gz_file) as reader: + stream = reader.iter_arrow_stream(batch_size=10) + table = pa.table(stream) + assert isinstance(table, pa.Table) + assert table.num_rows == 40 + + def test_iter_arrow_stream_single_use(self): + """__arrow_c_stream__ can only be consumed once.""" + with Environment(lines=5) as env: + gz_file = env.create_test_gzip_file() + with dft_utils.TraceReader(gz_file) as reader: + stream = reader.iter_arrow_stream(batch_size=100) + stream.__arrow_c_stream__() + try: + stream.__arrow_c_stream__() + raise AssertionError("expected RuntimeError on second consume") + except RuntimeError: + pass + + def test_iter_arrow_stream_survives_early_drop(self): + """Dropping an in-flight batch must not double-free the stream.""" + import gc + + import pyarrow as pa + + with Environment(lines=30) as env: + gz_file = env.create_test_gzip_file() + with dft_utils.TraceReader(gz_file) as reader: + stream = reader.iter_arrow_stream(batch_size=5) + rbr = pa.RecordBatchReader.from_stream(stream) + first = next(iter(rbr)) + assert first.num_rows > 0 + # Drop the first batch while stream is still live. 
+ del first + gc.collect() + total = sum(b.num_rows for b in rbr) + assert total >= 0 + + def test_read_arrow_uses_stream_path(self): + """read_arrow still produces a correct ArrowTable via the stream.""" + with Environment(lines=30) as env: + gz_file = env.create_test_gzip_file() + with dft_utils.TraceReader(gz_file) as reader: + table = reader.read_arrow(batch_size=10) + assert isinstance(table, ArrowTable) + assert table.num_rows == 30 + + +class TestIterArrowStreamReconciliation: + """Stream emits a single locked schema across batches with diverging columns.""" + + @staticmethod + def _write_trace(path, rows): + import gzip + import json + import os + + os.makedirs(os.path.dirname(path), exist_ok=True) + with gzip.open(path, "wt", encoding="utf-8") as f: + for r in rows: + f.write(json.dumps(r) + "\n") + + def _make_divergent_dir(self, env): + """Directory whose top-level keys differ across files. `args` is + serialized as a single JSON string column by the Arrow builder, so + divergence has to be at the top level to reach the reconciler.""" + import os + + base = os.path.join(env.temp_dir, "divergent") + common = {"name": "read", "cat": "POSIX", "pid": 1, "tid": 1, "dur": 10, "ph": "X"} + rows_a = [{**common, "ts": i, "only_a_int": i * 3} for i in range(30)] + rows_b = [{**common, "ts": i, "only_b_str": f"b-{i}"} for i in range(30)] + rows_c = [{**common, "ts": i, "only_c_dbl": float(i) * 0.5} for i in range(30)] + self._write_trace(os.path.join(base, "a.pfw.gz"), rows_a) + self._write_trace(os.path.join(base, "b.pfw.gz"), rows_b) + self._write_trace(os.path.join(base, "c.pfw.gz"), rows_c) + for f in ("a.pfw.gz", "b.pfw.gz", "c.pfw.gz"): + gz = os.path.join(base, f) + env.test_files.append(gz) + idx = env.get_index_path(gz) + with NativeIndexer(gz, idx) as indexer: + indexer.build() + return base + + def test_stream_schema_has_extra_column(self): + import pyarrow as pa + + with Environment() as env: + gz_file = env.create_test_gzip_file() + with 
dft_utils.TraceReader(gz_file) as reader: + stream = reader.iter_arrow_stream(batch_size=100) + rbr = pa.RecordBatchReader.from_stream(stream) + assert "_extra" in rbr.schema.names + + def test_stream_survives_divergent_schemas(self): + """Directory-mode stream with differing args shapes must not error.""" + import pyarrow as pa + + with Environment() as env: + data_dir = self._make_divergent_dir(env) + with dft_utils.TraceReader(data_dir) as reader: + stream = reader.iter_arrow_stream(batch_size=25) + rbr = pa.RecordBatchReader.from_stream(stream) + table = rbr.read_all() + assert table.num_rows == 90 + + def test_stream_preserves_all_column_data(self): + """Each file's unique column ends up either as a native column (with + nulls for the other files) or in _extra JSON. No data is lost.""" + import pyarrow as pa + + with Environment() as env: + data_dir = self._make_divergent_dir(env) + with dft_utils.TraceReader(data_dir) as reader: + stream = reader.iter_arrow_stream(batch_size=25) + rbr = pa.RecordBatchReader.from_stream(stream) + table = rbr.read_all() + + names = set(table.schema.names) + assert "_extra" in names + + def hits_for(colname): + if colname in names: + return sum(1 for v in table.column(colname).to_pylist() if v is not None) + extras = table.column("_extra").to_pylist() + return sum(1 for e in extras if e and colname in e) + + # Each file's unique column must appear 30 times, either natively + # or via _extra — the reconciler preserves every value. 
+ assert hits_for("only_a_int") == 30 + assert hits_for("only_b_str") == 30 + assert hits_for("only_c_dbl") == 30 + + def test_stream_matches_pa_table_from_stream(self): + """pa.table(stream) yields the same row count as RecordBatchReader.read_all.""" + import pyarrow as pa + + with Environment() as env: + data_dir = self._make_divergent_dir(env) + with dft_utils.TraceReader(data_dir) as reader: + stream = reader.iter_arrow_stream(batch_size=25) + table = pa.table(stream) + assert table.num_rows == 90 + + def test_stream_empty_result_has_schema(self): + """Empty stream still exposes a schema with _extra so callers don't crash.""" + import pyarrow as pa + + with Environment(lines=10) as env: + gz_file = env.create_test_gzip_file() + with dft_utils.TraceReader(gz_file) as reader: + stream = reader.iter_arrow_stream(batch_size=100, query="pid == 99999999") + rbr = pa.RecordBatchReader.from_stream(stream) + assert "_extra" in rbr.schema.names + total = sum(b.num_rows for b in rbr) + assert total == 0 + + def test_stream_flatten_promotes_nested_keys(self): + """flatten_objects=True expands top-level object values one level.""" + import pyarrow as pa + + with Environment(lines=20) as env: + gz_file = env.create_dft_trace_file() + with dft_utils.TraceReader(gz_file) as reader: + stream = reader.iter_arrow_stream(batch_size=100, flatten_objects=True) + rbr = pa.RecordBatchReader.from_stream(stream) + table = rbr.read_all() + names = set(table.schema.names) + # args.ret and args.hhash should be promoted to native typed columns. + assert "args.ret" in names + assert "args.hhash" in names + # Native type survives the reconciler; values must round-trip. 
+ rets = table.column("args.ret").to_pylist() + assert all(isinstance(v, int) for v in rets if v is not None) + assert rets[0] == 1024 + + def test_stream_no_flatten_keeps_args_as_json(self): + """flatten_objects=False leaves `args` as a single JSON string column.""" + import pyarrow as pa + + with Environment(lines=10) as env: + gz_file = env.create_dft_trace_file() + with dft_utils.TraceReader(gz_file) as reader: + stream = reader.iter_arrow_stream(batch_size=100, flatten_objects=False) + rbr = pa.RecordBatchReader.from_stream(stream) + table = rbr.read_all() + names = set(table.schema.names) + assert "args" in names + assert "args.ret" not in names + first = table.column("args").to_pylist()[0] + assert isinstance(first, str) + assert first.startswith("{") and "ret" in first + + def test_stream_extra_is_null_when_no_divergence(self): + """_extra should be all-null when every batch matches the discovered schema.""" + import pyarrow as pa + + with Environment(lines=40) as env: + gz_file = env.create_test_gzip_file() + with dft_utils.TraceReader(gz_file) as reader: + stream = reader.iter_arrow_stream(batch_size=10) + rbr = pa.RecordBatchReader.from_stream(stream) + table = rbr.read_all() + assert table.num_rows == 40 + extra = table.column("_extra") + assert extra.null_count == extra.length() + + class TestArrowBatchWrapper: """Tests for the ArrowBatch Python wrapper.""" diff --git a/tests/python/test_trace_reader_directory.py b/tests/python/test_trace_reader_directory.py new file mode 100644 index 00000000..3481d6fb --- /dev/null +++ b/tests/python/test_trace_reader_directory.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python3 +"""Test cases for directory-level parallel TraceReader (iter_arrow / read_arrow).""" + +import os + +import dftracer.utils as dft_utils +from dftracer.utils.arrow import ArrowTable + +from .common import Environment + + +def _create_directory_with_files(env, num_files=3, lines_per_file=20, nested=False): + """Create multiple .pfw.gz files in 
env.temp_dir, optionally in subdirectories.""" + files = [] + for i in range(num_files): + if nested: + subdir = f"rank_{i}" + filename = os.path.join(subdir, f"trace_{i}.pfw.gz") + else: + filename = f"trace_{i}.pfw.gz" + f = env.create_dft_trace_file(filename=filename, num_events=lines_per_file) + files.append(f) + return files + + +class TestDirectoryIterArrow: + """Tests for TraceReader.iter_arrow() with a directory path.""" + + def test_iter_arrow_directory_returns_batches(self): + """iter_arrow on a directory yields Arrow batches from all files.""" + with Environment(lines=20) as env: + _create_directory_with_files(env, num_files=3, lines_per_file=20) + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + batches = list(reader.iter_arrow(batch_size=100)) + rt.shutdown() + assert len(batches) >= 1 + total_rows = sum(b.num_rows for b in batches) + assert total_rows == 60 + + def test_iter_arrow_directory_single_file(self): + """Directory with one file produces same results as single-file path.""" + with Environment(lines=30) as env: + files = _create_directory_with_files(env, num_files=1, lines_per_file=30) + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as dir_reader: + dir_batches = list(dir_reader.iter_arrow(batch_size=100)) + with dft_utils.TraceReader(files[0], runtime=rt) as file_reader: + file_batches = list(file_reader.iter_arrow(batch_size=100)) + rt.shutdown() + dir_rows = sum(b.num_rows for b in dir_batches) + file_rows = sum(b.num_rows for b in file_batches) + assert dir_rows == file_rows == 30 + + def test_iter_arrow_directory_nested_subdirs(self): + """iter_arrow discovers .pfw.gz files in nested subdirectories.""" + with Environment(lines=10) as env: + _create_directory_with_files(env, num_files=4, lines_per_file=10, nested=True) + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + batches = 
list(reader.iter_arrow(batch_size=100)) + rt.shutdown() + total_rows = sum(b.num_rows for b in batches) + assert total_rows == 40 + + def test_iter_arrow_directory_batch_size(self): + """Batch size is respected when reading from a directory.""" + with Environment(lines=25) as env: + _create_directory_with_files(env, num_files=3, lines_per_file=25) + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + batches = list(reader.iter_arrow(batch_size=10)) + rt.shutdown() + for b in batches: + assert b.num_rows <= 10 + total_rows = sum(b.num_rows for b in batches) + assert total_rows == 75 + + def test_iter_arrow_directory_empty(self): + """Directory with no .pfw.gz files yields no batches.""" + with Environment(lines=10) as env: + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + batches = list(reader.iter_arrow(batch_size=100)) + rt.shutdown() + assert len(batches) == 0 + + +class TestDirectoryReadArrow: + """Tests for TraceReader.read_arrow() with a directory path.""" + + def test_read_arrow_directory(self): + """read_arrow on a directory returns an ArrowTable with all rows.""" + with Environment(lines=15) as env: + _create_directory_with_files(env, num_files=4, lines_per_file=15) + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + table = reader.read_arrow(batch_size=100) + rt.shutdown() + assert isinstance(table, ArrowTable) + assert table.num_rows == 60 + + def test_read_arrow_directory_properties(self): + """ArrowTable from directory has correct properties.""" + with Environment(lines=10) as env: + _create_directory_with_files(env, num_files=2, lines_per_file=10) + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + table = reader.read_arrow(batch_size=100) + rt.shutdown() + assert table.num_rows == 20 + assert table.num_batches >= 1 + assert not table.empty + + 
+class TestDirectoryWithQuery: + """Tests for directory reading with query filtering.""" + + def test_directory_query_filters_events(self): + """Query filtering works across directory files.""" + with Environment(lines=50) as env: + _create_directory_with_files(env, num_files=3, lines_per_file=50) + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + table_all = reader.read_arrow(batch_size=1000) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + table_filtered = reader.read_arrow(batch_size=1000, query='name == "read"') + rt.shutdown() + assert table_all.num_rows == 150 + assert table_filtered.num_rows > 0 + assert table_filtered.num_rows < table_all.num_rows + + def test_directory_query_no_match(self): + """Query that matches nothing returns empty table.""" + with Environment(lines=20) as env: + _create_directory_with_files(env, num_files=2, lines_per_file=20) + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + table = reader.read_arrow(batch_size=100, query='name == "nonexistent_op"') + rt.shutdown() + assert table.num_rows == 0 + + +class TestDirectoryIterJson: + """Tests for TraceReader.iter_json() with a directory path.""" + + def test_iter_json_directory_returns_events(self): + """iter_json on a directory yields JsonDictValue events from all files.""" + with Environment(lines=20) as env: + _create_directory_with_files(env, num_files=3, lines_per_file=20) + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + events = list(reader.iter_json()) + rt.shutdown() + assert len(events) == 60 + for ev in events: + assert "name" in ev + + def test_iter_json_directory_single_file(self): + """Directory with one file matches single-file iter_json.""" + with Environment(lines=30) as env: + files = _create_directory_with_files(env, num_files=1, lines_per_file=30) + rt = dft_utils.Runtime(threads=2) + with 
dft_utils.TraceReader(env.temp_dir, runtime=rt) as dir_reader: + dir_events = list(dir_reader.iter_json()) + with dft_utils.TraceReader(files[0], runtime=rt) as file_reader: + file_events = list(file_reader.iter_json()) + rt.shutdown() + assert len(dir_events) == len(file_events) == 30 + + def test_iter_json_directory_nested_subdirs(self): + """iter_json discovers .pfw.gz files in nested subdirectories.""" + with Environment(lines=10) as env: + _create_directory_with_files(env, num_files=4, lines_per_file=10, nested=True) + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + events = list(reader.iter_json()) + rt.shutdown() + assert len(events) == 40 + + def test_iter_json_directory_empty(self): + """Directory with no .pfw.gz files yields no events.""" + with Environment(lines=10) as env: + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + events = list(reader.iter_json()) + rt.shutdown() + assert len(events) == 0 + + def test_read_json_directory(self): + """read_json on a directory returns all events.""" + with Environment(lines=15) as env: + _create_directory_with_files(env, num_files=4, lines_per_file=15) + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + events = reader.read_json() + rt.shutdown() + assert len(events) == 60 + + def test_iter_json_directory_to_dict(self): + """JsonDictValue.to_dict() works for directory-sourced events.""" + with Environment(lines=10) as env: + _create_directory_with_files(env, num_files=2, lines_per_file=10) + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + events = list(reader.iter_json()) + rt.shutdown() + for ev in events: + d = ev.to_dict() + assert isinstance(d, dict) + assert "name" in d + + +class TestDirectoryIterLines: + """Tests for TraceReader.iter_lines() with a directory path.""" + + def 
test_iter_lines_directory_returns_lines(self): + """iter_lines on a directory yields memoryview lines from all files.""" + with Environment(lines=20) as env: + _create_directory_with_files(env, num_files=3, lines_per_file=20) + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + lines = list(reader.iter_lines()) + rt.shutdown() + assert len(lines) == 60 + for line in lines: + assert isinstance(line, memoryview) + + def test_read_lines_directory(self): + """read_lines on a directory returns all lines.""" + with Environment(lines=15) as env: + _create_directory_with_files(env, num_files=4, lines_per_file=15) + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + lines = reader.read_lines() + rt.shutdown() + assert len(lines) == 60 + + def test_iter_lines_directory_empty(self): + """Directory with no .pfw.gz files yields no lines.""" + with Environment(lines=10) as env: + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + lines = list(reader.iter_lines()) + rt.shutdown() + assert len(lines) == 0 + + +class TestDirectoryIterRaw: + """Tests for TraceReader.iter_raw() with a directory path.""" + + def test_iter_raw_directory_returns_chunks(self): + """iter_raw on a directory yields memoryview chunks from all files.""" + with Environment(lines=20) as env: + _create_directory_with_files(env, num_files=3, lines_per_file=20) + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + chunks = list(reader.iter_raw()) + rt.shutdown() + assert len(chunks) >= 1 + for chunk in chunks: + assert isinstance(chunk, memoryview) + assert len(chunk) > 0 + + def test_iter_raw_directory_empty(self): + """Directory with no .pfw.gz files yields no chunks.""" + with Environment(lines=10) as env: + rt = dft_utils.Runtime(threads=2) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + 
chunks = list(reader.iter_raw()) + rt.shutdown() + assert len(chunks) == 0 + + +class TestDirectoryMultiThreaded: + """Tests for directory reading with various thread counts.""" + + def test_directory_single_thread(self): + """Directory reading works with single thread.""" + with Environment(lines=20) as env: + _create_directory_with_files(env, num_files=3, lines_per_file=20) + rt = dft_utils.Runtime(threads=1) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + table = reader.read_arrow(batch_size=100) + rt.shutdown() + assert table.num_rows == 60 + + def test_directory_many_threads(self): + """Directory reading works with more threads than files.""" + with Environment(lines=10) as env: + _create_directory_with_files(env, num_files=2, lines_per_file=10) + rt = dft_utils.Runtime(threads=8) + with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader: + table = reader.read_arrow(batch_size=100) + rt.shutdown() + assert table.num_rows == 20 diff --git a/tests/python/test_trace_reader_write_arrow.py b/tests/python/test_trace_reader_write_arrow.py new file mode 100644 index 00000000..5e8645ba --- /dev/null +++ b/tests/python/test_trace_reader_write_arrow.py @@ -0,0 +1,490 @@ +"""Tests for TraceReader.write_arrow with bloom filter pruning.""" + +import os +import tempfile + +import pyarrow as pa +import pyarrow.ipc as ipc +import pytest + +import dftracer.utils as dft_utils + +from .common import Environment + + +class TestTraceReaderWriteArrow: + """Test TraceReader.write_arrow functionality.""" + + def test_write_arrow_basic(self): + """Basic write_arrow produces readable Arrow IPC files.""" + with Environment(lines=50) as env: + gz_file = env.create_test_gzip_file() + env.build_index(gz_file) + reader = dft_utils.TraceReader(gz_file) + + with tempfile.TemporaryDirectory() as output_dir: + result = reader.write_arrow(output_dir) + + assert "partitions" in result + assert "total_rows" in result + assert "total_bytes" in result + assert 
"chunks_scanned" in result + assert "chunks_skipped" in result + + assert result["total_rows"] > 0 + + for view_name, stats in result["partitions"].items(): + assert "files" in stats + assert "rows" in stats + assert len(stats["files"]) > 0 + + for arrow_file in stats["files"]: + assert os.path.exists(arrow_file) + reader_ipc = ipc.open_file(arrow_file) + table = reader_ipc.read_all() + assert table.num_rows > 0 + + def test_write_arrow_predefined_views(self): + """Test predefined views (io) - view may filter some events.""" + with Environment(lines=50) as env: + gz_file = env.create_test_gzip_file() + env.build_index(gz_file) + reader = dft_utils.TraceReader(gz_file) + + with tempfile.TemporaryDirectory() as output_dir: + result = reader.write_arrow(output_dir, views=["io"]) + + assert "io" in result["partitions"] + stats = result["partitions"]["io"] + + if stats["rows"] > 0: + for arrow_file in stats["files"]: + reader_ipc = ipc.open_file(arrow_file) + table = reader_ipc.read_all() + assert table.num_rows > 0 + cats = table.column("cat").to_pylist() + for cat in cats: + assert cat in ["POSIX", "STDIO"] + + def test_write_arrow_custom_query(self): + """Test custom query view.""" + with Environment(lines=50) as env: + gz_file = env.create_test_gzip_file() + env.build_index(gz_file) + reader = dft_utils.TraceReader(gz_file) + + with tempfile.TemporaryDirectory() as output_dir: + views = [{"name": "reads", "query": 'name == "read"'}] + result = reader.write_arrow(output_dir, views=views) + + assert "reads" in result["partitions"] + stats = result["partitions"]["reads"] + + if stats["rows"] > 0: + for arrow_file in stats["files"]: + reader_ipc = ipc.open_file(arrow_file) + table = reader_ipc.read_all() + names = table.column("name").to_pylist() + for name in names: + assert name == "read" + + def test_write_arrow_bloom_filter_pruning(self): + """Verify bloom filter pruning returns stats.""" + with Environment(lines=100) as env: + gz_file = env.create_test_gzip_file() 
+ env.build_index(gz_file) + reader = dft_utils.TraceReader(gz_file) + + with tempfile.TemporaryDirectory() as output_dir: + views = [{"name": "posix_only", "query": 'cat == "POSIX"'}] + result = reader.write_arrow(output_dir, views=views) + + assert "chunks_scanned" in result + assert "chunks_skipped" in result + assert result["chunks_scanned"] >= 0 + assert result["chunks_skipped"] >= 0 + + def test_write_arrow_multiple_views(self): + """Test multiple views in single call.""" + with Environment(lines=50) as env: + gz_file = env.create_test_gzip_file() + env.build_index(gz_file) + reader = dft_utils.TraceReader(gz_file) + + with tempfile.TemporaryDirectory() as output_dir: + views = [ + {"name": "reads", "query": 'name == "read"'}, + {"name": "writes", "query": 'name == "write"'}, + ] + result = reader.write_arrow(output_dir, views=views) + + assert "reads" in result["partitions"] + assert "writes" in result["partitions"] + + reads_dir = os.path.join(output_dir, "reads") + writes_dir = os.path.join(output_dir, "writes") + assert os.path.isdir(reads_dir) + assert os.path.isdir(writes_dir) + + def test_write_arrow_compression(self): + """Test different compression options.""" + with Environment(lines=30) as env: + gz_file = env.create_test_gzip_file() + env.build_index(gz_file) + reader = dft_utils.TraceReader(gz_file) + + with tempfile.TemporaryDirectory() as output_dir: + result_zstd = reader.write_arrow( + os.path.join(output_dir, "zstd"), compression="zstd" + ) + result_none = reader.write_arrow( + os.path.join(output_dir, "none"), compression="none" + ) + + assert result_zstd["total_rows"] == result_none["total_rows"] + + zstd_files = result_zstd["partitions"]["all"]["files"] + none_files = result_none["partitions"]["all"]["files"] + + zstd_size = sum(os.path.getsize(f) for f in zstd_files) + none_size = sum(os.path.getsize(f) for f in none_files) + + assert zstd_size < none_size + + def test_write_arrow_chunk_size(self): + """Test chunk_size_mb controls file 
splitting.""" + with Environment(lines=100) as env: + gz_file = env.create_test_gzip_file(bytes_per_line=4096) + env.build_index(gz_file) + reader = dft_utils.TraceReader(gz_file) + + with tempfile.TemporaryDirectory() as output_dir: + result = reader.write_arrow(output_dir, chunk_size_mb=0) + + stats = result["partitions"]["all"] + assert len(stats["files"]) == 1 + + def test_write_arrow_no_metadata(self): + """Test include_metadata=False excludes metadata events.""" + with Environment(lines=30) as env: + gz_file = env.create_test_gzip_file() + env.build_index(gz_file) + reader = dft_utils.TraceReader(gz_file) + + with tempfile.TemporaryDirectory() as output_dir: + views = [{"name": "no_meta", "query": 'cat == "POSIX"', "include_metadata": False}] + result = reader.write_arrow(output_dir, views=views) + + assert "no_meta" in result["partitions"] + + +class TestElasticArrowSchema: + """Test elastic Arrow schema with varying event fields.""" + + def test_varying_schema_single_file(self): + """Events with different fields produce consistent Arrow schema.""" + with Environment(lines=500) as env: + gz_file = env.create_varying_schema_file() + env.build_index(gz_file, checkpoint_size_bytes=4 * 1024) + reader = dft_utils.TraceReader(gz_file, checkpoint_size=4 * 1024) + + with tempfile.TemporaryDirectory() as output_dir: + result = reader.write_arrow(output_dir) + + assert result["total_rows"] > 0 + stats = result["partitions"]["all"] + assert len(stats["files"]) >= 1 + + schemas = [] + for arrow_file in stats["files"]: + reader_ipc = ipc.open_file(arrow_file) + schemas.append(reader_ipc.schema) + + if len(schemas) > 1: + first_schema = schemas[0] + for i, schema in enumerate(schemas[1:], 1): + assert schema.equals(first_schema), ( + f"Schema mismatch between file 0 and file {i}" + ) + + def test_varying_schema_column_order_stable(self): + """Column order remains consistent across batches.""" + with Environment(lines=1000) as env: + gz_file = 
env.create_varying_schema_file(num_events=1000) + env.build_index(gz_file, checkpoint_size_bytes=2 * 1024) + reader = dft_utils.TraceReader(gz_file, checkpoint_size=2 * 1024) + + with tempfile.TemporaryDirectory() as output_dir: + result = reader.write_arrow(output_dir, batch_size=100) + + assert result["total_rows"] > 0 + stats = result["partitions"]["all"] + + all_tables = [] + for arrow_file in stats["files"]: + reader_ipc = ipc.open_file(arrow_file) + all_tables.append(reader_ipc.read_all()) + + if len(all_tables) > 1: + first_columns = all_tables[0].column_names + for i, table in enumerate(all_tables[1:], 1): + assert table.column_names == first_columns, ( + f"Column order mismatch between table 0 and table {i}" + ) + + def test_varying_schema_null_backfill(self): + """Fields not present in all events are backfilled with nulls.""" + with Environment(lines=500) as env: + gz_file = env.create_varying_schema_file() + env.build_index(gz_file, checkpoint_size_bytes=4 * 1024) + reader = dft_utils.TraceReader(gz_file, checkpoint_size=4 * 1024) + + with tempfile.TemporaryDirectory() as output_dir: + result = reader.write_arrow(output_dir) + + stats = result["partitions"]["all"] + tables = [ipc.open_file(f).read_all() for f in stats["files"]] + combined = pa.concat_tables(tables) + + if "rare_field" in combined.column_names: + rare_col = combined.column("rare_field") + null_count = rare_col.null_count + assert null_count > 0, "rare_field should have null values" + assert null_count < len(rare_col), "rare_field should have some non-null values" + + def test_varying_schema_pyarrow_concat(self): + """Multiple IPC files can be concatenated with pyarrow.""" + with Environment(lines=1000) as env: + gz_file = env.create_varying_schema_file(num_events=1000) + env.build_index(gz_file, checkpoint_size_bytes=2 * 1024) + reader = dft_utils.TraceReader(gz_file, checkpoint_size=2 * 1024) + + with tempfile.TemporaryDirectory() as output_dir: + result = reader.write_arrow(output_dir) 
+ + stats = result["partitions"]["all"] + if len(stats["files"]) > 1: + tables = [ipc.open_file(f).read_all() for f in stats["files"]] + combined = pa.concat_tables(tables) + assert combined.num_rows == result["total_rows"] + + +class TestTraceReaderWriteArrowDask: + """Test write_arrow integration with Dask.""" + + def test_write_arrow_dask_read(self): + """Verify Arrow output is readable by Dask.""" + pytest.importorskip("dask") + pytest.importorskip("dask.dataframe") + import dask.dataframe as dd + + with Environment(lines=50) as env: + gz_file = env.create_test_gzip_file() + env.build_index(gz_file) + reader = dft_utils.TraceReader(gz_file) + + with tempfile.TemporaryDirectory() as output_dir: + result = reader.write_arrow(output_dir) + + arrow_files = result["partitions"]["all"]["files"] + assert len(arrow_files) > 0 + + tables = [] + for f in arrow_files: + reader_ipc = ipc.open_file(f) + tables.append(reader_ipc.read_all()) + + combined = pa.concat_tables(tables) + pdf = combined.to_pandas() + + ddf = dd.from_pandas(pdf, npartitions=2) + assert len(ddf) == result["total_rows"] + + def test_write_arrow_parallel_views_dask(self): + """Test reading multiple view outputs with Dask.""" + pytest.importorskip("dask") + pytest.importorskip("dask.dataframe") + import dask.dataframe as dd + + with Environment(lines=50) as env: + gz_file = env.create_test_gzip_file() + env.build_index(gz_file) + reader = dft_utils.TraceReader(gz_file) + + with tempfile.TemporaryDirectory() as output_dir: + views = [ + {"name": "posix", "query": 'cat == "POSIX"'}, + {"name": "stdio", "query": 'cat == "STDIO"'}, + ] + result = reader.write_arrow(output_dir, views=views) + + for view_name in ["posix", "stdio"]: + if result["partitions"][view_name]["rows"] > 0: + files = result["partitions"][view_name]["files"] + tables = [ipc.open_file(f).read_all() for f in files] + combined = pa.concat_tables(tables) + ddf = dd.from_pandas(combined.to_pandas(), npartitions=1) + assert len(ddf) == 
result["partitions"][view_name]["rows"] + + +class TestTraceReaderViewChunks: + """Test get_view_chunks and write_view_chunk APIs.""" + + def test_get_view_chunks_basic(self): + """Test get_view_chunks returns chunk metadata.""" + with Environment(lines=50) as env: + gz_file = env.create_test_gzip_file() + env.build_index(gz_file) + reader = dft_utils.TraceReader(gz_file) + + result = reader.get_view_chunks(view={"name": "all", "query": 'cat == "POSIX"'}) + + assert "chunks" in result + assert "total_checkpoints" in result + assert "skipped_checkpoints" in result + assert "file_may_match" in result + assert result["total_checkpoints"] >= 0 + + def test_write_view_chunk_basic(self): + """Test write_view_chunk writes Arrow IPC file.""" + with Environment(lines=50) as env: + gz_file = env.create_test_gzip_file() + env.build_index(gz_file) + reader = dft_utils.TraceReader(gz_file) + + chunks_result = reader.get_view_chunks() + if not chunks_result["chunks"]: + pytest.skip("No chunks to process") + + chunk = chunks_result["chunks"][0] + + with tempfile.TemporaryDirectory() as output_dir: + output_file = os.path.join(output_dir, "chunk-00000.arrow") + result = reader.write_view_chunk( + output_file=output_file, + checkpoint_idx=chunk["checkpoint_idx"], + start_byte=chunk["start_byte"], + end_byte=chunk["end_byte"], + ) + + assert "output_file" in result + assert "rows_written" in result + assert os.path.exists(result["output_file"]) + + if result["rows_written"] > 0: + reader_ipc = ipc.open_file(result["output_file"]) + table = reader_ipc.read_all() + assert table.num_rows == result["rows_written"] + + def test_write_view_chunks_parallel(self): + """Test write_view_chunks processes multiple chunks in parallel.""" + with Environment(lines=5000) as env: + gz_file = env.create_test_gzip_file(bytes_per_line=512) + env.build_index(gz_file, checkpoint_size_bytes=4 * 1024) + reader = dft_utils.TraceReader(gz_file, checkpoint_size=4 * 1024) + + chunks_result = 
reader.get_view_chunks() + if len(chunks_result["chunks"]) < 2: + pytest.skip("Need at least 2 chunks for parallel test") + + chunks = chunks_result["chunks"][:4] + + with tempfile.TemporaryDirectory() as output_dir: + result = reader.write_view_chunks( + chunks=chunks, + output_dir=output_dir, + ) + + assert "results" in result + assert "total_rows" in result + assert "total_events_matched" in result + + assert len(result["results"]) == len(chunks) + + total_rows = 0 + for r in result["results"]: + assert "output_file" in r + assert "rows_written" in r + if r["rows_written"] > 0: + assert os.path.exists(r["output_file"]) + reader_ipc = ipc.open_file(r["output_file"]) + table = reader_ipc.read_all() + assert table.num_rows == r["rows_written"] + total_rows += r["rows_written"] + + assert result["total_rows"] == total_rows + + +class TestDistributedWriteArrow: + """Test distributed_write_arrow with Dask.""" + + def test_distributed_write_arrow_basic(self): + """Test distributed_write_arrow produces readable files.""" + pytest.importorskip("dask") + from dftracer.utils.arrow import read_arrow + from dftracer.utils.dask import distributed_write_arrow + + with Environment(lines=50) as env: + gz_file = env.create_test_gzip_file() + env.build_index(gz_file) + + with tempfile.TemporaryDirectory() as output_dir: + result = distributed_write_arrow( + gz_file, output_dir, view={"name": "all", "query": 'cat == "POSIX"'} + ) + + assert "files" in result + assert "total_chunks" in result + assert "skipped_chunks" in result + assert "total_rows" in result + + if result["files"]: + table = read_arrow(result["files"]) + assert table is not None + assert table.num_rows == result["total_rows"] + + def test_distributed_write_arrow_with_view(self): + """Test distributed_write_arrow with predefined view.""" + pytest.importorskip("dask") + from dftracer.utils.arrow import read_arrow + from dftracer.utils.dask import distributed_write_arrow + + with Environment(lines=50) as env: + 
gz_file = env.create_test_gzip_file() + env.build_index(gz_file) + + with tempfile.TemporaryDirectory() as output_dir: + result = distributed_write_arrow(gz_file, output_dir) + + assert "files" in result + if result["files"]: + table = read_arrow(result["files"]) + assert table is not None + + def test_distributed_write_arrow_batched(self): + """Test distributed_write_arrow with chunks_per_task batching.""" + pytest.importorskip("dask") + from dftracer.utils.arrow import read_arrow + from dftracer.utils.dask import distributed_write_arrow + + with Environment(lines=5000) as env: + gz_file = env.create_test_gzip_file(bytes_per_line=512) + env.build_index(gz_file, checkpoint_size_bytes=4 * 1024) + + with tempfile.TemporaryDirectory() as output_dir: + result = distributed_write_arrow( + gz_file, + output_dir, + view={"name": "all", "query": 'cat == "POSIX"'}, + checkpoint_size=4 * 1024, + chunks_per_task=2, + ) + + assert "files" in result + assert "total_chunks" in result + assert "total_rows" in result + + if result["files"]: + table = read_arrow(result["files"]) + assert table is not None + assert table.num_rows == result["total_rows"] diff --git a/tests/replay/test_replay_fidelity.cpp b/tests/replay/test_replay_fidelity.cpp new file mode 100644 index 00000000..1bd7c4b8 --- /dev/null +++ b/tests/replay/test_replay_fidelity.cpp @@ -0,0 +1,271 @@ +// Fidelity tests for ReplayEngine. +// +// "Fidelity" here means: when maintain_timing is on, each event is dispatched +// at wall-clock time close to its scheduled position on the trace timeline. +// Excessive lateness compounds into wrong inter-event gaps, defeating the +// point of timing-preserved replay. Two failure modes we want to catch: +// +// 1. Per-event lateness: an individual event fires more than a few ms +// late vs. when apply_timing should have woken up. +// 2. End-to-end drift: total wall-clock duration diverges from the trace's +// timespan. 
Sensitive to the apply_timing anchor bug (where +// replay_start_time_ wasn't reset on the first event, making every +// subsequent sleep be skipped). + +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace dftracer::utils; +using namespace dftracer::utils::utilities::replay; + +namespace { + +// Generate a `.pfw` trace with `n` events spaced `step_us` microseconds +// apart. Each event (name "read", cat "POSIX") passes the default filters; +// with dry_run=true no actual I/O runs, so per-event work is dominated by +// apply_timing sleep. NOTE(review): tests below set dry_run=false; confirm. +void write_evenly_spaced_trace(const std::string& path, std::size_t n, + std::uint64_t step_us) { + std::ofstream f(path); + REQUIRE(f.is_open()); + const std::uint64_t base_ts = 1'000'000; + f << "[\n"; + for (std::size_t i = 0; i < n; ++i) { + f << R"({"id":)" << i + << R"(,"name":"read","cat":"POSIX","pid":12345,"tid":12345,"ts":)" + << (base_ts + i * step_us) << R"(,"dur":10,"ph":"X","args":{}})"; + if (i + 1 < n) f << ","; + f << "\n"; + } + f << "]"; +} + +struct DispatchSample { + std::uint64_t trace_ts; // microseconds since arbitrary trace epoch + std::chrono::steady_clock::time_point wall; +}; + +// Capture (trace_ts, wall_now) for each event in dispatch order. on_dispatch +// is invoked from the consumer thread; we still protect the vector since +// future executor changes may dispatch from multiple workers.
+struct DispatchRecorder { + std::mutex m; + std::vector samples; + + void record(const Trace& t, std::chrono::steady_clock::time_point now) { + std::lock_guard lock(m); + samples.push_back({t.time_start, now}); + } +}; + +struct FidelityStats { + std::int64_t max_lateness_us = 0; + std::int64_t p99_lateness_us = 0; + std::int64_t total_wall_span_us = 0; + std::int64_t expected_trace_span_us = 0; +}; + +FidelityStats analyze(const std::vector& samples) { + REQUIRE(samples.size() >= 2); + + const auto& first = samples.front(); + const auto& last = samples.back(); + + std::vector lateness; + lateness.reserve(samples.size()); + for (std::size_t i = 0; i < samples.size(); ++i) { + auto expected_offset = + std::chrono::microseconds(samples[i].trace_ts - first.trace_ts); + auto expected_wall = first.wall + expected_offset; + auto delta = std::chrono::duration_cast( + samples[i].wall - expected_wall) + .count(); + lateness.push_back(delta); + } + + FidelityStats out; + out.max_lateness_us = *std::max_element(lateness.begin(), lateness.end()); + auto sorted = lateness; + std::sort(sorted.begin(), sorted.end()); + out.p99_lateness_us = sorted[(sorted.size() * 99) / 100]; + out.total_wall_span_us = + std::chrono::duration_cast(last.wall - + first.wall) + .count(); + out.expected_trace_span_us = + static_cast(last.trace_ts - first.trace_ts); + return out; +} + +// Fidelity tolerances +bool is_ci_env() { + return std::getenv("CI") != nullptr || + std::getenv("GITHUB_ACTIONS") != nullptr; +} + +struct Tolerances { + // Set to a negative value to skip the corresponding check. + std::int64_t max_per_event_us; + std::int64_t max_p99_us; + double wall_span; +}; + +Tolerances tolerances() { + if (is_ci_env()) { + return {/*max_per_event_us=*/-1, /*max_p99_us=*/-1, + /*wall_span=*/1.0}; + } + // Local dev: tight millisecond-scale bounds catch regressions early.
+ return {/*max_per_event_us=*/10'000, /*max_p99_us=*/5'000, + /*wall_span=*/0.25}; +} + +void check_fidelity(const FidelityStats& s, const char* label) { + const auto t = tolerances(); + INFO("[" << label << " ci=" << is_ci_env() << "] max_lateness=" + << s.max_lateness_us << "us p99=" << s.p99_lateness_us + << "us wall=" << s.total_wall_span_us + << "us trace=" << s.expected_trace_span_us << "us"); + if (t.max_per_event_us >= 0) { + CHECK(s.max_lateness_us <= t.max_per_event_us); + } + if (t.max_p99_us >= 0) { + CHECK(s.p99_lateness_us <= t.max_p99_us); + } + + const std::int64_t low = static_cast( + s.expected_trace_span_us * (1.0 - t.wall_span)); + CHECK(s.total_wall_span_us >= low); + + if (!is_ci_env()) { + const std::int64_t high = static_cast( + s.expected_trace_span_us * (1.0 + t.wall_span)); + CHECK(s.total_wall_span_us <= high); + } +} + +} // namespace + +TEST_CASE("Replay fidelity - sync path") { + DFTRACER_UTILS_LOGGER_INIT(); + + fs::path temp_dir = fs::temp_directory_path() / "dftracer_replay_fid_sync"; + fs::create_directories(temp_dir); + std::string trace_file = (temp_dir / "fid.pfw").string(); + + constexpr std::size_t N = 40; + constexpr std::uint64_t STEP_US = 5'000; + write_evenly_spaced_trace(trace_file, N, STEP_US); + + DispatchRecorder rec; + ReplayConfig config; + config.maintain_timing = true; + config.dry_run = false; + config.on_dispatch = [&rec](const Trace& t, + std::chrono::steady_clock::time_point now) { + rec.record(t, now); + }; + + ReplayEngine engine(config); + auto result = engine.replay(trace_file); + CHECK(result.total_events == N); + + auto stats = analyze(rec.samples); + check_fidelity(stats, "sync"); + + std::error_code ec; + fs::remove_all(temp_dir, ec); +} + +TEST_CASE("Replay fidelity - pipelined path") { + DFTRACER_UTILS_LOGGER_INIT(); + + fs::path temp_dir = + fs::temp_directory_path() / "dftracer_replay_fid_pipelined"; + fs::create_directories(temp_dir); + std::string trace_file = (temp_dir / "fid.pfw").string(); + 
+ constexpr std::size_t N = 40; + constexpr std::uint64_t STEP_US = 5'000; + write_evenly_spaced_trace(trace_file, N, STEP_US); + + DispatchRecorder rec; + ReplayConfig config; + config.maintain_timing = true; + config.dry_run = false; + config.on_dispatch = [&rec](const Trace& t, + std::chrono::steady_clock::time_point now) { + rec.record(t, now); + }; + + ReplayEngine engine(config); + ReplayResult result; + std::vector files = {trace_file}; + + Pipeline pipeline(PipelineConfig::parallel(4)); + auto root = make_task( + [&engine, &files, &result](CoroScope& scope) -> coro::CoroTask { + co_await engine.run_pipelined(scope, files, result, /*cap=*/64); + }, + "replay_pipelined"); + pipeline.set_source(root); + pipeline.execute(); + + CHECK(result.total_events == N); + auto stats = analyze(rec.samples); + check_fidelity(stats, "pipelined"); + + std::error_code ec; + fs::remove_all(temp_dir, ec); +} + +TEST_CASE("Replay fidelity - first-event anchor reset survives warmup gap") { + DFTRACER_UTILS_LOGGER_INIT(); + + fs::path temp_dir = + fs::temp_directory_path() / "dftracer_replay_fid_anchor"; + fs::create_directories(temp_dir); + std::string trace_file = (temp_dir / "fid.pfw").string(); + + constexpr std::size_t N = 20; + constexpr std::uint64_t STEP_US = 5'000; // 95ms span + write_evenly_spaced_trace(trace_file, N, STEP_US); + + DispatchRecorder rec; + ReplayConfig config; + config.maintain_timing = true; + config.dry_run = false; + config.on_dispatch = [&rec](const Trace& t, + std::chrono::steady_clock::time_point now) { + rec.record(t, now); + }; + + ReplayEngine engine(config); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + auto result = engine.replay(trace_file); + CHECK(result.total_events == N); + + auto stats = analyze(rec.samples); + CHECK(stats.total_wall_span_us >= 70'000); + check_fidelity(stats, "anchor-reset"); + + std::error_code ec; + fs::remove_all(temp_dir, ec); +} diff --git a/tests/utilities/CMakeLists.txt 
b/tests/utilities/CMakeLists.txt index 1818992b..c3ab418f 100644 --- a/tests/utilities/CMakeLists.txt +++ b/tests/utilities/CMakeLists.txt @@ -55,6 +55,7 @@ set(UTILITIES_TEST_SOURCES composites/dft/statistics/test_statistics_aggregator.cpp composites/dft/statistics/test_statistics_query.cpp common/statistics/test_log2_histogram.cpp + common/statistics/test_timestamp_histogram.cpp composites/dft/statistics/test_detailed_statistics.cpp # Query language @@ -67,9 +68,13 @@ set(UTILITIES_TEST_SOURCES composites/dft/aggregators/test_aggregation_metrics.cpp composites/dft/aggregators/test_aggregation_key.cpp composites/dft/aggregators/test_aggregation_config.cpp + composites/dft/aggregators/test_aggregation_serialization.cpp composites/dft/aggregators/test_aggregator_utility.cpp composites/dft/aggregators/test_chunk_aggregator_utility.cpp composites/dft/aggregators/test_event_aggregator_utility.cpp + composites/dft/aggregators/test_aggregation_augmentation.cpp + composites/dft/aggregators/test_system_metrics.cpp + composites/dft/aggregators/test_system_metrics_merge_operator.cpp # DFT Comparator Composites composites/dft/comparator/test_comparison_result.cpp @@ -82,6 +87,7 @@ set(UTILITIES_TEST_SOURCES indexer/test_index_database.cpp indexer/test_provenance_database.cpp indexer/test_index_builder.cpp + indexer/test_sst_ingest_spike.cpp # Compression compression/zlib/test_streaming_compressor.cpp @@ -101,6 +107,12 @@ set(UTILITIES_TEST_SOURCES fileio/test_streaming_file_reader.cpp fileio/test_streaming_file_writer.cpp + # I/O Parallel + fileio/parallel/test_layout_sizing.cpp + fileio/parallel/test_striped_writer.cpp + fileio/parallel/test_sharded_writer.cpp + fileio/parallel/test_padded_striped_writer.cpp + # I/O Lines fileio/lines/test_streaming_line_reader.cpp fileio/lines/sources/test_indexed_file_line_iterator.cpp @@ -160,6 +172,7 @@ endif() # +++++++++++++++++++++++++++++++++++++++++ if(DFTRACER_UTILS_ENABLE_ARROW_IPC) + # IPC Writer tests set(ARROW_IPC_TEST_SOURCE 
common/arrow/test_arrow_ipc_writer.cpp) string(REPLACE ".cpp" "" arrow_ipc_bin_exec ${ARROW_IPC_TEST_SOURCE}) string(REPLACE "/" "_" arrow_ipc_target "utilities_${arrow_ipc_bin_exec}") @@ -188,6 +201,37 @@ if(DFTRACER_UTILS_ENABLE_ARROW_IPC) add_test(NAME utilities/${arrow_ipc_bin_exec} COMMAND ${arrow_ipc_target}) set_tests_properties(utilities/${arrow_ipc_bin_exec} PROPERTIES WORKING_DIRECTORY "${arrow_ipc_workdir}") + + # IPC Reader tests + set(ARROW_IPC_READER_SOURCE common/arrow/test_arrow_ipc_reader.cpp) + string(REPLACE ".cpp" "" arrow_ipc_reader_bin ${ARROW_IPC_READER_SOURCE}) + string(REPLACE "/" "_" arrow_ipc_reader_target "utilities_${arrow_ipc_reader_bin}") + + add_executable(${arrow_ipc_reader_target} ${ARROW_IPC_READER_SOURCE}) + target_include_directories(${arrow_ipc_reader_target} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..) + target_link_libraries(${arrow_ipc_reader_target} PRIVATE doctest::doctest dftracer_utils testing_utilities) + link_nanoarrow(${arrow_ipc_reader_target} STATIC) + target_set_warnings(${arrow_ipc_reader_target}) + target_enable_coroutine(${arrow_ipc_reader_target}) + + get_filename_component(arrow_ipc_reader_dir ${arrow_ipc_reader_bin} DIRECTORY) + get_filename_component(arrow_ipc_reader_name ${arrow_ipc_reader_bin} NAME) + if(arrow_ipc_reader_dir) + set_target_properties(${arrow_ipc_reader_target} PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${arrow_ipc_reader_dir}") + endif() + set_target_properties(${arrow_ipc_reader_target} PROPERTIES OUTPUT_NAME ${arrow_ipc_reader_name}) + + if(DFTRACER_UTILS_COVERAGE AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + target_compile_options(${arrow_ipc_reader_target} PRIVATE --coverage -fprofile-arcs -ftest-coverage) + target_link_libraries(${arrow_ipc_reader_target} PRIVATE --coverage) + endif() + + set(arrow_ipc_reader_workdir "${CMAKE_CURRENT_BINARY_DIR}/workdirs/${arrow_ipc_reader_target}") + file(MAKE_DIRECTORY "${arrow_ipc_reader_workdir}") + add_test(NAME 
utilities/${arrow_ipc_reader_bin} COMMAND ${arrow_ipc_reader_target}) + set_tests_properties(utilities/${arrow_ipc_reader_bin} PROPERTIES + WORKING_DIRECTORY "${arrow_ipc_reader_workdir}") endif() # +++++++++++++++++++++++++++++++++++++++++ diff --git a/tests/utilities/call_tree/test_call_tree_internal.cpp b/tests/utilities/call_tree/test_call_tree_internal.cpp index 1aea5b54..04f6a4dc 100644 --- a/tests/utilities/call_tree/test_call_tree_internal.cpp +++ b/tests/utilities/call_tree/test_call_tree_internal.cpp @@ -4,6 +4,11 @@ #include #include #include +#include +#include +#include +#include +#include #include #include @@ -91,17 +96,18 @@ TEST_CASE("CallTreeFactory - Create nodes") { } SUBCASE("Create node with arguments") { - std::unordered_map args; - args["arg1"] = "value1"; - args["arg2"] = "value2"; + dftracer::utils::call_tree::internal::ArgsMap args; + args.set_valid(true); + args.insert("arg1", std::string("value1")); + args.insert("arg2", std::string("value2")); auto node = factory.create_node(2, "test_func", "category", 2000, 1000, - 1, args); + 1, std::move(args)); CHECK(node != nullptr); - CHECK(node->get_args().size() == 2); - CHECK(node->get_args().at("arg1") == "value1"); - CHECK(node->get_args().at("arg2") == "value2"); + CHECK(node->get_args().raw().size() == 2); + CHECK(node->get_args()["arg1"].get() == "value1"); + CHECK(node->get_args()["arg2"].get() == "value2"); } SUBCASE("Multiple nodes with unique IDs") { @@ -246,3 +252,154 @@ TEST_CASE("CallTree - Integration test with nodes") { tree.cleanup(); } + +// ============================================================================ +// Save / Load round-trips +// ============================================================================ + +namespace { + +using dftracer::utils::CoroScope; +using dftracer::utils::make_task; +using dftracer::utils::Pipeline; +namespace coro = dftracer::utils::coro; +using dftracer::utils::call_tree::load_arrow; +using dftracer::utils::call_tree::load_binary; 
+using dftracer::utils::call_tree::save_arrow; +using dftracer::utils::call_tree::save_binary; + +std::unique_ptr make_fixture() { + auto tree = std::make_unique(); + tree->initialize(); + + auto add_proc = [&](std::uint32_t pid, std::uint32_t tid, + std::uint32_t pkid) { + ProcessKey key(pid, tid, pkid); + dftracer::utils::utilities::composites::dft::ArgsMap a1; + a1.set_valid(true); + a1.insert("level", static_cast(0)); + a1.insert("tid", static_cast(tid)); + a1.insert("fhash", std::string("abc123")); + auto root = tree->get_factory().create_node(1, "main", "function", 0, + 1000, 0, std::move(a1)); + dftracer::utils::utilities::composites::dft::ArgsMap a2; + a2.set_valid(true); + a2.insert("level", static_cast(1)); + a2.insert("tid", static_cast(tid)); + auto child = tree->get_factory().create_node( + 2, "child", "function", 100, 500, 1, std::move(a2)); + child->set_parent_id(1); + root->add_child(2); + tree->add_call(key, root); + tree->add_call(key, child); + auto* pgraph = tree->get(key); + pgraph->root_calls.push_back(1); + pgraph->call_sequence = {1, 2}; + }; + add_proc(100, 200, 0); + add_proc(101, 201, 0); + return tree; +} + +template +std::unique_ptr roundtrip(const CallTree& src, + const std::string& path, SaveFn save_fn, + LoadFn load_fn, bool* save_ok_out, + bool* load_ok_out) { + struct Ctx { + const CallTree* src; + std::string path; + std::unique_ptr loaded; + bool save_ok = false; + bool load_ok = false; + }; + Ctx ctx{&src, path, nullptr, false, false}; + + Pipeline pipeline; + auto run = make_task( + [&ctx, save_fn, load_fn](CoroScope& scope) -> coro::CoroTask { + ctx.save_ok = co_await save_fn(&scope, *ctx.src, ctx.path); + if (ctx.save_ok) { + ctx.loaded = co_await load_fn(&scope, ctx.path); + ctx.load_ok = (ctx.loaded != nullptr); + } + }, + "save_load"); + pipeline.set_source(run); + pipeline.set_destination(run); + pipeline.execute(); + *save_ok_out = ctx.save_ok; + *load_ok_out = ctx.load_ok; + return std::move(ctx.loaded); +} + +void 
check_structure_matches(const CallTree& src, const CallTree& loaded) { + auto src_keys = const_cast(src).keys(); + auto loaded_keys = const_cast(loaded).keys(); + CHECK(src_keys.size() == loaded_keys.size()); + + for (const auto& key : src_keys) { + auto* sg = const_cast(src).get(key); + auto* lg = const_cast(loaded).get(key); + REQUIRE(sg != nullptr); + REQUIRE(lg != nullptr); + CHECK(sg->calls.size() == lg->calls.size()); + CHECK(sg->root_calls.size() == lg->root_calls.size()); + CHECK(sg->call_sequence.size() == lg->call_sequence.size()); + for (const auto& [id, sn] : sg->calls) { + auto it = lg->calls.find(id); + REQUIRE(it != lg->calls.end()); + const auto& ln = it->second; + CHECK(sn->get_name() == ln->get_name()); + CHECK(sn->get_category() == ln->get_category()); + CHECK(sn->get_start_time() == ln->get_start_time()); + CHECK(sn->get_duration() == ln->get_duration()); + CHECK(sn->get_level() == ln->get_level()); + CHECK(sn->get_parent_id() == ln->get_parent_id()); + CHECK(sn->get_children().size() == ln->get_children().size()); + CHECK(sn->get_args().raw().size() == ln->get_args().raw().size()); + } + } +} + +} // namespace + +TEST_CASE("CallTree - custom binary save/load round-trip") { + auto tmp = fs::temp_directory_path() / + ("ct_binary_test_" + std::to_string(::getpid())); + fs::remove_all(tmp); + fs::create_directories(tmp); + auto path = (tmp / "tree.bin").string(); + + auto src = make_fixture(); + bool save_ok = false, load_ok = false; + auto loaded = + roundtrip(*src, path, save_binary, load_binary, &save_ok, &load_ok); + REQUIRE(save_ok); + REQUIRE(load_ok); + REQUIRE(loaded != nullptr); + check_structure_matches(*src, *loaded); + + fs::remove_all(tmp); +} + +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC +TEST_CASE("CallTree - arrow IPC save/load round-trip") { + auto tmp = fs::temp_directory_path() / + ("ct_arrow_test_" + std::to_string(::getpid())); + fs::remove_all(tmp); + fs::create_directories(tmp); + auto path = (tmp / "tree.arrow").string(); + + 
auto src = make_fixture(); + bool save_ok = false, load_ok = false; + auto loaded = + roundtrip(*src, path, save_arrow, load_arrow, &save_ok, &load_ok); + REQUIRE(save_ok); + REQUIRE(load_ok); + REQUIRE(loaded != nullptr); + check_structure_matches(*src, *loaded); + + fs::remove_all(tmp); +} +#endif diff --git a/tests/utilities/common/arrow/test_arrow_column_builder.cpp b/tests/utilities/common/arrow/test_arrow_column_builder.cpp index 1a98462e..0eb25bc2 100644 --- a/tests/utilities/common/arrow/test_arrow_column_builder.cpp +++ b/tests/utilities/common/arrow/test_arrow_column_builder.cpp @@ -1,3 +1,4 @@ +#include #ifdef DFTRACER_UTILS_ENABLE_ARROW #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN diff --git a/tests/utilities/common/arrow/test_arrow_ipc_reader.cpp b/tests/utilities/common/arrow/test_arrow_ipc_reader.cpp new file mode 100644 index 00000000..435cb504 --- /dev/null +++ b/tests/utilities/common/arrow/test_arrow_ipc_reader.cpp @@ -0,0 +1,528 @@ +#include +#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC + +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace dftracer::utils; +using namespace dftracer::utils::coro; +using namespace dftracer::utils::utilities::common::arrow; + +static std::string tmp_path(const char* name) { + return (fs::temp_directory_path() / name).string(); +} + +static void write_test_file(const std::string& path, int num_batches, + int rows_per_batch, + IpcCompression compression = IpcCompression::NONE) { + Runtime runtime(2); + + auto task = [&]() -> CoroTask { + IpcWriter writer; + int rc = co_await writer.open(path, compression); + if (rc != 0) co_return; + + RecordBatchBuilder builder; + builder.declare_schema({{"id", ColumnType::INT64}, + {"name", ColumnType::STRING}, + {"value", ColumnType::DOUBLE}}); + + for (int b = 0; b < num_batches; ++b) { + builder.reserve(rows_per_batch); + for (int i = 0; i < rows_per_batch; ++i) { + int row_id = b * 
rows_per_batch + i; + builder.append_int64(0, row_id); + std::string name = "item_" + std::to_string(row_id); + builder.append_string(1, name); + builder.append_double(2, row_id * 1.5); + builder.end_row(); + } + auto batch = builder.finish(); + co_await writer.write_batch(batch); + builder.reset(true); + } + + co_await writer.close(); + }; + + runtime.submit(task(), "write_test_file").get(); + runtime.shutdown(); +} + +// --------------------------------------------------------------------------- +// IpcReader Tests +// --------------------------------------------------------------------------- + +TEST_CASE("IpcReader - basic read single batch") { + std::string path = tmp_path("test_ipc_reader_basic.arrow"); + std::remove(path.c_str()); + + write_test_file(path, 1, 10); + + IpcReader reader; + CHECK_FALSE(reader.is_open()); + CHECK(reader.open(path) == 0); + CHECK(reader.is_open()); + CHECK(reader.num_batches() == 1); + + auto batch = reader.read_batch(0); + CHECK(batch.valid()); + CHECK(batch.num_rows() == 10); + CHECK(batch.num_columns() == 3); + + reader.close(); + CHECK_FALSE(reader.is_open()); + + fs::remove(path); +} + +TEST_CASE("IpcReader - read multiple batches") { + std::string path = tmp_path("test_ipc_reader_multi.arrow"); + std::remove(path.c_str()); + + write_test_file(path, 5, 20); + + IpcReader reader; + CHECK(reader.open(path) == 0); + CHECK(reader.num_batches() == 5); + + std::int64_t total_rows = 0; + for (std::size_t i = 0; i < reader.num_batches(); ++i) { + auto batch = reader.read_batch(i); + CHECK(batch.valid()); + CHECK(batch.num_rows() == 20); + total_rows += batch.num_rows(); + } + CHECK(total_rows == 100); + + reader.close(); + fs::remove(path); +} + +TEST_CASE("IpcReader - read_all") { + std::string path = tmp_path("test_ipc_reader_all.arrow"); + std::remove(path.c_str()); + + write_test_file(path, 3, 15); + + IpcReader reader; + CHECK(reader.open(path) == 0); + + auto batches = reader.read_all(); + CHECK(batches.size() == 3); + + 
std::int64_t total_rows = 0; + for (const auto& batch : batches) { + CHECK(batch.valid()); + total_rows += batch.num_rows(); + } + CHECK(total_rows == 45); + + reader.close(); + fs::remove(path); +} + +TEST_CASE("IpcReader - for_each_batch") { + std::string path = tmp_path("test_ipc_reader_foreach.arrow"); + std::remove(path.c_str()); + + write_test_file(path, 4, 25); + + IpcReader reader; + CHECK(reader.open(path) == 0); + + std::int64_t total_rows = 0; + int batch_count = 0; + int rc = reader.for_each_batch([&](ArrowExportResult& batch) { + CHECK(batch.valid()); + total_rows += batch.num_rows(); + batch_count++; + return 0; + }); + + CHECK(rc == 0); + CHECK(batch_count == 4); + CHECK(total_rows == 100); + + reader.close(); + fs::remove(path); +} + +TEST_CASE("IpcReader - open fails on non-existent file") { + IpcReader reader; + CHECK(reader.open("/nonexistent/path/file.arrow") != 0); + CHECK_FALSE(reader.is_open()); +} + +TEST_CASE("IpcReader - open fails on invalid file") { + std::string path = tmp_path("test_ipc_reader_invalid.arrow"); + std::remove(path.c_str()); + + // Write garbage data + std::FILE* f = std::fopen(path.c_str(), "wb"); + const char* garbage = "this is not an arrow file"; + std::fwrite(garbage, 1, strlen(garbage), f); + std::fclose(f); + + IpcReader reader; + CHECK(reader.open(path) != 0); + CHECK_FALSE(reader.is_open()); + + fs::remove(path); +} + +TEST_CASE("IpcReader - move semantics") { + std::string path = tmp_path("test_ipc_reader_move.arrow"); + std::remove(path.c_str()); + + write_test_file(path, 2, 10); + + IpcReader r1; + CHECK(r1.open(path) == 0); + CHECK(r1.is_open()); + CHECK(r1.num_batches() == 2); + + IpcReader r2 = std::move(r1); + CHECK_FALSE(r1.is_open()); + CHECK(r2.is_open()); + CHECK(r2.num_batches() == 2); + + auto batch = r2.read_batch(0); + CHECK(batch.valid()); + + r2.close(); + fs::remove(path); +} + +TEST_CASE("IpcReader - read batch out of range") { + std::string path = tmp_path("test_ipc_reader_range.arrow"); + 
std::remove(path.c_str()); + + write_test_file(path, 2, 10); + + IpcReader reader; + CHECK(reader.open(path) == 0); + CHECK(reader.num_batches() == 2); + + // Valid indices + CHECK(reader.read_batch(0).valid()); + CHECK(reader.read_batch(1).valid()); + + // Invalid index + CHECK_FALSE(reader.read_batch(2).valid()); + CHECK_FALSE(reader.read_batch(100).valid()); + + reader.close(); + fs::remove(path); +} + +#ifdef DFTRACER_UTILS_ENABLE_ZSTD +TEST_CASE("IpcReader - read ZSTD compressed file") { + std::string path = tmp_path("test_ipc_reader_zstd.arrow"); + std::remove(path.c_str()); + + write_test_file(path, 3, 50, IpcCompression::ZSTD); + + IpcReader reader; + CHECK(reader.open(path) == 0); + CHECK(reader.num_batches() == 3); + + auto batches = reader.read_all(); + CHECK(batches.size() == 3); + + std::int64_t total_rows = 0; + for (const auto& batch : batches) { + CHECK(batch.valid()); + total_rows += batch.num_rows(); + } + CHECK(total_rows == 150); + + reader.close(); + fs::remove(path); +} +#endif + +TEST_CASE("IpcReader - roundtrip with all column types") { + std::string path = tmp_path("test_ipc_reader_types.arrow"); + std::remove(path.c_str()); + + { + Runtime runtime(2); + + auto task = [&]() -> CoroTask { + IpcWriter writer; + co_await writer.open(path, IpcCompression::NONE); + + RecordBatchBuilder builder; + builder.declare_schema({{"i64", ColumnType::INT64}, + {"u64", ColumnType::UINT64}, + {"f64", ColumnType::DOUBLE}, + {"str", ColumnType::STRING}, + {"boo", ColumnType::BOOL}}); + + builder.reserve(3); + for (int i = 0; i < 3; ++i) { + builder.append_int64(0, -i); + builder.append_uint64(1, i * 100); + builder.append_double(2, i * 1.5); + std::string s = "row_" + std::to_string(i); + builder.append_string(3, s); + builder.append_bool(4, i % 2 == 0); + builder.end_row(); + } + auto batch = builder.finish(); + co_await writer.write_batch(batch); + co_await writer.close(); + }; + + runtime.submit(task(), "write_types").get(); + runtime.shutdown(); + } + + // 
Read and verify + { + IpcReader reader; + CHECK(reader.open(path) == 0); + CHECK(reader.num_batches() == 1); + + auto batch = reader.read_batch(0); + CHECK(batch.valid()); + CHECK(batch.num_rows() == 3); + CHECK(batch.num_columns() == 5); + + reader.close(); + } + + fs::remove(path); +} + +// --------------------------------------------------------------------------- +// Parallel Reader Tests +// --------------------------------------------------------------------------- + +// Helper to run parallel read coroutine synchronously +static ParallelReadResult run_parallel_read(Runtime& runtime, + std::vector paths) { + auto task = read_arrow_files_parallel(std::move(paths)); + return runtime.submit(std::move(task), "read_arrow_files").get(); +} + +TEST_CASE("read_arrow_files_parallel - single file") { + std::string path = tmp_path("test_parallel_single.arrow"); + std::remove(path.c_str()); + + write_test_file(path, 2, 50); + + Runtime runtime(2); + + std::vector paths = {path}; + auto result = run_parallel_read(runtime, paths); + + CHECK(result.files_read == 1); + CHECK(result.files_failed == 0); + CHECK(result.total_rows == 100); + CHECK(result.total_batches == 2); + CHECK(result.file_results.size() == 1); + CHECK(result.file_results[0].success); + CHECK(result.file_results[0].batches->size() == 2); + + runtime.shutdown(); + fs::remove(path); +} + +TEST_CASE("read_arrow_files_parallel - multiple files") { + std::string dir = tmp_path("test_parallel_multi"); + fs::remove_all(dir); + fs::create_directories(dir); + + std::vector paths; + for (int i = 0; i < 4; ++i) { + std::string path = dir + "/file_" + std::to_string(i) + ".arrow"; + write_test_file(path, 2, 25); + paths.push_back(path); + } + + Runtime runtime(4); + + auto result = run_parallel_read(runtime, paths); + + CHECK(result.files_read == 4); + CHECK(result.files_failed == 0); + CHECK(result.total_rows == 200); // 4 files * 2 batches * 25 rows + CHECK(result.total_batches == 8); // 4 files * 2 batches + 
CHECK(result.file_results.size() == 4); + + for (const auto& fr : result.file_results) { + CHECK(fr.success); + CHECK(fr.total_rows == 50); + CHECK(fr.batches->size() == 2); + } + + runtime.shutdown(); + fs::remove_all(dir); +} + +TEST_CASE("read_arrow_files_parallel - handles non-existent files") { + std::string path = tmp_path("test_parallel_exists.arrow"); + std::remove(path.c_str()); + write_test_file(path, 1, 10); + + Runtime runtime(2); + + std::vector paths = {path, "/nonexistent/file.arrow"}; + + auto result = run_parallel_read(runtime, paths); + + CHECK(result.files_read == 1); + CHECK(result.files_failed == 1); + CHECK(result.total_rows == 10); + + runtime.shutdown(); + fs::remove(path); +} + +TEST_CASE("read_arrow_files_parallel - empty list") { + Runtime runtime(2); + + std::vector paths; + auto result = run_parallel_read(runtime, paths); + + CHECK(result.files_read == 0); + CHECK(result.files_failed == 0); + CHECK(result.total_rows == 0); + CHECK(result.total_batches == 0); + CHECK(result.file_results.empty()); + + runtime.shutdown(); +} + +TEST_CASE("read_arrow_files_streaming - completion order callback") { + std::string dir = tmp_path("test_streaming"); + fs::remove_all(dir); + fs::create_directories(dir); + + std::vector paths; + for (int i = 0; i < 4; ++i) { + std::string path = dir + "/file_" + std::to_string(i) + ".arrow"; + write_test_file(path, 1, 25); + paths.push_back(path); + } + + Runtime runtime(4); + + std::vector received_paths; + std::int64_t total_rows = 0; + ParallelReadResult result; + + auto task = run_coro_scope( + runtime.executor(), + [&result, &received_paths, &total_rows]( + CoroScope& scope, + std::vector file_paths) -> CoroTask { + result = co_await read_arrow_files_streaming( + scope, std::move(file_paths), [&](ArrowFileReadResult&& fr) { + if (fr.success) { + received_paths.push_back(fr.path); + total_rows += fr.total_rows; + } + return true; // continue + }); + }, + paths); + + runtime.submit(std::move(task), 
"test_streaming").get(); + + CHECK(result.files_read == 4); + CHECK(result.files_failed == 0); + CHECK(result.total_rows == 100); + CHECK(received_paths.size() == 4); + CHECK(total_rows == 100); + + runtime.shutdown(); + fs::remove_all(dir); +} + +TEST_CASE("read_arrow_files_streaming - early cancel") { + std::string dir = tmp_path("test_streaming_cancel"); + fs::remove_all(dir); + fs::create_directories(dir); + + std::vector paths; + for (int i = 0; i < 4; ++i) { + std::string path = dir + "/file_" + std::to_string(i) + ".arrow"; + write_test_file(path, 1, 25); + paths.push_back(path); + } + + Runtime runtime(4); + + int callback_count = 0; + ParallelReadResult result; + + auto task = run_coro_scope( + runtime.executor(), + [&result, &callback_count]( + CoroScope& scope, + std::vector file_paths) -> CoroTask { + result = co_await read_arrow_files_streaming( + scope, std::move(file_paths), [&](ArrowFileReadResult&&) { + callback_count++; + return callback_count < 2; // cancel after 2 + }); + }, + paths); + + runtime.submit(std::move(task), "test_streaming_cancel").get(); + + // All files still processed (for stats), but callback cancelled early + CHECK(result.files_read == 4); + CHECK(callback_count == 2); // Only 2 callbacks before cancel + + runtime.shutdown(); + fs::remove_all(dir); +} + +#ifdef DFTRACER_UTILS_ENABLE_ZSTD +TEST_CASE("read_arrow_files_parallel - mixed compression") { + std::string dir = tmp_path("test_parallel_mixed"); + fs::remove_all(dir); + fs::create_directories(dir); + + std::string path_none = dir + "/none.arrow"; + std::string path_zstd = dir + "/zstd.arrow"; + + write_test_file(path_none, 2, 30, IpcCompression::NONE); + write_test_file(path_zstd, 2, 30, IpcCompression::ZSTD); + + Runtime runtime(2); + + std::vector paths = {path_none, path_zstd}; + auto result = run_parallel_read(runtime, paths); + + CHECK(result.files_read == 2); + CHECK(result.files_failed == 0); + CHECK(result.total_rows == 120); + CHECK(result.total_batches == 4); + + 
runtime.shutdown(); + fs::remove_all(dir); +} +#endif + +#else + +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include + +TEST_CASE("IpcReader - disabled") { CHECK(true); } + +#endif // DFTRACER_UTILS_ENABLE_ARROW_IPC diff --git a/tests/utilities/common/arrow/test_arrow_ipc_writer.cpp b/tests/utilities/common/arrow/test_arrow_ipc_writer.cpp index 7ceb0e3e..7160257e 100644 --- a/tests/utilities/common/arrow/test_arrow_ipc_writer.cpp +++ b/tests/utilities/common/arrow/test_arrow_ipc_writer.cpp @@ -1,57 +1,59 @@ +#include #ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include +#include +#include #include #include #include #include +using namespace dftracer::utils; +using namespace dftracer::utils::coro; using namespace dftracer::utils::utilities::common::arrow; -// --------------------------------------------------------------------------- -// Helpers -// --------------------------------------------------------------------------- - static std::string tmp_path(const char* name) { return (fs::temp_directory_path() / name).string(); } -// --------------------------------------------------------------------------- -// Tests -// --------------------------------------------------------------------------- - TEST_CASE("IpcWriter - basic write and close") { - RecordBatchBuilder builder; - builder.declare_schema({{"id", ColumnType::INT64}, - {"name", ColumnType::STRING}, - {"value", ColumnType::DOUBLE}}); - builder.reserve(2); - - std::string s0 = "hello", s1 = "world"; - builder.append_int64(0, 1); - builder.append_string(1, s0); - builder.append_double(2, 3.14); - builder.end_row(); - builder.append_int64(0, 2); - builder.append_string(1, s1); - builder.append_double(2, 2.72); - builder.end_row(); - - auto batch = builder.finish(); - std::string path = tmp_path("test_ipc_basic.arrows"); std::remove(path.c_str()); - IpcWriter writer; - CHECK_FALSE(writer.is_open()); - CHECK(writer.open(path) == 0); - CHECK(writer.is_open()); - 
CHECK(writer.write_batch(batch) == 0); - CHECK(writer.close() == 0); - CHECK_FALSE(writer.is_open()); + Runtime runtime(2); + auto task = [&]() -> CoroTask { + RecordBatchBuilder builder; + builder.declare_schema({{"id", ColumnType::INT64}, + {"name", ColumnType::STRING}, + {"value", ColumnType::DOUBLE}}); + builder.reserve(2); + + std::string s0 = "hello", s1 = "world"; + builder.append_int64(0, 1); + builder.append_string(1, s0); + builder.append_double(2, 3.14); + builder.end_row(); + builder.append_int64(0, 2); + builder.append_string(1, s1); + builder.append_double(2, 2.72); + builder.end_row(); + auto batch = builder.finish(); + + IpcWriter writer; + if (co_await writer.open(path) != 0) co_return 1; + if (!writer.is_open()) co_return 2; + if (co_await writer.write_batch(batch) != 0) co_return 3; + if (co_await writer.close() != 0) co_return 4; + if (writer.is_open()) co_return 5; + co_return 0; + }; + auto result = runtime.submit(task(), "test").get(); + + CHECK(result == 0); CHECK(fs::exists(path)); CHECK(fs::file_size(path) > 0); fs::remove(path); @@ -61,24 +63,31 @@ TEST_CASE("IpcWriter - multiple batches") { std::string path = tmp_path("test_ipc_multi.arrows"); std::remove(path.c_str()); - IpcWriter writer; - CHECK(writer.open(path) == 0); - - RecordBatchBuilder builder; - builder.declare_schema({{"x", ColumnType::INT64}}); - - for (int b = 0; b < 3; ++b) { - builder.reserve(10); - for (int i = 0; i < 10; ++i) { - builder.append_int64(0, b * 10 + i); - builder.end_row(); + Runtime runtime(2); + auto task = [&]() -> CoroTask { + IpcWriter writer; + if (co_await writer.open(path) != 0) co_return 1; + + RecordBatchBuilder builder; + builder.declare_schema({{"x", ColumnType::INT64}}); + + for (int b = 0; b < 3; ++b) { + builder.reserve(10); + for (int i = 0; i < 10; ++i) { + builder.append_int64(0, b * 10 + i); + builder.end_row(); + } + auto batch = builder.finish(); + if (co_await writer.write_batch(batch) != 0) co_return 2; + builder.reset(true); } - auto 
batch = builder.finish(); - CHECK(writer.write_batch(batch) == 0); - builder.reset(true); - } - CHECK(writer.close() == 0); + if (co_await writer.close() != 0) co_return 3; + co_return 0; + }; + auto result = runtime.submit(task(), "test").get(); + + CHECK(result == 0); CHECK(fs::exists(path)); CHECK(fs::file_size(path) > 0); fs::remove(path); @@ -88,91 +97,184 @@ TEST_CASE("IpcWriter - close without writing batches") { std::string path = tmp_path("test_ipc_empty.arrows"); std::remove(path.c_str()); - IpcWriter writer; - CHECK(writer.open(path) == 0); - // No write_batch calls — close should still succeed (no footer needed). - CHECK(writer.close() == 0); - CHECK_FALSE(writer.is_open()); + Runtime runtime(2); + auto task = [&]() -> CoroTask { + IpcWriter writer; + if (co_await writer.open(path) != 0) co_return 1; + if (co_await writer.close() != 0) co_return 2; + if (writer.is_open()) co_return 3; + co_return 0; + }; + auto result = runtime.submit(task(), "test").get(); + + CHECK(result == 0); fs::remove(path); } -TEST_CASE("IpcWriter - double close is safe") { - std::string path = tmp_path("test_ipc_dblclose.arrows"); +#ifdef DFTRACER_UTILS_ENABLE_ZSTD +TEST_CASE("IpcWriter - explicit ZSTD compression") { + std::string path = tmp_path("test_ipc_zstd_compression.arrows"); std::remove(path.c_str()); - IpcWriter writer; - CHECK(writer.open(path) == 0); - CHECK(writer.close() == 0); - CHECK(writer.close() == 0); // idempotent + Runtime runtime(2); + auto task = [&]() -> CoroTask { + RecordBatchBuilder builder; + builder.declare_schema( + {{"x", ColumnType::INT64}, {"y", ColumnType::DOUBLE}}); + builder.reserve(100); + for (int i = 0; i < 100; ++i) { + builder.append_int64(0, i); + builder.append_double(1, i * 1.5); + builder.end_row(); + } + auto batch = builder.finish(); + + IpcWriter writer; + if (co_await writer.open(path, IpcCompression::ZSTD) != 0) co_return 1; + if (co_await writer.write_batch(batch) != 0) co_return 2; + if (co_await writer.close() != 0) 
co_return 3; + co_return 0; + }; + auto result = runtime.submit(task(), "test").get(); + + CHECK(result == 0); + CHECK(fs::exists(path)); + CHECK(fs::file_size(path) > 0); fs::remove(path); } +#endif -TEST_CASE("IpcWriter - move semantics") { - std::string path = tmp_path("test_ipc_move.arrows"); - std::remove(path.c_str()); +TEST_CASE("PartitionWriter - basic single file") { + std::string dir = tmp_path("test_partition_basic"); + fs::remove_all(dir); - IpcWriter w1; - CHECK(w1.open(path) == 0); - CHECK(w1.is_open()); + Runtime runtime(2); + PartitionWriteStats stats; - IpcWriter w2 = std::move(w1); - CHECK_FALSE(w1.is_open()); - CHECK(w2.is_open()); + auto task = [&]() -> CoroTask { + PartitionWriter writer; + if (co_await writer.open(dir, 0) != 0) co_return 1; + if (!writer.is_open()) co_return 2; - RecordBatchBuilder builder; - builder.declare_schema({{"v", ColumnType::UINT64}}); - builder.append_uint64(0, 42); - builder.end_row(); - auto batch = builder.finish(); - - CHECK(w2.write_batch(batch) == 0); - CHECK(w2.close() == 0); + RecordBatchBuilder builder; + builder.declare_schema({{"id", ColumnType::INT64}}); + builder.reserve(100); + for (int i = 0; i < 100; ++i) { + builder.append_int64(0, i); + builder.end_row(); + } + auto batch = builder.finish(); - CHECK(fs::exists(path)); - CHECK(fs::file_size(path) > 0); - fs::remove(path); + if (co_await writer.write_batch(batch) != 0) co_return 3; + stats = co_await writer.close(); + if (writer.is_open()) co_return 4; + co_return 0; + }; + auto result = runtime.submit(task(), "test").get(); + + CHECK(result == 0); + CHECK(stats.files.size() == 1); + CHECK(stats.total_rows == 100); + CHECK(stats.row_counts.size() == 1); + CHECK(stats.row_counts[0] == 100); + CHECK(fs::exists(stats.files[0])); + + fs::remove_all(dir); } -TEST_CASE("IpcWriter - open fails on bad path") { - IpcWriter writer; - CHECK(writer.open("/nonexistent_dir/no_such_file.arrows") != 0); - CHECK_FALSE(writer.is_open()); +TEST_CASE("PartitionRouter - 
NONE mode pass-through") { + std::string dir = tmp_path("test_router_none"); + fs::remove_all(dir); + + PartitionConfig config; + config.mode = PartitionConfig::Mode::NONE; + + Runtime runtime(2); + RouterWriteStats stats; + + auto task = [&]() -> CoroTask { + PartitionRouter router; + if (router.open(dir, config, 0) != 0) co_return 1; + + RecordBatchBuilder builder; + builder.declare_schema( + {{"id", ColumnType::INT64}, {"cat", ColumnType::STRING}}); + builder.reserve(10); + for (int i = 0; i < 10; ++i) { + builder.append_int64(0, i); + builder.append_string(1, "POSIX"); + builder.end_row(); + } + auto batch = builder.finish(); + + if (co_await router.write_batch(batch) != 0) co_return 2; + stats = co_await router.close(); + co_return 0; + }; + auto result = runtime.submit(task(), "test").get(); + + CHECK(result == 0); + CHECK(stats.total_rows == 10); + CHECK(stats.partitions.size() == 1); + CHECK(stats.partitions.count("") == 1); + + fs::remove_all(dir); } -TEST_CASE("IpcWriter - all column types") { - std::string path = tmp_path("test_ipc_types.arrows"); - std::remove(path.c_str()); +TEST_CASE("PartitionRouter - COLUMN mode single column") { + std::string dir = tmp_path("test_router_column"); + fs::remove_all(dir); - RecordBatchBuilder builder; - builder.declare_schema({{"i64", ColumnType::INT64}, - {"u64", ColumnType::UINT64}, - {"f64", ColumnType::DOUBLE}, - {"str", ColumnType::STRING}, - {"boo", ColumnType::BOOL}}); + PartitionConfig config; + config.mode = PartitionConfig::Mode::COLUMN; + config.partition_columns = {"cat"}; - std::string sv = "test"; - builder.append_int64(0, -1); - builder.append_uint64(1, 1); - builder.append_double(2, 1.0); - builder.append_string(3, sv); - builder.append_bool(4, true); - builder.end_row(); + Runtime runtime(2); + RouterWriteStats stats; - auto batch = builder.finish(); + auto task = [&]() -> CoroTask { + PartitionRouter router; + if (router.open(dir, config, 0) != 0) co_return 1; - IpcWriter writer; - 
CHECK(writer.open(path) == 0); - CHECK(writer.write_batch(batch) == 0); - CHECK(writer.close() == 0); + RecordBatchBuilder builder; + builder.declare_schema( + {{"id", ColumnType::INT64}, {"cat", ColumnType::STRING}}); + builder.reserve(6); - CHECK(fs::exists(path)); - CHECK(fs::file_size(path) > 0); - fs::remove(path); + for (int i = 0; i < 3; ++i) { + builder.append_int64(0, i); + builder.append_string(1, "POSIX"); + builder.end_row(); + } + for (int i = 3; i < 6; ++i) { + builder.append_int64(0, i); + builder.append_string(1, "APP"); + builder.end_row(); + } + auto batch = builder.finish(); + + if (co_await router.write_batch(batch) != 0) co_return 2; + stats = co_await router.close(); + co_return 0; + }; + auto result = runtime.submit(task(), "test").get(); + + CHECK(result == 0); + CHECK(stats.total_rows == 6); + CHECK(stats.partitions.size() == 2); + CHECK(stats.partitions.count("cat=POSIX") == 1); + CHECK(stats.partitions.count("cat=APP") == 1); + CHECK(stats.partitions["cat=POSIX"].total_rows == 3); + CHECK(stats.partitions["cat=APP"].total_rows == 3); + + CHECK(fs::exists(dir + "/cat=POSIX")); + CHECK(fs::exists(dir + "/cat=APP")); + + fs::remove_all(dir); } #else -// Provide main when IPC is disabled so the binary still links. 
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include diff --git a/tests/utilities/common/query/test_evaluator.cpp b/tests/utilities/common/query/test_evaluator.cpp index 74c4639f..fb6cda58 100644 --- a/tests/utilities/common/query/test_evaluator.cpp +++ b/tests/utilities/common/query/test_evaluator.cpp @@ -2,10 +2,9 @@ #include #include #include -#include +#include -#include -#include +#include using namespace dftracer::utils::utilities::common::query; using dftracer::utils::utilities::common::json::JsonValue; @@ -13,19 +12,25 @@ using dftracer::utils::utilities::common::json::JsonValue; namespace { struct JsonDoc { - yyjson_doc* doc; - JsonDoc(const char* json) : doc(yyjson_read(json, std::strlen(json), 0)) {} - ~JsonDoc() { - if (doc) yyjson_doc_free(doc); + simdjson::dom::parser parser; + simdjson::dom::element elem; + bool valid = false; + + JsonDoc(const char* json) { + auto result = parser.parse(json, std::strlen(json)); + if (!result.error()) { + elem = result.value_unsafe(); + valid = true; + } } - JsonValue root() { return JsonValue(yyjson_doc_get_root(doc)); } + JsonValue root() { return valid ? 
JsonValue(elem) : JsonValue(); } }; bool eval(const char* query_str, const char* json_str) { auto ast = parse(query_str); REQUIRE(ast.has_value()); JsonDoc doc(json_str); - REQUIRE(doc.doc != nullptr); + REQUIRE(doc.valid); return evaluate(**ast, doc.root()); } diff --git a/tests/utilities/common/query/test_query.cpp b/tests/utilities/common/query/test_query.cpp index 9f86f2be..e0f627db 100644 --- a/tests/utilities/common/query/test_query.cpp +++ b/tests/utilities/common/query/test_query.cpp @@ -1,7 +1,7 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include -#include +#include #include @@ -11,12 +11,18 @@ using dftracer::utils::utilities::common::json::JsonValue; namespace { struct JsonDoc { - yyjson_doc* doc; - JsonDoc(const char* json) : doc(yyjson_read(json, std::strlen(json), 0)) {} - ~JsonDoc() { - if (doc) yyjson_doc_free(doc); + simdjson::dom::parser parser; + simdjson::dom::element elem; + bool valid = false; + + JsonDoc(const char* json) { + auto result = parser.parse(json, std::strlen(json)); + if (!result.error()) { + elem = result.value_unsafe(); + valid = true; + } } - JsonValue root() { return JsonValue(yyjson_doc_get_root(doc)); } + JsonValue root() { return valid ? 
JsonValue(elem) : JsonValue(); } }; } // namespace @@ -85,3 +91,62 @@ TEST_CASE("Query with NOT IN") { JsonDoc no_match(R"({"cat":"STDIO"})"); CHECK_FALSE(q->evaluate(no_match.root())); } + +TEST_CASE("Query::fields - simple equality") { + auto q = Query::from_string(R"(cat == "POSIX")"); + REQUIRE(q.has_value()); + auto& f = q->fields(); + CHECK(f.size() == 1); + CHECK(f.count("cat") == 1); +} + +TEST_CASE("Query::fields - compound OR") { + auto q = Query::from_string(R"(pid == 1 or tid == 2)"); + REQUIRE(q.has_value()); + auto& f = q->fields(); + CHECK(f.size() == 2); + CHECK(f.count("pid") == 1); + CHECK(f.count("tid") == 1); +} + +TEST_CASE("Query::fields - compound AND") { + auto q = Query::from_string(R"(cat == "POSIX" and dur > 100)"); + REQUIRE(q.has_value()); + auto& f = q->fields(); + CHECK(f.size() == 2); + CHECK(f.count("cat") == 1); + CHECK(f.count("dur") == 1); +} + +TEST_CASE("Query::fields - NOT query") { + auto q = Query::from_string(R"(not cat == "STDIO")"); + REQUIRE(q.has_value()); + auto& f = q->fields(); + CHECK(f.size() == 1); + CHECK(f.count("cat") == 1); +} + +TEST_CASE("Query::fields - IN query") { + auto q = Query::from_string(R"(cat in ["POSIX", "STDIO"])"); + REQUIRE(q.has_value()); + auto& f = q->fields(); + CHECK(f.size() == 1); + CHECK(f.count("cat") == 1); +} + +TEST_CASE("Query::references") { + auto q = Query::from_string(R"(pid == 1 and dur > 50)"); + REQUIRE(q.has_value()); + CHECK(q->references("pid")); + CHECK(q->references("dur")); + CHECK_FALSE(q->references("cat")); + CHECK_FALSE(q->references("tid")); +} + +TEST_CASE("Query::fields - no duplicates for repeated field") { + auto q = Query::from_string(R"(pid == 1 or pid == 2)"); + REQUIRE(q.has_value()); + auto& f = q->fields(); + CHECK(f.size() == 1); + CHECK(f.count("pid") == 1); +} diff --git a/tests/utilities/common/statistics/test_timestamp_histogram.cpp b/tests/utilities/common/statistics/test_timestamp_histogram.cpp new file mode 100644 index 00000000..a5cf5ecc --- 
/dev/null +++ b/tests/utilities/common/statistics/test_timestamp_histogram.cpp @@ -0,0 +1,240 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include + +#include + +using namespace dftracer::utils::utilities::common::statistics; + +TEST_SUITE("TimestampHistogram") { + TEST_CASE("empty histogram") { + TimestampHistogram h; + CHECK(h.empty()); + CHECK(h.total_count() == 0); + CHECK(h.num_bins() == 0); + CHECK(h.count_in_range(0, 1'000'000) == 0); + CHECK(h.selectivity(0, 1'000'000) == 0.0); + } + + TEST_CASE("single event") { + TimestampHistogram h; + h.add(500'000); // 500ms -> bin 5 + + CHECK(h.total_count() == 1); + CHECK(h.num_bins() == 1); + CHECK(h.bins()[0].first == 5); + CHECK(h.bins()[0].second == 1); + } + + TEST_CASE("bin_index static") { + CHECK(TimestampHistogram::bin_index(0) == 0); + CHECK(TimestampHistogram::bin_index(99'999) == 0); + CHECK(TimestampHistogram::bin_index(100'000) == 1); + CHECK(TimestampHistogram::bin_index(199'999) == 1); + CHECK(TimestampHistogram::bin_index(200'000) == 2); + CHECK(TimestampHistogram::bin_index(1'000'000) == 10); + } + + TEST_CASE("bin_start_us and bin_end_us") { + CHECK(TimestampHistogram::bin_start_us(0) == 0); + CHECK(TimestampHistogram::bin_end_us(0) == 100'000); + CHECK(TimestampHistogram::bin_start_us(10) == 1'000'000); + CHECK(TimestampHistogram::bin_end_us(10) == 1'100'000); + } + + TEST_CASE("multiple events in same bin") { + TimestampHistogram h; + h.add(150'000); + h.add(160'000); + h.add(190'000); + + CHECK(h.total_count() == 3); + CHECK(h.num_bins() == 1); + CHECK(h.bins()[0].first == 1); + CHECK(h.bins()[0].second == 3); + } + + TEST_CASE("events across bins") { + TimestampHistogram h; + h.add(50'000); // bin 0 + h.add(150'000); // bin 1 + h.add(250'000); // bin 2 + h.add(1'050'000); // bin 10 + + CHECK(h.total_count() == 4); + CHECK(h.num_bins() == 4); + CHECK(h.bins()[0] == + std::make_pair(std::uint64_t{0}, std::uint64_t{1})); + CHECK(h.bins()[1] == + std::make_pair(std::uint64_t{1}, 
std::uint64_t{1})); + CHECK(h.bins()[2] == + std::make_pair(std::uint64_t{2}, std::uint64_t{1})); + CHECK(h.bins()[3] == + std::make_pair(std::uint64_t{10}, std::uint64_t{1})); + } + + TEST_CASE("count_in_range") { + TimestampHistogram h; + // 10 events at 0.0-0.1s, 20 at 0.5-0.6s, 5 at 1.0-1.1s + for (int i = 0; i < 10; ++i) h.add(50'000); + for (int i = 0; i < 20; ++i) h.add(550'000); + for (int i = 0; i < 5; ++i) h.add(1'050'000); + + CHECK(h.count_in_range(0, 100'000) == 10); + CHECK(h.count_in_range(0, 600'000) == 30); + CHECK(h.count_in_range(0, 2'000'000) == 35); + CHECK(h.count_in_range(500'000, 600'000) == 20); + CHECK(h.count_in_range(500'000, 1'100'000) == 25); + CHECK(h.count_in_range(200'000, 400'000) == 0); + } + + TEST_CASE("selectivity") { + TimestampHistogram h; + for (int i = 0; i < 100; ++i) h.add(50'000); + for (int i = 0; i < 100; ++i) h.add(550'000); + + CHECK(h.selectivity(0, 100'000) == doctest::Approx(0.5)); + CHECK(h.selectivity(500'000, 600'000) == doctest::Approx(0.5)); + CHECK(h.selectivity(0, 600'000) == doctest::Approx(1.0)); + CHECK(h.selectivity(200'000, 400'000) == doctest::Approx(0.0)); + } + + TEST_CASE("merge") { + TimestampHistogram a; + a.add(50'000); // bin 0 + a.add(150'000); // bin 1 + + TimestampHistogram b; + b.add(50'000); // bin 0 + b.add(250'000); // bin 2 + + a.merge(b); + + CHECK(a.total_count() == 4); + CHECK(a.num_bins() == 3); + CHECK(a.bins()[0] == + std::make_pair(std::uint64_t{0}, std::uint64_t{2})); + CHECK(a.bins()[1] == + std::make_pair(std::uint64_t{1}, std::uint64_t{1})); + CHECK(a.bins()[2] == + std::make_pair(std::uint64_t{2}, std::uint64_t{1})); + } + + TEST_CASE("merge with empty") { + TimestampHistogram a; + a.add(50'000); + + TimestampHistogram empty; + a.merge(empty); + + CHECK(a.total_count() == 1); + CHECK(a.num_bins() == 1); + } + + TEST_CASE("expansion_weights - uniform") { + TimestampHistogram h; + for (int i = 0; i < 100; ++i) h.add(i * 10'000); // 0-1s uniform + + auto weights = 
h.expansion_weights(0, 1'000'000, 10); + CHECK(weights.size() == 10); + for (auto w : weights) { + CHECK(w == doctest::Approx(0.1).epsilon(0.01)); + } + } + + TEST_CASE("expansion_weights - bursty") { + TimestampHistogram h; + // 800 events in 0.2-0.4s, 200 events elsewhere + for (int i = 0; i < 100; ++i) h.add(50'000); // bin 0 + for (int i = 0; i < 400; ++i) h.add(250'000); // bin 2 + for (int i = 0; i < 400; ++i) h.add(350'000); // bin 3 + for (int i = 0; i < 100; ++i) h.add(950'000); // bin 9 + + auto weights = h.expansion_weights(0, 1'000'000, 5); + CHECK(weights.size() == 5); + // sub 0 [0-200ms]: bin 0 = 100 + // sub 1 [200-400ms]: bins 2+3 = 800 + // sub 2 [400-600ms]: 0 + // sub 3 [600-800ms]: 0 + // sub 4 [800-1000ms]: bin 9 = 100 + CHECK(weights[0] == doctest::Approx(0.1).epsilon(0.01)); + CHECK(weights[1] == doctest::Approx(0.8).epsilon(0.01)); + CHECK(weights[2] == doctest::Approx(0.0)); + CHECK(weights[3] == doctest::Approx(0.0)); + CHECK(weights[4] == doctest::Approx(0.1).epsilon(0.01)); + } + + TEST_CASE("expansion_weights - no data in range falls back to uniform") { + TimestampHistogram h; + h.add(5'000'000); // 5s, outside query range + + auto weights = h.expansion_weights(0, 1'000'000, 5); + CHECK(weights.size() == 5); + for (auto w : weights) { + CHECK(w == doctest::Approx(0.2)); + } + } + + TEST_CASE("serialize and deserialize roundtrip") { + TimestampHistogram h; + h.add(50'000); + h.add(150'000); + h.add(150'000); + h.add(1'000'050'000); + + auto data = h.serialize(); + auto h2 = TimestampHistogram::deserialize(data.data(), data.size()); + + CHECK(h2.total_count() == h.total_count()); + CHECK(h2.num_bins() == h.num_bins()); + REQUIRE(h2.bins().size() == h.bins().size()); + for (std::size_t i = 0; i < h.bins().size(); ++i) { + CHECK(h2.bins()[i].first == h.bins()[i].first); + CHECK(h2.bins()[i].second == h.bins()[i].second); + } + } + + TEST_CASE("serialize empty") { + TimestampHistogram h; + auto data = h.serialize(); + auto h2 = 
TimestampHistogram::deserialize(data.data(), data.size()); + CHECK(h2.empty()); + CHECK(h2.total_count() == 0); + } + + TEST_CASE("deserialize null/empty") { + auto h = TimestampHistogram::deserialize(nullptr, 0); + CHECK(h.empty()); + } + + TEST_CASE("varint encoding handles large timestamps") { + TimestampHistogram h; + // Typical 2026 timestamp: ~1.77e15 us + h.add(1'773'074'570'000'000ULL); + h.add(1'773'074'570'100'000ULL); + + auto data = h.serialize(); + auto h2 = TimestampHistogram::deserialize(data.data(), data.size()); + + CHECK(h2.total_count() == 2); + CHECK(h2.num_bins() == 2); + CHECK(h2.bins()[0].first == h.bins()[0].first); + CHECK(h2.bins()[1].first == h.bins()[1].first); + } + + TEST_CASE("serialization is compact with delta encoding") { + TimestampHistogram h; + // 100 consecutive bins (10s of data) + std::uint64_t base = 17'730'745'700ULL; // ~2026 timestamp / 100ms + for (std::uint64_t i = 0; i < 100; ++i) { + for (int j = 0; j < 50; ++j) { + h.add((base + i) * 100'000 + j * 1000); + } + } + + auto data = h.serialize(); + // 100 bins with delta=1 each = ~1 byte per delta + ~1 byte per count + // Plus header. Should be well under 500 bytes. 
+ CHECK(data.size() < 500); + } +} diff --git a/tests/utilities/composites/dft/aggregators/test_aggregation_augmentation.cpp b/tests/utilities/composites/dft/aggregators/test_aggregation_augmentation.cpp new file mode 100644 index 00000000..f155bff5 --- /dev/null +++ b/tests/utilities/composites/dft/aggregators/test_aggregation_augmentation.cpp @@ -0,0 +1,128 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include + +using namespace dftracer::utils::utilities::composites::dft::aggregators; + +namespace { + +AggregationBatch create_test_batch(std::uint64_t time_bucket, + std::uint64_t count, std::uint64_t ts, + std::uint64_t te) { + AggregationBatch batch; + AggregationKey key; + key.cat_id = 1; + key.name_id = 1; + key.pid = 100; + key.tid = 1; + key.time_bucket = time_bucket; + + AggregationMetrics metrics; + metrics.count = count; + metrics.ts = ts; + metrics.te = te; + metrics.duration.count = count; + metrics.duration.total = count * 1000; + metrics.duration.mean = 1000.0; + + batch.entries.emplace_back(key, metrics); + return batch; +} + +} // namespace + +TEST_SUITE("AggregationAugmentation") { + TEST_CASE("PassThrough - Same interval") { + auto batch = create_test_batch(0, 100, 0, 5000); + + AugmentationConfig config{5000, 5000}; + auto result = augment_batch(batch, config); + + CHECK_FALSE(result.has_approximated_entries); + REQUIRE(result.entries.size() == 1); + CHECK(result.entries[0].metrics.count == 100); + CHECK_FALSE(result.entries[0].is_approximated); + } + + TEST_CASE("Shrink - Merge buckets") { + AggregationBatch batch; + + for (std::uint64_t i = 0; i < 5; ++i) { + AggregationKey key; + key.cat_id = 1; + key.name_id = 1; + key.pid = 100; + key.tid = 1; + key.time_bucket = i; + + AggregationMetrics metrics; + metrics.count = 20; + metrics.ts = i * 1000; + metrics.te = (i + 1) * 1000; + metrics.duration.count = 20; + metrics.duration.total = 20000; + + batch.entries.emplace_back(key, metrics); + } + + AugmentationConfig config{1000, 5000}; 
// shrink 5x + auto result = augment_batch(batch, config); + + CHECK_FALSE(result.has_approximated_entries); + REQUIRE(result.entries.size() == 1); + CHECK(result.entries[0].metrics.count == 100); // 5 * 20 + CHECK(result.entries[0].key.time_bucket == 0); + } + + TEST_CASE("Expand - Split bucket") { + auto batch = create_test_batch(0, 100, 1000, 4000); + + AugmentationConfig config{5000, 1000}; // expand 5x + auto result = augment_batch(batch, config); + + CHECK(result.has_approximated_entries); + + std::uint64_t total_count = 0; + for (const auto& entry : result.entries) { + CHECK(entry.is_approximated); + CHECK(entry.key.time_bucket >= 1); + CHECK(entry.key.time_bucket <= 3); + total_count += entry.metrics.count; + CHECK(entry.count_ci.upper > 0); + } + + CHECK(total_count == 100); + } + + TEST_CASE("Expand - All events at same time") { + auto batch = create_test_batch(0, 100, 2500, 2500); + + AugmentationConfig config{5000, 1000}; + auto result = augment_batch(batch, config); + + CHECK(result.has_approximated_entries); + REQUIRE(result.entries.size() == 1); + CHECK(result.entries[0].key.time_bucket == 2); + CHECK(result.entries[0].metrics.count == 100); + } + + TEST_CASE("Poisson CI calculation") { + SUBCASE("Count = 100") { + auto ci = compute_poisson_ci(100.0); + CHECK(ci.lower == doctest::Approx(80.4).epsilon(0.01)); + CHECK(ci.upper == doctest::Approx(119.6).epsilon(0.01)); + } + + SUBCASE("Count = 4") { + auto ci = compute_poisson_ci(4.0); + CHECK(ci.lower == doctest::Approx(0.08).epsilon(0.1)); + CHECK(ci.upper == doctest::Approx(7.92).epsilon(0.1)); + } + + SUBCASE("Count = 0") { + auto ci = compute_poisson_ci(0.0); + CHECK(ci.lower == 0.0); + CHECK(ci.upper == 0.0); + } + } +} diff --git a/tests/utilities/composites/dft/aggregators/test_aggregation_metrics.cpp b/tests/utilities/composites/dft/aggregators/test_aggregation_metrics.cpp index 4cd826eb..eb00e16b 100644 --- a/tests/utilities/composites/dft/aggregators/test_aggregation_metrics.cpp +++ 
b/tests/utilities/composites/dft/aggregators/test_aggregation_metrics.cpp @@ -9,11 +9,11 @@ using namespace dftracer::utils::utilities::composites::dft::aggregators; TEST_SUITE("MetricStats") { TEST_CASE("MetricStats - Single value") { MetricStats stats; - std::uint64_t count = 1; - stats.update(42, count); + stats.update(42); + CHECK(stats.count == 1); CHECK(stats.mean == doctest::Approx(42.0)); - CHECK(stats.get_stddev(count) == 0.0); + CHECK(stats.get_stddev() == 0.0); CHECK(stats.total == 42); CHECK(stats.min == 42); CHECK(stats.max == 42); @@ -21,16 +21,17 @@ TEST_SUITE("MetricStats") { TEST_CASE("MetricStats - Two values") { MetricStats stats; - stats.update(10, 1); - stats.update(20, 2); + stats.update(10); + stats.update(20); + CHECK(stats.count == 2); CHECK(stats.mean == doctest::Approx(15.0)); CHECK(stats.total == 30); CHECK(stats.min == 10); CHECK(stats.max == 20); // stddev = sqrt(((10-15)^2 + (20-15)^2) / 1) = sqrt(50) ~ 7.071 - double stddev = stats.get_stddev(2); + double stddev = stats.get_stddev(); CHECK(stddev == doctest::Approx(std::sqrt(50.0)).epsilon(0.001)); } @@ -39,10 +40,11 @@ TEST_SUITE("MetricStats") { // Mean = 40/8 = 5.0 MetricStats stats; std::vector values = {2, 4, 4, 4, 5, 5, 7, 9}; - for (std::uint64_t i = 0; i < values.size(); ++i) { - stats.update(values[i], i + 1); + for (auto v : values) { + stats.update(v); } + CHECK(stats.count == 8); CHECK(stats.mean == doctest::Approx(5.0)); CHECK(stats.total == 40); CHECK(stats.min == 2); @@ -50,64 +52,67 @@ TEST_SUITE("MetricStats") { // Sample stddev = sqrt(sum((x-mean)^2) / (n-1)) // = sqrt((9+1+1+1+0+0+4+16)/7) = sqrt(32/7) ~ 2.138 - double stddev = stats.get_stddev(8); + double stddev = stats.get_stddev(); CHECK(stddev == doctest::Approx(std::sqrt(32.0 / 7.0)).epsilon(0.01)); } TEST_CASE("MetricStats - Identical values") { MetricStats stats; for (std::uint64_t i = 0; i < 10; ++i) { - stats.update(5, i + 1); + stats.update(5); } + CHECK(stats.count == 10); CHECK(stats.mean == 
doctest::Approx(5.0)); - CHECK(stats.get_stddev(10) == doctest::Approx(0.0).epsilon(1e-10)); - CHECK(stats.get_skewness(10) == doctest::Approx(0.0).epsilon(1e-10)); - CHECK(stats.get_kurtosis(10) == doctest::Approx(0.0).epsilon(1e-10)); + CHECK(stats.get_stddev() == doctest::Approx(0.0).epsilon(1e-10)); + CHECK(stats.get_skewness() == doctest::Approx(0.0).epsilon(1e-10)); + CHECK(stats.get_kurtosis() == doctest::Approx(0.0).epsilon(1e-10)); } TEST_CASE("MetricStats - Merge equivalence") { // Single-pass MetricStats single; std::vector all_values = {2, 4, 6, 8, 10, 12, 14, 16}; - for (std::uint64_t i = 0; i < all_values.size(); ++i) { - single.update(all_values[i], i + 1); + for (auto v : all_values) { + single.update(v); } // Split into two halves MetricStats first_half; for (std::uint64_t i = 0; i < 4; ++i) { - first_half.update(all_values[i], i + 1); + first_half.update(all_values[i]); } MetricStats second_half; for (std::uint64_t i = 0; i < 4; ++i) { - second_half.update(all_values[i + 4], i + 1); + second_half.update(all_values[i + 4]); } - std::uint64_t n1 = 4, n2 = 4, n = 8; - first_half.merge_from(second_half, n1, n2, n); + first_half.merge_from(second_half); + CHECK(first_half.count == single.count); CHECK(first_half.mean == doctest::Approx(single.mean).epsilon(0.001)); CHECK(first_half.total == single.total); CHECK(first_half.min == single.min); CHECK(first_half.max == single.max); - CHECK(first_half.get_stddev(n) == - doctest::Approx(single.get_stddev(n)).epsilon(0.01)); + CHECK(first_half.get_stddev() == + doctest::Approx(single.get_stddev()).epsilon(0.01)); } TEST_CASE("MetricStats - Merge with empty") { MetricStats stats; - stats.update(10, 1); - stats.update(20, 2); + stats.update(10); + stats.update(20); MetricStats empty_stats; double mean_before = stats.mean; std::uint64_t total_before = stats.total; + std::uint64_t count_before = stats.count; - stats.merge_from(empty_stats, 2, 0, 2); + stats.merge_from(empty_stats); + CHECK(stats.count == 
count_before); CHECK(stats.mean == doctest::Approx(mean_before)); CHECK(stats.total == total_before); } @@ -115,9 +120,10 @@ TEST_SUITE("MetricStats") { TEST_CASE("MetricStats - Percentile integration") { MetricStats stats; for (std::uint64_t i = 1; i <= 100; ++i) { - stats.update(i, i, true); // compute_percentiles = true + stats.update(i, true); // compute_percentiles = true } + CHECK(stats.count == 100); CHECK(stats.sketch != nullptr); CHECK_FALSE(stats.sketch->empty()); REQUIRE(stats.sketch != nullptr); @@ -133,20 +139,24 @@ TEST_SUITE("AggregationMetrics") { metrics.update_duration(100); CHECK(metrics.count == 1); + CHECK(metrics.duration.count == 1); CHECK(metrics.duration.total == 100); CHECK(metrics.duration.min == 100); CHECK(metrics.duration.max == 100); metrics.update_duration(200); CHECK(metrics.count == 2); + CHECK(metrics.duration.count == 2); CHECK(metrics.duration.total == 300); metrics.update_size(50); + CHECK(metrics.size.count == 1); CHECK(metrics.size.total == 50); CHECK(metrics.size.min == 50); CHECK(metrics.size.max == 50); metrics.update_size(150); + CHECK(metrics.size.count == 2); CHECK(metrics.size.total == 200); } @@ -191,19 +201,39 @@ TEST_SUITE("AggregationMetrics") { TEST_CASE("AggregationMetrics - update_custom_metric") { AggregationMetrics metrics; - // First call creates the metric metrics.update_duration(100); // increment count to 1 metrics.update_custom_metric("bytes_read", 1024); REQUIRE(metrics.custom_metrics != nullptr); CHECK(metrics.custom_metrics->count("bytes_read") == 1); + CHECK((*metrics.custom_metrics)["bytes_read"].count == 1); CHECK((*metrics.custom_metrics)["bytes_read"].total == 1024); - // Subsequent call updates it metrics.update_duration(200); // count = 2 metrics.update_custom_metric("bytes_read", 2048); + CHECK((*metrics.custom_metrics)["bytes_read"].count == 2); CHECK((*metrics.custom_metrics)["bytes_read"].total == 3072); } + TEST_CASE("AggregationMetrics - sparse custom metrics have correct count") { + 
AggregationMetrics metrics; + + // 3 events, but only 2 have the custom field + metrics.update_duration(100); + metrics.update_custom_metric("bytes_read", 1024); + + metrics.update_duration(200); + // no bytes_read for this event + + metrics.update_duration(300); + metrics.update_custom_metric("bytes_read", 2048); + + CHECK(metrics.count == 3); + CHECK(metrics.duration.count == 3); + CHECK((*metrics.custom_metrics)["bytes_read"].count == 2); + CHECK((*metrics.custom_metrics)["bytes_read"].mean == + doctest::Approx(1536.0)); // (1024+2048)/2 + } + TEST_CASE("AggregationMetrics - merge_from") { AggregationMetrics a, b; @@ -223,25 +253,48 @@ TEST_SUITE("AggregationMetrics") { a.merge_from(b); CHECK(a.count == 3); + CHECK(a.duration.count == 3); CHECK(a.duration.total == 600); // 100+200+300 + CHECK(a.size.count == 3); CHECK(a.size.total == 450); // 50+150+250 CHECK(a.ts == 500); // min of 1000, 500 CHECK(a.te == 1100); // max of 1100, 700 REQUIRE(a.custom_metrics != nullptr); + CHECK((*a.custom_metrics)["io_ops"].count == 3); CHECK((*a.custom_metrics)["io_ops"].total == 60); } - TEST_CASE("AggregationMetrics - get_stddev delegates") { + TEST_CASE("AggregationMetrics - merge sparse custom metrics") { + AggregationMetrics a, b; + + // a has 2 events, 1 with custom metric + a.update_duration(100); + a.update_custom_metric("bytes", 500); + a.update_duration(200); + + // b has 1 event with custom metric + b.update_duration(300); + b.update_custom_metric("bytes", 1000); + + a.merge_from(b); + + CHECK(a.count == 3); + auto& bytes = (*a.custom_metrics)["bytes"]; + CHECK(bytes.count == 2); // only 2 events had bytes, not 3 + CHECK(bytes.total == 1500); + CHECK(bytes.mean == doctest::Approx(750.0)); + } + + TEST_CASE("AggregationMetrics - get_stddev via MetricStats") { AggregationMetrics metrics; metrics.update_duration(10); metrics.update_duration(20); metrics.update_size(30); metrics.update_size(40); - double dur_stddev = metrics.get_stddev_duration(); - CHECK(dur_stddev == 
doctest::Approx(metrics.duration.get_stddev(2))); - - double size_stddev = metrics.get_stddev_size(); - CHECK(size_stddev == doctest::Approx(metrics.size.get_stddev(2))); + CHECK(metrics.duration.get_stddev() == + doctest::Approx(std::sqrt(50.0)).epsilon(0.01)); + CHECK(metrics.size.get_stddev() == + doctest::Approx(std::sqrt(50.0)).epsilon(0.01)); } } diff --git a/tests/utilities/composites/dft/aggregators/test_aggregation_serialization.cpp b/tests/utilities/composites/dft/aggregators/test_aggregation_serialization.cpp new file mode 100644 index 00000000..dfb074da --- /dev/null +++ b/tests/utilities/composites/dft/aggregators/test_aggregation_serialization.cpp @@ -0,0 +1,205 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include + +using namespace dftracer::utils::utilities::composites::dft::aggregators; + +TEST_SUITE("AggregationSerialization") { + TEST_CASE("key roundtrip - basic") { + auto& intern = aggregation_intern(); + AggregationKey key; + key.cat_id = intern.get_or_insert("POSIX"); + key.name_id = intern.get_or_insert("read"); + key.pid = 12345; + key.tid = 67890; + key.hhash_id = intern.get_or_insert("abc123"); + key.fhash_id = intern.get_or_insert("def456"); + key.time_bucket = 5000000; + + auto data = serialize_agg_key(42, AggMapType::EVENT, key); + auto result = deserialize_agg_key(data); + + CHECK(result.map_type == AggMapType::EVENT); + CHECK(result.key.cat() == "POSIX"); + CHECK(result.key.name() == "read"); + CHECK(result.key.pid == key.pid); + CHECK(result.key.tid == key.tid); + CHECK(result.key.hhash() == "abc123"); + CHECK(result.key.fhash() == "def456"); + CHECK(result.key.time_bucket == key.time_bucket); + CHECK(result.key.extra_keys == nullptr); + } + + TEST_CASE("key roundtrip - with extra keys") { + auto& intern = aggregation_intern(); + AggregationKey key; + key.cat_id = intern.get_or_insert("MPI"); + key.name_id = intern.get_or_insert("send"); + key.pid = 100; + key.tid = 200; + key.time_bucket = 1000000; + key.extra_keys = 
std::make_unique< + std::vector>>(); + auto ek_a = intern.get_or_insert("epoch"); + auto ev_a = intern.get_or_insert("1"); + auto ek_b = intern.get_or_insert("step"); + auto ev_b = intern.get_or_insert("42"); + key.extra_keys->emplace_back(ek_a, ev_a); + key.extra_keys->emplace_back(ek_b, ev_b); + + auto data = serialize_agg_key(99, AggMapType::PROFILE, key); + auto result = deserialize_agg_key(data); + + CHECK(result.map_type == AggMapType::PROFILE); + CHECK(result.key.cat() == "MPI"); + REQUIRE(result.key.extra_keys != nullptr); + REQUIRE(result.key.extra_keys->size() == 2); + CHECK(intern.resolve((*result.key.extra_keys)[0].first) == "epoch"); + CHECK(intern.resolve((*result.key.extra_keys)[0].second) == "1"); + CHECK(intern.resolve((*result.key.extra_keys)[1].first) == "step"); + CHECK(intern.resolve((*result.key.extra_keys)[1].second) == "42"); + } + + TEST_CASE("key roundtrip - map type preserved") { + auto& intern = aggregation_intern(); + AggregationKey key; + key.cat_id = intern.get_or_insert("CAT"); + key.name_id = intern.get_or_insert("NAME"); + key.pid = 1; + key.tid = 1; + key.time_bucket = 1000000; + + for (auto mt : + {AggMapType::EVENT, AggMapType::PROFILE, AggMapType::SYSTEM}) { + auto data = serialize_agg_key(0, mt, key); + auto result = deserialize_agg_key(data); + CHECK(result.map_type == mt); + } + } + + TEST_CASE("key sort order - shard prefix") { + auto& intern = aggregation_intern(); + AggregationKey a, b; + a.cat_id = intern.get_or_insert("AAA"); + a.name_id = intern.get_or_insert("aaa"); + a.pid = 1; + a.tid = 1; + a.time_bucket = 1000000; + + b = a; + b.cat_id = intern.get_or_insert("BBB"); + auto ka = serialize_agg_key(0, AggMapType::EVENT, a); + auto kb = serialize_agg_key(0, AggMapType::EVENT, b); + CHECK(ka < kb); + } + + TEST_CASE("key uniqueness - different time_bucket") { + auto& intern = aggregation_intern(); + AggregationKey a, b; + a.cat_id = intern.get_or_insert("AAA"); + a.name_id = intern.get_or_insert("aaa"); + a.pid = 1; + 
a.tid = 1; + a.time_bucket = 1000000; + + b = a; + b.time_bucket = 2000000; + + auto ka = serialize_agg_key(0, AggMapType::EVENT, a); + auto kb = serialize_agg_key(0, AggMapType::EVENT, b); + CHECK(ka != kb); + } + + TEST_CASE("value roundtrip - basic") { + AggregationMetrics m; + m.count = 100; + m.duration.count = 100; + m.duration.total = 5000; + m.duration.min = 10; + m.duration.max = 200; + m.duration.mean = 50.0; + m.duration.m2 = 1234.5; + m.size.count = 50; + m.size.total = 2000; + m.size.min = 5; + m.size.max = 100; + m.size.mean = 40.0; + m.ts = 1000000; + m.te = 2000000; + m.parent_pid = 42; + + auto data = serialize_agg_value(m); + auto m2 = deserialize_agg_value(data); + + CHECK(m2.count == 100); + CHECK(m2.duration.count == 100); + CHECK(m2.duration.total == 5000); + CHECK(m2.duration.min == 10); + CHECK(m2.duration.max == 200); + CHECK(m2.duration.mean == doctest::Approx(50.0)); + CHECK(m2.duration.m2 == doctest::Approx(1234.5)); + CHECK(m2.size.count == 50); + CHECK(m2.size.total == 2000); + CHECK(m2.ts == 1000000); + CHECK(m2.te == 2000000); + CHECK(m2.parent_pid == 42); + CHECK(m2.custom_metrics == nullptr); + } + + TEST_CASE("value roundtrip - with custom metrics") { + AggregationMetrics m; + m.count = 10; + m.duration.count = 10; + m.duration.total = 500; + m.duration.min = 10; + m.duration.max = 100; + m.duration.mean = 50.0; + m.ts = 100; + m.te = 200; + m.custom_metrics = std::make_unique(); + MetricStats cm; + cm.count = 5; + cm.total = 250; + cm.min = 20; + cm.max = 80; + cm.mean = 50.0; + cm.m2 = 100.0; + m.custom_metrics->emplace("offset", std::move(cm)); + + auto data = serialize_agg_value(m); + auto m2 = deserialize_agg_value(data); + + REQUIRE(m2.custom_metrics != nullptr); + REQUIRE(m2.custom_metrics->count("offset") == 1); + auto& cm2 = m2.custom_metrics->at("offset"); + CHECK(cm2.count == 5); + CHECK(cm2.total == 250); + CHECK(cm2.min == 20); + CHECK(cm2.max == 80); + CHECK(cm2.mean == doctest::Approx(50.0)); + } + + 
TEST_CASE("value roundtrip - with sketch") { + AggregationMetrics m; + m.count = 3; + m.duration.count = 3; + m.duration.total = 300; + m.duration.min = 50; + m.duration.max = 150; + m.duration.mean = 100.0; + m.ts = 100; + m.te = 200; + + m.duration.update(50, true); + m.duration.update(100, true); + m.duration.update(150, true); + + REQUIRE(m.duration.sketch != nullptr); + + auto data = serialize_agg_value(m); + auto m2 = deserialize_agg_value(data); + + REQUIRE(m2.duration.sketch != nullptr); + CHECK(m2.duration.sketch->count() == m.duration.sketch->count()); + } +} diff --git a/tests/utilities/composites/dft/aggregators/test_aggregator_utility.cpp b/tests/utilities/composites/dft/aggregators/test_aggregator_utility.cpp index ab59b514..e6df7b44 100644 --- a/tests/utilities/composites/dft/aggregators/test_aggregator_utility.cpp +++ b/tests/utilities/composites/dft/aggregators/test_aggregator_utility.cpp @@ -1,6 +1,10 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include +#include +#include +#include +#include #include #include #include @@ -8,31 +12,20 @@ #include #include +using namespace dftracer::utils; using namespace dftracer::utils::utilities::composites::dft::aggregators; using namespace dftracer::utils::coro; using namespace dft_utils_test; -namespace { - -static CoroTask> collect_batches( - AsyncGenerator gen) { - std::vector batches; - while (auto batch = co_await gen.next()) { - batches.push_back(std::move(*batch)); - } - co_return batches; -} - -} // namespace - TEST_SUITE("AggregatorUtility") { TEST_CASE("Collects event profile and system counter batches end-to-end") { TestEnvironment env(0); REQUIRE(env.is_valid()); - auto trace = fs::path(env.get_dir()) / "mixed_trace.pfw"; + auto trace_plain = fs::path(env.get_dir()) / "mixed_trace.pfw"; + auto trace = fs::path(env.get_dir()) / "mixed_trace.pfw.gz"; { - std::ofstream out(trace); + std::ofstream out(trace_plain); out << 
R"({"name":"read","cat":"POSIX","pid":7,"tid":3,"ts":1000,"dur":50,"ph":"X","args":{"ret":64,"bytes":64,"hhash":"event_h","fhash":"event_f"}})" << "\n"; out << R"({"name":"cpu_usage","cat":"PROFILE","pid":7,"tid":3,"ts":1500,"dur":0,"ph":"C","args":{"count":4,"dur_sum":80,"dur_min":10,"dur_max":30,"ret_sum":400,"ret_min":50,"ret_max":150,"bytes_sum":1000,"bytes_min":100,"bytes_max":400,"hhash":"profile_h","fhash":"profile_f"}})" @@ -40,6 +33,8 @@ TEST_SUITE("AggregatorUtility") { out << R"({"name":"mem_bw","cat":"sys","pid":7,"tid":3,"ts":2500,"dur":0,"ph":"C","args":{"count":2,"dur_sum":40,"dur_min":15,"dur_max":25,"ret_sum":600,"ret_min":250,"ret_max":350,"bytes_sum":1200,"bytes_min":500,"bytes_max":700,"hhash":"system_h","fhash":"system_f"}})" << "\n"; } + REQUIRE(compress_file_to_gzip(trace_plain.string(), trace.string())); + fs::remove(trace_plain); AggregatorInput input; input.directory = env.get_dir(); @@ -49,8 +44,25 @@ TEST_SUITE("AggregatorUtility") { input.config.custom_metric_fields = {"bytes"}; input.config.track_process_parents = false; - auto batches = - collect_batches(AggregatorUtility{}.process(input)).get(); + Executor executor(ExecutorConfig{.num_threads = 2}); + Scheduler scheduler(&executor); + + std::vector batches; + auto task = make_task( + [&](CoroScope& ctx) -> coro::CoroTask { + AggregatorUtility agg; + agg.bind_context(ctx); + auto gen = agg.process(input); + while (auto batch = co_await gen.next()) { + batches.push_back(std::move(*batch)); + } + agg.unbind_context(); + }, + "AggregatorTest"); + + scheduler.schedule(task); + task->wait(); + executor.shutdown(); REQUIRE(batches.size() == 3); @@ -77,32 +89,30 @@ TEST_SUITE("AggregatorUtility") { CHECK(profile_batch->entries.size() == 1); CHECK(system_batch->entries.size() == 1); - const auto& [event_key, event_metrics] = event_batch->entries.front(); - CHECK(event_key.cat() == "POSIX"); - CHECK(event_key.name() == "read"); - CHECK(event_metrics.count == 1); - 
CHECK(event_metrics.duration.total == 50); - CHECK(event_metrics.size.total == 64); - - const auto& [profile_key, profile_metrics] = - profile_batch->entries.front(); - CHECK(profile_key.cat() == "PROFILE"); - CHECK(profile_key.name() == "cpu_usage"); - CHECK(profile_metrics.count == 4); - CHECK(profile_metrics.duration.total == 80); - CHECK(profile_metrics.size.total == 400); - REQUIRE(profile_metrics.custom_metrics != nullptr); - CHECK((*profile_metrics.custom_metrics)["bytes"].total == 1000); - - const auto& [system_key, system_metrics] = - system_batch->entries.front(); - CHECK(system_key.cat() == "sys"); - CHECK(system_key.name() == "mem_bw"); - CHECK(system_metrics.count == 2); - CHECK(system_metrics.duration.total == 40); - CHECK(system_metrics.size.total == 600); - REQUIRE(system_metrics.custom_metrics != nullptr); - CHECK((*system_metrics.custom_metrics)["bytes"].total == 1200); + const auto& event_entry = event_batch->entries.front(); + CHECK(event_entry.key.cat() == "POSIX"); + CHECK(event_entry.key.name() == "read"); + CHECK(event_entry.metrics.count == 1); + CHECK(event_entry.metrics.duration.total == 50); + CHECK(event_entry.metrics.size.total == 64); + + const auto& profile_entry = profile_batch->entries.front(); + CHECK(profile_entry.key.cat() == "PROFILE"); + CHECK(profile_entry.key.name() == "cpu_usage"); + CHECK(profile_entry.metrics.count == 4); + CHECK(profile_entry.metrics.duration.total == 80); + CHECK(profile_entry.metrics.size.total == 400); + REQUIRE(profile_entry.metrics.custom_metrics != nullptr); + CHECK((*profile_entry.metrics.custom_metrics)["bytes"].total == 1000); + + const auto& system_entry = system_batch->entries.front(); + CHECK(system_entry.key.cat() == "sys"); + CHECK(system_entry.key.name() == "mem_bw"); + CHECK(system_entry.metrics.count == 2); + CHECK(system_entry.metrics.duration.total == 40); + CHECK(system_entry.metrics.size.total == 600); + REQUIRE(system_entry.metrics.custom_metrics != nullptr); + 
CHECK((*system_entry.metrics.custom_metrics)["bytes"].total == 1200); CHECK(event_batch->total_events_processed == 3); CHECK(profile_batch->total_events_processed == 3); diff --git a/tests/utilities/composites/dft/aggregators/test_event_aggregator_utility.cpp b/tests/utilities/composites/dft/aggregators/test_event_aggregator_utility.cpp index 490bc219..9ab786c9 100644 --- a/tests/utilities/composites/dft/aggregators/test_event_aggregator_utility.cpp +++ b/tests/utilities/composites/dft/aggregators/test_event_aggregator_utility.cpp @@ -1,5 +1,5 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN -#include +#include #include using namespace dftracer::utils::utilities::composites::dft::aggregators; @@ -28,7 +28,7 @@ AggregationMetrics make_metrics(std::uint64_t count, std::uint64_t dur_total) { } // namespace -TEST_SUITE("EventAggregatorUtility") { +TEST_SUITE("EventAggregator") { TEST_CASE("Merges event profile and system maps independently") { ChunkAggregationOutput first; first.success = true; @@ -50,7 +50,7 @@ TEST_SUITE("EventAggregatorUtility") { second.profile_aggregations.emplace(make_key("PROFILE", "cpu"), make_metrics(1, 30)); - EventAggregatorUtility utility; + EventAggregator utility; utility.merge_chunk(std::move(first)); utility.merge_chunk(std::move(second)); auto output = utility.finalize(); diff --git a/tests/utilities/composites/dft/aggregators/test_system_metrics.cpp b/tests/utilities/composites/dft/aggregators/test_system_metrics.cpp new file mode 100644 index 00000000..a30880ad --- /dev/null +++ b/tests/utilities/composites/dft/aggregators/test_system_metrics.cpp @@ -0,0 +1,309 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include + +#include +#include + +using namespace dftracer::utils::utilities::composites::dft::aggregators; + +TEST_SUITE("FloatMetricStats") { + TEST_CASE("default construction") { + FloatMetricStats stats; + CHECK(stats.count == 0); + CHECK(stats.total == 0.0); + CHECK(stats.min == std::numeric_limits::max()); 
+ CHECK(stats.max == std::numeric_limits::lowest()); + CHECK(stats.mean == 0.0); + CHECK(stats.m2 == 0.0); + CHECK(stats.sketch == nullptr); + } + + TEST_CASE("single value update") { + FloatMetricStats stats; + stats.update(42.5); + + CHECK(stats.count == 1); + CHECK(stats.total == doctest::Approx(42.5)); + CHECK(stats.min == doctest::Approx(42.5)); + CHECK(stats.max == doctest::Approx(42.5)); + CHECK(stats.mean == doctest::Approx(42.5)); + CHECK(stats.get_stddev() == doctest::Approx(0.0)); + } + + TEST_CASE("multiple values update") { + FloatMetricStats stats; + stats.update(10.0); + stats.update(20.0); + stats.update(30.0); + + CHECK(stats.count == 3); + CHECK(stats.total == doctest::Approx(60.0)); + CHECK(stats.min == doctest::Approx(10.0)); + CHECK(stats.max == doctest::Approx(30.0)); + CHECK(stats.mean == doctest::Approx(20.0)); + CHECK(stats.get_stddev() == doctest::Approx(10.0)); + } + + TEST_CASE("update with percentiles") { + FloatMetricStats stats; + stats.update(10.0, true); + stats.update(20.0, true); + stats.update(30.0, true); + + CHECK(stats.sketch != nullptr); + CHECK(stats.sketch->quantile(0.5) == + doctest::Approx(20.0).epsilon(0.1)); + } + + TEST_CASE("merge_from empty into empty") { + FloatMetricStats a, b; + a.merge_from(b); + + CHECK(a.count == 0); + CHECK(a.total == 0.0); + } + + TEST_CASE("merge_from populated into empty") { + FloatMetricStats a, b; + b.update(10.0); + b.update(20.0); + + a.merge_from(b); + + CHECK(a.count == 2); + CHECK(a.total == doctest::Approx(30.0)); + CHECK(a.min == doctest::Approx(10.0)); + CHECK(a.max == doctest::Approx(20.0)); + CHECK(a.mean == doctest::Approx(15.0)); + } + + TEST_CASE("merge_from two populated stats") { + FloatMetricStats a, b; + a.update(10.0); + a.update(20.0); + b.update(30.0); + b.update(40.0); + + a.merge_from(b); + + CHECK(a.count == 4); + CHECK(a.total == doctest::Approx(100.0)); + CHECK(a.min == doctest::Approx(10.0)); + CHECK(a.max == doctest::Approx(40.0)); + CHECK(a.mean == 
doctest::Approx(25.0)); + } + + TEST_CASE("merge_from with sketches") { + FloatMetricStats a, b; + a.update(10.0, true); + a.update(20.0, true); + b.update(30.0, true); + b.update(40.0, true); + + a.merge_from(b); + + CHECK(a.sketch != nullptr); + CHECK(a.count == 4); + } + + TEST_CASE("copy construction") { + FloatMetricStats original; + original.update(10.0, true); + original.update(20.0, true); + + FloatMetricStats copy(original); + + CHECK(copy.count == original.count); + CHECK(copy.total == original.total); + CHECK(copy.min == original.min); + CHECK(copy.max == original.max); + CHECK(copy.mean == original.mean); + CHECK(copy.sketch != nullptr); + CHECK(copy.sketch != original.sketch); + } +} + +TEST_SUITE("SystemAggregationMetrics") { + TEST_CASE("default construction") { + SystemAggregationMetrics metrics; + CHECK(metrics.count == 0); + CHECK(metrics.ts == std::numeric_limits::max()); + CHECK(metrics.te == 0); + CHECK(metrics.metrics == nullptr); + } + + TEST_CASE("update_metric creates metrics map") { + SystemAggregationMetrics metrics; + metrics.update_metric("cpu_usage", 50.0); + + CHECK(metrics.metrics != nullptr); + CHECK(metrics.metrics->size() == 1); + CHECK(metrics.metrics->at("cpu_usage").count == 1); + CHECK(metrics.metrics->at("cpu_usage").mean == doctest::Approx(50.0)); + } + + TEST_CASE("update_metric multiple metrics") { + SystemAggregationMetrics metrics; + metrics.update_metric("cpu_usage", 50.0); + metrics.update_metric("memory_usage", 70.0); + metrics.update_metric("cpu_usage", 60.0); + + CHECK(metrics.metrics->size() == 2); + CHECK(metrics.metrics->at("cpu_usage").count == 2); + CHECK(metrics.metrics->at("cpu_usage").mean == doctest::Approx(55.0)); + CHECK(metrics.metrics->at("memory_usage").count == 1); + } + + TEST_CASE("update_timestamp") { + SystemAggregationMetrics metrics; + metrics.update_timestamp(1000); + metrics.update_timestamp(500); + metrics.update_timestamp(1500); + + CHECK(metrics.ts == 500); + CHECK(metrics.te == 1500); + } 
+ + TEST_CASE("merge_from empty into empty") { + SystemAggregationMetrics a, b; + a.merge_from(b); + + CHECK(a.count == 0); + CHECK(a.metrics == nullptr); + } + + TEST_CASE("merge_from populated into empty") { + SystemAggregationMetrics a, b; + b.count = 2; + b.ts = 100; + b.te = 200; + b.update_metric("cpu", 50.0); + + a.merge_from(b); + + CHECK(a.count == 2); + CHECK(a.ts == 100); + CHECK(a.te == 200); + CHECK(a.metrics != nullptr); + CHECK(a.metrics->at("cpu").count == 1); + } + + TEST_CASE("merge_from two populated metrics") { + SystemAggregationMetrics a, b; + a.count = 2; + a.ts = 100; + a.te = 200; + a.update_metric("cpu", 40.0); + a.update_metric("cpu", 60.0); + + b.count = 2; + b.ts = 50; + b.te = 250; + b.update_metric("cpu", 50.0); + b.update_metric("memory", 80.0); + + a.merge_from(b); + + CHECK(a.count == 4); + CHECK(a.ts == 50); + CHECK(a.te == 250); + CHECK(a.metrics->size() == 2); + CHECK(a.metrics->at("cpu").count == 3); + CHECK(a.metrics->at("memory").count == 1); + } + + TEST_CASE("copy construction") { + SystemAggregationMetrics original; + original.count = 5; + original.ts = 100; + original.te = 500; + original.update_metric("cpu", 50.0); + + SystemAggregationMetrics copy(original); + + CHECK(copy.count == original.count); + CHECK(copy.ts == original.ts); + CHECK(copy.te == original.te); + CHECK(copy.metrics != nullptr); + CHECK(copy.metrics != original.metrics); + CHECK(copy.metrics->at("cpu").count == 1); + } +} + +TEST_SUITE("SystemMetricsSerialization") { + TEST_CASE("key serialization round-trip") { + std::string hhash = "host123"; + std::uint64_t time_bucket = 42; + + std::string serialized = serialize_system_key(hhash, time_bucket); + auto deserialized = deserialize_system_key(serialized); + + CHECK(deserialized.key.hhash == hhash); + CHECK(deserialized.key.time_bucket == time_bucket); + } + + TEST_CASE("value serialization round-trip - empty metrics") { + SystemAggregationMetrics original; + original.count = 10; + original.ts = 1000; + 
original.te = 2000; + + std::string serialized = serialize_system_value(original); + auto deserialized = deserialize_system_value(serialized); + + CHECK(deserialized.count == original.count); + CHECK(deserialized.ts == original.ts); + CHECK(deserialized.te == original.te); + CHECK(deserialized.metrics == nullptr); + } + + TEST_CASE("value serialization round-trip - with metrics") { + SystemAggregationMetrics original; + original.count = 10; + original.ts = 1000; + original.te = 2000; + original.update_metric("cpu_user", 25.5); + original.update_metric("cpu_user", 30.0); + original.update_metric("cpu_system", 5.0); + original.update_metric("memory_available", 8000000.0); + + std::string serialized = serialize_system_value(original); + auto deserialized = deserialize_system_value(serialized); + + CHECK(deserialized.count == original.count); + CHECK(deserialized.ts == original.ts); + CHECK(deserialized.te == original.te); + REQUIRE(deserialized.metrics != nullptr); + CHECK(deserialized.metrics->size() == 3); + + auto& cpu_user = deserialized.metrics->at("cpu_user"); + CHECK(cpu_user.count == 2); + CHECK(cpu_user.mean == doctest::Approx(27.75)); + CHECK(cpu_user.min == doctest::Approx(25.5)); + CHECK(cpu_user.max == doctest::Approx(30.0)); + + auto& cpu_system = deserialized.metrics->at("cpu_system"); + CHECK(cpu_system.count == 1); + CHECK(cpu_system.mean == doctest::Approx(5.0)); + + auto& memory = deserialized.metrics->at("memory_available"); + CHECK(memory.count == 1); + CHECK(memory.mean == doctest::Approx(8000000.0)); + } + + TEST_CASE("value serialization preserves variance") { + SystemAggregationMetrics original; + original.count = 3; + original.update_metric("test", 10.0); + original.update_metric("test", 20.0); + original.update_metric("test", 30.0); + + std::string serialized = serialize_system_value(original); + auto deserialized = deserialize_system_value(serialized); + + auto& test_stats = deserialized.metrics->at("test"); + CHECK(test_stats.get_stddev() 
== doctest::Approx(10.0)); + } +} diff --git a/tests/utilities/composites/dft/aggregators/test_system_metrics_merge_operator.cpp b/tests/utilities/composites/dft/aggregators/test_system_metrics_merge_operator.cpp new file mode 100644 index 00000000..b96008e8 --- /dev/null +++ b/tests/utilities/composites/dft/aggregators/test_system_metrics_merge_operator.cpp @@ -0,0 +1,183 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace dftracer::utils::utilities::composites::dft::aggregators; + +TEST_SUITE("SystemMetricsMergeOperator") { + TEST_CASE("Name returns correct identifier") { + SystemMetricsMergeOperator op; + CHECK(std::string(op.Name()) == "SystemMetricsMergeOperator"); + } + + TEST_CASE("PartialMerge combines two operands") { + SystemMetricsMergeOperator op; + + // Create two system metrics + SystemAggregationMetrics left; + left.count = 2; + left.ts = 100; + left.te = 200; + left.update_metric("cpu_user", 40.0); + left.update_metric("cpu_user", 50.0); + + SystemAggregationMetrics right; + right.count = 1; + right.ts = 150; + right.te = 250; + right.update_metric("cpu_user", 60.0); + right.update_metric("memory", 1000.0); + + std::string left_serialized = serialize_system_value(left); + std::string right_serialized = serialize_system_value(right); + + std::string new_value; + ::rocksdb::Slice key("test_key"); + ::rocksdb::Slice left_slice(left_serialized); + ::rocksdb::Slice right_slice(right_serialized); + + bool result = + op.PartialMerge(key, left_slice, right_slice, &new_value, nullptr); + CHECK(result); + + auto merged = deserialize_system_value(new_value); + CHECK(merged.count == 3); + CHECK(merged.ts == 100); + CHECK(merged.te == 250); + REQUIRE(merged.metrics != nullptr); + CHECK(merged.metrics->size() == 2); + CHECK(merged.metrics->at("cpu_user").count == 3); + CHECK(merged.metrics->at("memory").count == 1); + } + + TEST_CASE("FullMergeV2 merges existing value 
with operands") { + SystemMetricsMergeOperator op; + + // Existing value + SystemAggregationMetrics existing; + existing.count = 1; + existing.ts = 50; + existing.te = 100; + existing.update_metric("cpu", 30.0); + std::string existing_serialized = serialize_system_value(existing); + + // First operand + SystemAggregationMetrics op1; + op1.count = 1; + op1.ts = 100; + op1.te = 150; + op1.update_metric("cpu", 40.0); + std::string op1_serialized = serialize_system_value(op1); + + // Second operand + SystemAggregationMetrics op2; + op2.count = 1; + op2.ts = 150; + op2.te = 200; + op2.update_metric("cpu", 50.0); + std::string op2_serialized = serialize_system_value(op2); + + ::rocksdb::Slice key("test_key"); + ::rocksdb::Slice existing_slice(existing_serialized); + std::vector<::rocksdb::Slice> operands = { + ::rocksdb::Slice(op1_serialized), ::rocksdb::Slice(op2_serialized)}; + + ::rocksdb::MergeOperator::MergeOperationInput merge_in( + key, &existing_slice, operands, nullptr); + std::string new_value; + ::rocksdb::Slice existing_operand; + ::rocksdb::MergeOperator::MergeOperationOutput merge_out( + new_value, existing_operand); + + bool result = op.FullMergeV2(merge_in, &merge_out); + CHECK(result); + + auto merged = deserialize_system_value(new_value); + CHECK(merged.count == 3); + CHECK(merged.ts == 50); + CHECK(merged.te == 200); + REQUIRE(merged.metrics != nullptr); + CHECK(merged.metrics->at("cpu").count == 3); + CHECK(merged.metrics->at("cpu").mean == doctest::Approx(40.0)); + } + + TEST_CASE("FullMergeV2 handles null existing value") { + SystemMetricsMergeOperator op; + + // First operand + SystemAggregationMetrics op1; + op1.count = 2; + op1.ts = 100; + op1.te = 200; + op1.update_metric("memory", 1000.0); + std::string op1_serialized = serialize_system_value(op1); + + // Second operand + SystemAggregationMetrics op2; + op2.count = 3; + op2.ts = 200; + op2.te = 300; + op2.update_metric("memory", 2000.0); + std::string op2_serialized = 
serialize_system_value(op2); + + ::rocksdb::Slice key("test_key"); + std::vector<::rocksdb::Slice> operands = { + ::rocksdb::Slice(op1_serialized), ::rocksdb::Slice(op2_serialized)}; + + ::rocksdb::MergeOperator::MergeOperationInput merge_in( + key, nullptr, operands, nullptr); + std::string new_value; + ::rocksdb::Slice existing_operand; + ::rocksdb::MergeOperator::MergeOperationOutput merge_out( + new_value, existing_operand); + + bool result = op.FullMergeV2(merge_in, &merge_out); + CHECK(result); + + auto merged = deserialize_system_value(new_value); + CHECK(merged.count == 5); + CHECK(merged.ts == 100); + CHECK(merged.te == 300); + REQUIRE(merged.metrics != nullptr); + CHECK(merged.metrics->at("memory").count == 2); + } + + TEST_CASE("FullMergeV2 handles single operand") { + SystemMetricsMergeOperator op; + + SystemAggregationMetrics op1; + op1.count = 5; + op1.ts = 100; + op1.te = 500; + op1.update_metric("disk_io", 100.0); + std::string op1_serialized = serialize_system_value(op1); + + ::rocksdb::Slice key("test_key"); + std::vector<::rocksdb::Slice> operands = { + ::rocksdb::Slice(op1_serialized)}; + + ::rocksdb::MergeOperator::MergeOperationInput merge_in( + key, nullptr, operands, nullptr); + std::string new_value; + ::rocksdb::Slice existing_operand; + ::rocksdb::MergeOperator::MergeOperationOutput merge_out( + new_value, existing_operand); + + bool result = op.FullMergeV2(merge_in, &merge_out); + CHECK(result); + + auto merged = deserialize_system_value(new_value); + CHECK(merged.count == 5); + CHECK(merged.ts == 100); + CHECK(merged.te == 500); + REQUIRE(merged.metrics != nullptr); + CHECK(merged.metrics->at("disk_io").count == 1); + } +} diff --git a/tests/utilities/composites/dft/comparator/test_comparison_result.cpp b/tests/utilities/composites/dft/comparator/test_comparison_result.cpp index 32737a80..585ee2e0 100644 --- a/tests/utilities/composites/dft/comparator/test_comparison_result.cpp +++ 
b/tests/utilities/composites/dft/comparator/test_comparison_result.cpp @@ -5,19 +5,29 @@ #include #include -#include using namespace dftracer::utils::utilities::composites::dft::comparator; using namespace dftracer::utils::utilities::composites::dft::aggregators; -static MetricStats make_stats(double mean, double m2, uint64_t total, - uint64_t min_val, uint64_t max_val) { +// MetricStats representation change: `m2` now holds the raw power sum +// `sum_x^2` (not Welford central M2). The caller passes `central_m2` +// (central moment, = sum((x-mean)^2)); we translate it to raw via +// raw_sum_x^2 = central_m2 + n * mean^2 +// so callers keep Welford semantics but we store the new canonical form. +static MetricStats make_stats(double mean, double central_m2, uint64_t total, + uint64_t min_val, uint64_t max_val, + uint64_t count = 0) { MetricStats s; s.mean = mean; - s.m2 = m2; s.total = total; s.min = min_val; s.max = max_val; + s.count = count; + // If count not provided, fall back to total/mean ratio (integer-rounded). + const double n = + count > 0 ? static_cast(count) + : (mean != 0.0 ? 
static_cast(total) / mean : 0.0); + s.m2 = central_m2 + n * mean * mean; return s; } diff --git a/tests/utilities/composites/dft/indexing/test_bloom_query.cpp b/tests/utilities/composites/dft/indexing/test_bloom_query.cpp index c27c9b96..abea71f1 100644 --- a/tests/utilities/composites/dft/indexing/test_bloom_query.cpp +++ b/tests/utilities/composites/dft/indexing/test_bloom_query.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -19,17 +20,13 @@ using dftracer::utils::utilities::indexer::internal::get_logical_path; static void populate_test_idx(const std::string& index_path, const std::string& file_path) { IndexDatabase idx_db(index_path); - idx_db.init_base_schema(); - idx_db.init_bloom_schema(); + auto writer = idx_db.begin_write(); + writer->init_schema(); int fid = - idx_db.get_or_create_file_info(get_logical_path(file_path), 12345); + writer->get_or_create_file_info(get_logical_path(file_path), 12345); - idx_db.begin_transaction(); - - // Create chunk bloom filters for 3 checkpoints for (int ckpt = 0; ckpt < 3; ++ckpt) { - // name dimension BloomFilter name_bloom(100, 0.01); if (ckpt == 0) { name_bloom.add("read"); @@ -43,11 +40,10 @@ static void populate_test_idx(const std::string& index_path, } auto blob = name_bloom.serialize(); - idx_db.insert_chunk_bloom_filter( + writer->insert_chunk_bloom_filter( fid, static_cast(ckpt), "name", blob.data(), static_cast(blob.size()), name_bloom.num_entries()); - // cat dimension BloomFilter cat_bloom(100, 0.01); if (ckpt == 0 || ckpt == 2) { cat_bloom.add("POSIX"); @@ -56,12 +52,11 @@ static void populate_test_idx(const std::string& index_path, } auto cat_blob = cat_bloom.serialize(); - idx_db.insert_chunk_bloom_filter( + writer->insert_chunk_bloom_filter( fid, static_cast(ckpt), "cat", cat_blob.data(), static_cast(cat_blob.size()), cat_bloom.num_entries()); } - // Create file-level bloom filters (merged from all chunks) BloomFilter file_name_bloom(100, 0.01); file_name_bloom.add("read"); 
file_name_bloom.add("write"); @@ -69,42 +64,38 @@ static void populate_test_idx(const std::string& index_path, file_name_bloom.add("close"); file_name_bloom.add("stat"); auto name_blob = file_name_bloom.serialize(); - idx_db.insert_file_bloom_filter(fid, "name", name_blob.data(), - static_cast(name_blob.size()), - file_name_bloom.num_entries()); + writer->insert_file_bloom_filter(fid, "name", name_blob.data(), + static_cast(name_blob.size()), + file_name_bloom.num_entries()); BloomFilter file_cat_bloom(100, 0.01); file_cat_bloom.add("POSIX"); file_cat_bloom.add("storage"); auto cat_blob = file_cat_bloom.serialize(); - idx_db.insert_file_bloom_filter(fid, "cat", cat_blob.data(), - static_cast(cat_blob.size()), - file_cat_bloom.num_entries()); + writer->insert_file_bloom_filter(fid, "cat", cat_blob.data(), + static_cast(cat_blob.size()), + file_cat_bloom.num_entries()); - // Add fhash with resolution BloomFilter fhash_bloom(100, 0.01); fhash_bloom.add("abc123"); auto fhash_blob = fhash_bloom.serialize(); - idx_db.insert_file_bloom_filter(fid, "fhash", fhash_blob.data(), - static_cast(fhash_blob.size()), - fhash_bloom.num_entries()); + writer->insert_file_bloom_filter(fid, "fhash", fhash_blob.data(), + static_cast(fhash_blob.size()), + fhash_bloom.num_entries()); for (int ckpt = 0; ckpt < 3; ++ckpt) { auto blob = fhash_bloom.serialize(); - idx_db.insert_chunk_bloom_filter( + writer->insert_chunk_bloom_filter( fid, static_cast(ckpt), "fhash", blob.data(), static_cast(blob.size()), fhash_bloom.num_entries()); } - // Hash resolutions - idx_db.insert_hash_resolution(fid, "fhash", "abc123", "./data/file.h5"); - - // Record dimensions - idx_db.insert_index_dimension(fid, "name"); - idx_db.insert_index_dimension(fid, "cat"); - idx_db.insert_index_dimension(fid, "fhash"); + writer->insert_hash_table_entry(0, "abc123", "./data/file.h5"); - idx_db.commit_transaction(); + writer->insert_index_dimension(fid, "name"); + writer->insert_index_dimension(fid, "cat"); + 
writer->insert_index_dimension(fid, "fhash"); + writer->commit(); } TEST_SUITE("BloomQueryUtility") { diff --git a/tests/utilities/composites/dft/indexing/test_chunk_pruner.cpp b/tests/utilities/composites/dft/indexing/test_chunk_pruner.cpp index 5a606ad8..e4dbad4b 100644 --- a/tests/utilities/composites/dft/indexing/test_chunk_pruner.cpp +++ b/tests/utilities/composites/dft/indexing/test_chunk_pruner.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -20,89 +21,84 @@ using dftracer::utils::utilities::indexer::internal::get_logical_path; static void populate_test_idx(const std::string& index_path, const std::string& file_path) { IndexDatabase idx_db(index_path); - idx_db.init_base_schema(); - idx_db.init_bloom_schema(); + auto writer = idx_db.begin_write(); + writer->init_schema(); int fid = - idx_db.get_or_create_file_info(get_logical_path(file_path), 12345); + writer->get_or_create_file_info(get_logical_path(file_path), 12345); - idx_db.begin_transaction(); - - // Chunk 0: POSIX reads, dur 100-200 { ChunkDimensionStats cat_ds; cat_ds.dimension = "cat"; cat_ds.value_type = "string"; cat_ds.observe("POSIX"); cat_ds.observe("POSIX"); - idx_db.insert_chunk_dimension_stats(fid, 0, cat_ds); + writer->insert_chunk_dimension_stats(fid, 0, cat_ds); ChunkDimensionStats name_ds; name_ds.dimension = "name"; name_ds.value_type = "string"; name_ds.observe("read"); name_ds.observe("read"); - idx_db.insert_chunk_dimension_stats(fid, 0, name_ds); + writer->insert_chunk_dimension_stats(fid, 0, name_ds); ChunkDimensionStats dur_ds; dur_ds.dimension = "dur"; dur_ds.value_type = "uint"; dur_ds.observe("100"); dur_ds.observe("200"); - idx_db.insert_chunk_dimension_stats(fid, 0, dur_ds); + writer->insert_chunk_dimension_stats(fid, 0, dur_ds); - idx_db.insert_index_dimension(fid, "cat"); - idx_db.insert_index_dimension(fid, "name"); - idx_db.insert_index_dimension(fid, "dur"); + writer->insert_index_dimension(fid, "cat"); + writer->insert_index_dimension(fid, 
"name"); + writer->insert_index_dimension(fid, "dur"); } - // Chunk 1: STDIO writes, dur 500-600 { ChunkDimensionStats cat_ds; cat_ds.dimension = "cat"; cat_ds.value_type = "string"; cat_ds.observe("STDIO"); - idx_db.insert_chunk_dimension_stats(fid, 1, cat_ds); + writer->insert_chunk_dimension_stats(fid, 1, cat_ds); ChunkDimensionStats name_ds; name_ds.dimension = "name"; name_ds.value_type = "string"; name_ds.observe("write"); - idx_db.insert_chunk_dimension_stats(fid, 1, name_ds); + writer->insert_chunk_dimension_stats(fid, 1, name_ds); ChunkDimensionStats dur_ds; dur_ds.dimension = "dur"; dur_ds.value_type = "uint"; dur_ds.observe("500"); dur_ds.observe("600"); - idx_db.insert_chunk_dimension_stats(fid, 1, dur_ds); + writer->insert_chunk_dimension_stats(fid, 1, dur_ds); } - // Chunk 2: POSIX + MPI mixed, dur 50-1000 { ChunkDimensionStats cat_ds; cat_ds.dimension = "cat"; cat_ds.value_type = "string"; cat_ds.observe("POSIX"); cat_ds.observe("MPI"); - idx_db.insert_chunk_dimension_stats(fid, 2, cat_ds); + writer->insert_chunk_dimension_stats(fid, 2, cat_ds); ChunkDimensionStats name_ds; name_ds.dimension = "name"; name_ds.value_type = "string"; name_ds.observe("read"); name_ds.observe("send"); - idx_db.insert_chunk_dimension_stats(fid, 2, name_ds); + writer->insert_chunk_dimension_stats(fid, 2, name_ds); ChunkDimensionStats dur_ds; dur_ds.dimension = "dur"; dur_ds.value_type = "uint"; dur_ds.observe("50"); dur_ds.observe("1000"); - idx_db.insert_chunk_dimension_stats(fid, 2, dur_ds); + writer->insert_chunk_dimension_stats(fid, 2, dur_ds); } - idx_db.commit_transaction(); + writer->commit(); } static ChunkPrunerOutput run_pruner(const std::string& index_path, diff --git a/tests/utilities/composites/dft/indexing/test_manifest_index_builder.cpp b/tests/utilities/composites/dft/indexing/test_manifest_index_builder.cpp index 3761d6a2..02d17a08 100644 --- a/tests/utilities/composites/dft/indexing/test_manifest_index_builder.cpp +++ 
b/tests/utilities/composites/dft/indexing/test_manifest_index_builder.cpp @@ -86,8 +86,7 @@ TEST_SUITE("ManifestIndexBuilder") { auto config = IndexBuildConfig::for_file(trace_file) .with_index_dir(test_dir) - .with_manifest(true) - .with_index_threshold(0); + .with_manifest(true); auto result = run_index_build(config); @@ -97,8 +96,7 @@ TEST_SUITE("ManifestIndexBuilder") { CHECK(fs::exists(result.index_path)); IndexDatabase idx_db(result.index_path); - idx_db.init_base_schema(); - idx_db.init_manifest_schema(); + idx_db.init_schema(); int fid = idx_db.get_file_info_id(get_logical_path(trace_file)); REQUIRE(fid >= 0); @@ -136,7 +134,6 @@ TEST_SUITE("ManifestIndexBuilder") { auto config = IndexBuildConfig::for_file(trace_file) .with_index_dir(test_dir) .with_manifest(true) - .with_index_threshold(0) .with_force_rebuild(false); auto result = run_index_build(config); @@ -149,7 +146,6 @@ TEST_SUITE("ManifestIndexBuilder") { auto config = IndexBuildConfig::for_file(trace_file) .with_index_dir(test_dir) .with_manifest(true) - .with_index_threshold(0) .with_force_rebuild(false); auto result = run_index_build(config); diff --git a/tests/utilities/composites/dft/indexing/test_manifest_queries.cpp b/tests/utilities/composites/dft/indexing/test_manifest_queries.cpp index b7450e18..d50c5a8b 100644 --- a/tests/utilities/composites/dft/indexing/test_manifest_queries.cpp +++ b/tests/utilities/composites/dft/indexing/test_manifest_queries.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -33,19 +34,19 @@ TEST_SUITE("ManifestQueries") { std::string index_path = test_dir + "/test.pfw.gz.idx"; IndexDatabase idx_db(index_path); - idx_db.init_base_schema(); - idx_db.init_manifest_schema(); - int fid = - idx_db.get_or_create_file_info(get_logical_path("test.pfw.gz"), 0); - - idx_db.begin_transaction(); - - idx_db.insert_event_range(fid, 0, "POSIX", "read", {0, 2, 5}); - idx_db.insert_event_range(fid, 0, "POSIX", "write", {1}); - idx_db.insert_event_range(fid, 
0, "APP", "compute", {3, 4}); - idx_db.insert_event_range(fid, 1, "POSIX", "read", {0, 1}); - - idx_db.commit_transaction(); + int fid; + { + auto writer = idx_db.begin_write(); + writer->init_schema(); + fid = writer->get_or_create_file_info( + get_logical_path("test.pfw.gz"), 0); + + writer->insert_event_range(fid, 0, "POSIX", "read", {0, 2, 5}); + writer->insert_event_range(fid, 0, "POSIX", "write", {1}); + writer->insert_event_range(fid, 0, "APP", "compute", {3, 4}); + writer->insert_event_range(fid, 1, "POSIX", "read", {0, 1}); + writer->commit(); + } auto all = idx_db.query_event_ranges(fid); CHECK(all.size() == 4); @@ -71,18 +72,18 @@ TEST_SUITE("ManifestQueries") { std::string index_path = test_dir + "/test.pfw.gz.idx"; IndexDatabase idx_db(index_path); - idx_db.init_base_schema(); - idx_db.init_manifest_schema(); - int fid = - idx_db.get_or_create_file_info(get_logical_path("test.pfw.gz"), 0); - - idx_db.begin_transaction(); - - idx_db.insert_metadata_lines(fid, 0, "HH", {0, 3}); - idx_db.insert_metadata_lines(fid, 0, "FH", {1}); - idx_db.insert_metadata_lines(fid, 1, "HH", {0}); - - idx_db.commit_transaction(); + int fid; + { + auto writer = idx_db.begin_write(); + writer->init_schema(); + fid = writer->get_or_create_file_info( + get_logical_path("test.pfw.gz"), 0); + + writer->insert_metadata_lines(fid, 0, "HH", {0, 3}); + writer->insert_metadata_lines(fid, 0, "FH", {1}); + writer->insert_metadata_lines(fid, 1, "HH", {0}); + writer->commit(); + } auto all = idx_db.query_metadata_lines(fid); CHECK(all.size() == 3); @@ -104,23 +105,33 @@ TEST_SUITE("ManifestQueries") { std::string index_path = test_dir + "/test.pfw.gz.idx"; IndexDatabase idx_db(index_path); - idx_db.init_base_schema(); - idx_db.init_manifest_schema(); - int fid = - idx_db.get_or_create_file_info(get_logical_path("test.pfw.gz"), 0); - - idx_db.begin_transaction(); - idx_db.insert_event_range(fid, 0, "POSIX", "read", {0, 1}); - idx_db.insert_metadata_lines(fid, 0, "HH", {2}); - 
idx_db.commit_transaction(); + int fid; + { + auto writer = idx_db.begin_write(); + writer->init_schema(); + fid = writer->get_or_create_file_info( + get_logical_path("test.pfw.gz"), 0); + + writer->insert_event_range(fid, 0, "POSIX", "read", {0, 1}); + writer->insert_metadata_lines(fid, 0, "HH", {2}); + writer->commit(); + } CHECK(idx_db.query_event_ranges(fid).size() == 1); CHECK(idx_db.query_metadata_lines(fid).size() == 1); - idx_db.delete_event_ranges(fid); + { + auto writer = idx_db.begin_write(); + writer->delete_event_ranges(fid); + writer->commit(); + } CHECK(idx_db.query_event_ranges(fid).empty()); - idx_db.delete_metadata_lines(fid); + { + auto writer = idx_db.begin_write(); + writer->delete_metadata_lines(fid); + writer->commit(); + } CHECK(idx_db.query_metadata_lines(fid).empty()); fs::remove_all(test_dir); diff --git a/tests/utilities/composites/dft/reorganize/test_reconstruct_integration.cpp b/tests/utilities/composites/dft/reorganize/test_reconstruct_integration.cpp index f4efdc68..f0a7c48a 100644 --- a/tests/utilities/composites/dft/reorganize/test_reconstruct_integration.cpp +++ b/tests/utilities/composites/dft/reorganize/test_reconstruct_integration.cpp @@ -39,6 +39,27 @@ using dftracer::utils::utilities::indexer::IndexBuilderUtility; using dftracer::utils::utilities::indexer::ProvenanceDatabase; namespace tags = dftracer::utils::utilities::tags; +static ExtractionPlan run_planner(const ReorganizationPlannerInput& input) { + Runtime rt(4); + ExtractionPlan result; + auto* result_ptr = &result; + + auto task = run_coro_scope( + rt.executor(), + [input, result_ptr](CoroScope& scope) -> coro::CoroTask { + auto planner = std::make_shared(); + UtilityExecutor + exec(planner, BehaviorChain{}); + *result_ptr = co_await exec.execute_with_context(scope, input); + }); + + rt.submit(std::move(task), "run_planner").wait(); + rt.shutdown(); + return result; +} + // Test trace layout: // Line 0: HH metadata // Line 1: FH metadata @@ -103,8 +124,7 @@ static 
void build_idx(const std::string& trace_file, indexer::IndexBuildResult>{}); auto config = IndexBuildConfig::for_file(trace_file) .with_index_dir(index_dir) - .with_manifest(true) - .with_index_threshold(0); + .with_manifest(true); *result_ptr = co_await exec.execute_with_context(scope, config); }); @@ -260,8 +280,6 @@ static void write_group_provenance( int fid = pdb.get_or_create_file_info(gz_path, 0); REQUIRE(fid >= 0); - pdb.begin_transaction(); - pdb.insert_info(fid, "version", "1.0"); pdb.insert_info(fid, "tool", "dftracer_organize"); pdb.insert_group(fid, g.name, g.query); @@ -285,14 +303,13 @@ static void write_group_provenance( for (const auto& [src_idx, ckpts] : segment_events) { for (const auto& [ckpt, count] : ckpts) { pdb.insert_segment(fid, static_cast(src_idx), - static_cast(ckpt), output_line, + static_cast(ckpt), /*seq=*/0, + output_line, output_line + static_cast(count), static_cast(count)); output_line += static_cast(count); } } - - pdb.commit_transaction(); } } @@ -312,14 +329,13 @@ TEST_SUITE("ReconstructIntegration") { build_idx(trace_file, input_dir); // Step 2: Plan reorganization - ReorganizationPlannerUtility planner; ReorganizationPlannerInput planner_input; planner_input.source_files = {trace_file}; planner_input.groups = {{"io", R"(cat == "POSIX")"}, {"compute", R"(cat == "APP")"}}; planner_input.index_dir = input_dir; - auto plan = planner.process(planner_input).get(); + auto plan = run_planner(planner_input); REQUIRE(plan.tasks.size() > 0); // Step 3: Execute extraction @@ -518,14 +534,13 @@ TEST_SUITE("ReconstructIntegration") { std::string trace_file = create_test_trace(input_dir); build_idx(trace_file, input_dir); - ReorganizationPlannerUtility planner; ReorganizationPlannerInput planner_input; planner_input.source_files = {trace_file}; planner_input.groups = {{"io", R"(cat == "POSIX")"}, {"compute", R"(cat == "APP")"}}; planner_input.index_dir = input_dir; - auto plan = planner.process(planner_input).get(); + auto plan = 
run_planner(planner_input); REQUIRE(plan.tasks.size() > 0); std::map group_files; diff --git a/tests/utilities/composites/dft/reorganize/test_reconstruction_planner.cpp b/tests/utilities/composites/dft/reorganize/test_reconstruction_planner.cpp index d0470119..72ecb760 100644 --- a/tests/utilities/composites/dft/reorganize/test_reconstruction_planner.cpp +++ b/tests/utilities/composites/dft/reorganize/test_reconstruction_planner.cpp @@ -50,8 +50,6 @@ TEST_SUITE("ReconstructionPlanner") { pdb.init_schema(); int fid = pdb.get_or_create_file_info(reorg_file, 0); - pdb.begin_transaction(); - // Provenance info pdb.insert_info(fid, "version", "1.0"); pdb.insert_info(fid, "tool", "dftracer_organize"); @@ -63,11 +61,9 @@ TEST_SUITE("ReconstructionPlanner") { pdb.insert_source(fid, 0, "/original/trace.pfw.gz", 3, "abc123"); // Provenance segments (3 checkpoints) - pdb.insert_segment(fid, 0, 0, 0, 100, 100); - pdb.insert_segment(fid, 0, 1, 100, 250, 150); - pdb.insert_segment(fid, 0, 2, 250, 400, 150); - - pdb.commit_transaction(); + pdb.insert_segment(fid, 0, 0, 0, 0, 100, 100); + pdb.insert_segment(fid, 0, 1, 0, 100, 250, 150); + pdb.insert_segment(fid, 0, 2, 0, 250, 400, 150); } // Run planner @@ -134,17 +130,14 @@ TEST_SUITE("ReconstructionPlanner") { pdb.init_schema(); int fid = pdb.get_or_create_file_info(io_file, 0); - pdb.begin_transaction(); pdb.insert_info(fid, "version", "1.0"); pdb.insert_info(fid, "tool", "dftracer_organize"); pdb.insert_group(fid, "io", "cat=POSIX"); pdb.insert_source(fid, 0, "/original/trace.pfw.gz", 2, "hash1"); // Segments for checkpoints 0 and 1 - pdb.insert_segment(fid, 0, 0, 0, 50, 50); - pdb.insert_segment(fid, 0, 1, 50, 120, 70); - - pdb.commit_transaction(); + pdb.insert_segment(fid, 0, 0, 0, 0, 50, 50); + pdb.insert_segment(fid, 0, 1, 0, 50, 120, 70); } // Create .pidx for compute.pfw.gz @@ -155,17 +148,14 @@ TEST_SUITE("ReconstructionPlanner") { pdb.init_schema(); int fid = pdb.get_or_create_file_info(compute_file, 0); - 
pdb.begin_transaction(); pdb.insert_info(fid, "version", "1.0"); pdb.insert_info(fid, "tool", "dftracer_organize"); pdb.insert_group(fid, "compute", "cat=APP"); pdb.insert_source(fid, 0, "/original/trace.pfw.gz", 2, "hash1"); // Segments for checkpoints 0 and 1 - pdb.insert_segment(fid, 0, 0, 0, 30, 30); - pdb.insert_segment(fid, 0, 1, 30, 80, 50); - - pdb.commit_transaction(); + pdb.insert_segment(fid, 0, 0, 0, 0, 30, 30); + pdb.insert_segment(fid, 0, 1, 0, 30, 80, 50); } // Run planner with both files diff --git a/tests/utilities/composites/dft/reorganize/test_reorganization_planner.cpp b/tests/utilities/composites/dft/reorganize/test_reorganization_planner.cpp index d566ffa6..a06d1001 100644 --- a/tests/utilities/composites/dft/reorganize/test_reorganization_planner.cpp +++ b/tests/utilities/composites/dft/reorganize/test_reorganization_planner.cpp @@ -27,6 +27,27 @@ using dftracer::utils::utilities::indexer::IndexBuilderUtility; using dftracer::utils::utilities::indexer::ProvenanceDatabase; namespace tags = dftracer::utils::utilities::tags; +static ExtractionPlan run_planner(const ReorganizationPlannerInput& input) { + Runtime rt(4); + ExtractionPlan result; + auto* result_ptr = &result; + + auto task = run_coro_scope( + rt.executor(), + [input, result_ptr](CoroScope& scope) -> coro::CoroTask { + auto planner = std::make_shared(); + UtilityExecutor + exec(planner, BehaviorChain{}); + *result_ptr = co_await exec.execute_with_context(scope, input); + }); + + rt.submit(std::move(task), "run_planner").wait(); + rt.shutdown(); + return result; +} + // Create a test trace with known events: // Line 0: HH metadata // Line 1: FH metadata @@ -77,8 +98,7 @@ static void build_idx(const std::string& trace_file, indexer::IndexBuildResult>{}); auto config = IndexBuildConfig::for_file(trace_file) .with_index_dir(index_dir) - .with_manifest(true) - .with_index_threshold(0); + .with_manifest(true); *result_ptr = co_await exec.execute_with_context(scope, config); }); @@ -133,13 
+153,12 @@ TEST_SUITE("ReorganizationPlanner") { std::string trace_file = create_planner_test_trace(test_dir); build_idx(trace_file, test_dir); - ReorganizationPlannerUtility planner; ReorganizationPlannerInput input; input.source_files = {trace_file}; input.groups = {{"io", R"(cat == "POSIX")"}}; input.index_dir = test_dir; - auto plan = planner.process(input).get(); + auto plan = run_planner(input); // Should have 2 groups: "io" + auto-created "remainder" CHECK(plan.groups.size() == 2); @@ -197,14 +216,13 @@ TEST_SUITE("ReorganizationPlanner") { std::string trace_file = create_planner_test_trace(test_dir); build_idx(trace_file, test_dir); - ReorganizationPlannerUtility planner; ReorganizationPlannerInput input; input.source_files = {trace_file}; input.groups = {{"io", R"(cat == "POSIX")"}, {"compute", R"(cat == "APP")"}}; input.index_dir = test_dir; - auto plan = planner.process(input).get(); + auto plan = run_planner(input); CHECK(plan.groups.size() == 3); @@ -235,14 +253,13 @@ TEST_SUITE("ReorganizationPlanner") { std::string trace_file = create_planner_test_trace(test_dir); build_idx(trace_file, test_dir); - ReorganizationPlannerUtility planner; ReorganizationPlannerInput input; input.source_files = {trace_file}; input.groups = {{"io", R"(cat == "POSIX")"}, {"compute", R"(cat == "APP")"}}; input.index_dir = test_dir; - auto plan = planner.process(input).get(); + auto plan = run_planner(input); for (const auto& t : plan.tasks) { if (t.target_group == "remainder") { @@ -271,16 +288,12 @@ TEST_SUITE("ReorganizationPlanner") { pdb.init_schema(); int fid = pdb.get_or_create_file_info("test.pfw.gz", 0); - pdb.begin_transaction(); - pdb.insert_info(fid, "version", "1.0"); pdb.insert_info(fid, "created_at", "2026-02-17"); pdb.insert_source(fid, 0, "/data/trace.pfw.gz", 9, "abc123"); pdb.insert_group(fid, "io", R"(cat == "POSIX")"); - pdb.insert_segment(fid, 0, 0, 0, 100, 50); - pdb.insert_segment(fid, 0, 1, 100, 200, 45); - - pdb.commit_transaction(); + 
pdb.insert_segment(fid, 0, 0, 0, 0, 100, 50); + pdb.insert_segment(fid, 0, 1, 0, 100, 200, 45); CHECK(pdb.query_info(fid, "version") == "1.0"); CHECK(pdb.query_info(fid, "created_at") == "2026-02-17"); diff --git a/tests/utilities/composites/dft/reorganize/test_reorganize_integration.cpp b/tests/utilities/composites/dft/reorganize/test_reorganize_integration.cpp index 29c8e6ee..eb4d6a67 100644 --- a/tests/utilities/composites/dft/reorganize/test_reorganize_integration.cpp +++ b/tests/utilities/composites/dft/reorganize/test_reorganize_integration.cpp @@ -36,6 +36,27 @@ using dftracer::utils::utilities::indexer::IndexBuilderUtility; using dftracer::utils::utilities::indexer::ProvenanceDatabase; namespace tags = dftracer::utils::utilities::tags; +static ExtractionPlan run_planner(const ReorganizationPlannerInput& input) { + Runtime rt(4); + ExtractionPlan result; + auto* result_ptr = &result; + + auto task = run_coro_scope( + rt.executor(), + [input, result_ptr](CoroScope& scope) -> coro::CoroTask { + auto planner = std::make_shared(); + UtilityExecutor + exec(planner, BehaviorChain{}); + *result_ptr = co_await exec.execute_with_context(scope, input); + }); + + rt.submit(std::move(task), "run_planner").wait(); + rt.shutdown(); + return result; +} + // Test trace layout: // Line 0: HH metadata // Line 1: FH metadata @@ -100,8 +121,7 @@ static void build_idx_for_file(const std::string& trace_file, indexer::IndexBuildResult>{}); auto config = IndexBuildConfig::for_file(trace_file) .with_index_dir(index_dir) - .with_manifest(true) - .with_index_threshold(0); + .with_manifest(true); *result_ptr = co_await exec.execute_with_context(scope, config); }); @@ -231,14 +251,13 @@ TEST_SUITE("ReorganizeIntegration") { build_idx_for_file(trace_file, input_dir); // Step 2: Plan extraction - ReorganizationPlannerUtility planner; ReorganizationPlannerInput planner_input; planner_input.source_files = {trace_file}; planner_input.groups = {{"io", R"(cat == "POSIX")"}, {"compute", R"(cat 
== "APP")"}}; planner_input.index_dir = input_dir; - auto plan = planner.process(planner_input).get(); + auto plan = run_planner(planner_input); REQUIRE(plan.tasks.size() > 0); // Step 3: Execute extraction @@ -318,13 +337,12 @@ TEST_SUITE("ReorganizeIntegration") { build_idx_for_file(trace_file, input_dir); // Plan for io group only - ReorganizationPlannerUtility planner; ReorganizationPlannerInput planner_input; planner_input.source_files = {trace_file}; planner_input.groups = {{"io", R"(cat == "POSIX")"}}; planner_input.index_dir = input_dir; - auto plan = planner.process(planner_input).get(); + auto plan = run_planner(planner_input); // Extract io group std::string io_pfw = output_dir + "/io.pfw"; @@ -379,8 +397,7 @@ TEST_SUITE("ReorganizeIntegration") { indexer::IndexBuildResult>{}); auto config = IndexBuildConfig::for_file(io_gz) .with_index_dir(output_dir) - .with_manifest(true) - .with_index_threshold(0); + .with_manifest(true); *idx_result_ptr = co_await exec.execute_with_context(scope, config); }); @@ -404,13 +421,11 @@ TEST_SUITE("ReorganizeIntegration") { int fid = pdb.get_or_create_file_info(io_gz, 0); REQUIRE(fid >= 0); - pdb.begin_transaction(); pdb.insert_info(fid, "version", "1.0"); pdb.insert_info(fid, "tool", "dftracer_organize"); pdb.insert_group(fid, "io", R"(cat == "POSIX")"); pdb.insert_source(fid, 0, trace_file, 1, ""); - pdb.insert_segment(fid, 0, 0, 0, 5, 3); - pdb.commit_transaction(); + pdb.insert_segment(fid, 0, 0, 0, 0, 5, 3); } // Verify provenance diff --git a/tests/utilities/composites/dft/statistics/test_detailed_statistics.cpp b/tests/utilities/composites/dft/statistics/test_detailed_statistics.cpp index 7c3d0318..3995c5ee 100644 --- a/tests/utilities/composites/dft/statistics/test_detailed_statistics.cpp +++ b/tests/utilities/composites/dft/statistics/test_detailed_statistics.cpp @@ -1,7 +1,7 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include -#include +#include #include #include @@ -192,33 +192,36 @@ 
TEST_SUITE("DetailedStatistics") { std::string json = stats.to_json(); - yyjson_doc* doc = - yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG); - REQUIRE(doc != nullptr); - - yyjson_val* root = yyjson_doc_get_root(doc); - REQUIRE(yyjson_is_obj(root)); - - CHECK(yyjson_get_uint(yyjson_obj_get(root, "events_scanned")) == 2); - CHECK(yyjson_get_uint(yyjson_obj_get(root, "chunks_scanned")) == 1); - CHECK(yyjson_get_uint(yyjson_obj_get(root, "chunks_skipped")) == 3); - - yyjson_val* dur = yyjson_obj_get(root, "duration"); - REQUIRE(yyjson_is_obj(dur)); - CHECK(yyjson_get_uint(yyjson_obj_get(dur, "count")) == 2); - - yyjson_val* gd = yyjson_obj_get(root, "grouped_duration"); - REQUIRE(yyjson_is_obj(gd)); - yyjson_val* gd_read = yyjson_obj_get(gd, "read"); - REQUIRE(yyjson_is_obj(gd_read)); - CHECK(yyjson_get_uint(yyjson_obj_get(gd_read, "count")) == 1); - - yyjson_val* gio = yyjson_obj_get(root, "grouped_io"); - REQUIRE(yyjson_is_obj(gio)); - yyjson_val* gio_read = yyjson_obj_get(gio, "read"); - REQUIRE(yyjson_is_obj(gio_read)); - - yyjson_doc_free(doc); + simdjson::dom::parser parser; + auto result = parser.parse(json); + REQUIRE(!result.error()); + + auto root = result.value_unsafe(); + REQUIRE(root.is_object()); + + CHECK(root["events_scanned"].get_uint64().value() == 2); + CHECK(root["chunks_scanned"].get_uint64().value() == 1); + CHECK(root["chunks_skipped"].get_uint64().value() == 3); + + auto dur = root["duration"]; + REQUIRE(!dur.error()); + REQUIRE(dur.is_object()); + CHECK(dur["count"].get_uint64().value() == 2); + + auto gd = root["grouped_duration"]; + REQUIRE(!gd.error()); + REQUIRE(gd.is_object()); + auto gd_read = gd["read"]; + REQUIRE(!gd_read.error()); + REQUIRE(gd_read.is_object()); + CHECK(gd_read["count"].get_uint64().value() == 1); + + auto gio = root["grouped_io"]; + REQUIRE(!gio.error()); + REQUIRE(gio.is_object()); + auto gio_read = gio["read"]; + REQUIRE(!gio_read.error()); + REQUIRE(gio_read.is_object()); } TEST_CASE("to_json - no 
grouped when empty") { @@ -227,15 +230,13 @@ TEST_SUITE("DetailedStatistics") { std::string json = stats.to_json(); - yyjson_doc* doc = - yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG); - REQUIRE(doc != nullptr); + simdjson::dom::parser parser; + auto result = parser.parse(json); + REQUIRE(!result.error()); - yyjson_val* root = yyjson_doc_get_root(doc); - CHECK(yyjson_obj_get(root, "grouped_duration") == nullptr); - CHECK(yyjson_obj_get(root, "grouped_io") == nullptr); - - yyjson_doc_free(doc); + auto root = result.value_unsafe(); + CHECK(root["grouped_duration"].error()); + CHECK(root["grouped_io"].error()); } TEST_CASE("to_json - global duration always present") { @@ -243,15 +244,14 @@ TEST_SUITE("DetailedStatistics") { // Even with no events, duration section should be present std::string json = stats.to_json(); - yyjson_doc* doc = - yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG); - REQUIRE(doc != nullptr); - - yyjson_val* root = yyjson_doc_get_root(doc); - yyjson_val* dur = yyjson_obj_get(root, "duration"); - REQUIRE(yyjson_is_obj(dur)); - CHECK(yyjson_get_uint(yyjson_obj_get(dur, "count")) == 0); + simdjson::dom::parser parser; + auto result = parser.parse(json); + REQUIRE(!result.error()); - yyjson_doc_free(doc); + auto root = result.value_unsafe(); + auto dur = root["duration"]; + REQUIRE(!dur.error()); + REQUIRE(dur.is_object()); + CHECK(dur["count"].get_uint64().value() == 0); } } diff --git a/tests/utilities/composites/dft/statistics/test_statistics_aggregator.cpp b/tests/utilities/composites/dft/statistics/test_statistics_aggregator.cpp index 205c85fd..f4e9e1ae 100644 --- a/tests/utilities/composites/dft/statistics/test_statistics_aggregator.cpp +++ b/tests/utilities/composites/dft/statistics/test_statistics_aggregator.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -16,13 +17,14 @@ using namespace dftracer::utils::utilities::composites::dft::internal; using namespace 
dftracer::utils::utilities::composites::dft::indexing; using namespace dftracer::utils::utilities::composites::dft::statistics; using dftracer::utils::utilities::indexer::IndexDatabase; +using dftracer::utils::utilities::indexer::IndexDatabaseWriterContext; using dftracer::utils::utilities::indexer::internal::get_logical_path; static void write_chunk( - IndexDatabase& db, int fid, std::uint64_t checkpoint_idx, + IndexDatabaseWriterContext& writer, int fid, std::uint64_t checkpoint_idx, ChunkStatistics& stats, const std::vector>& dim_values) { - db.insert_chunk_statistics(fid, checkpoint_idx, stats); + writer.insert_chunk_statistics(fid, checkpoint_idx, stats); std::unordered_map dim_stats; for (const auto& [dim, val] : dim_values) { @@ -32,27 +34,24 @@ static void write_chunk( ds.observe(val); } for (const auto& [dim, ds] : dim_stats) { - db.insert_chunk_dimension_stats(fid, checkpoint_idx, ds); + writer.insert_chunk_dimension_stats(fid, checkpoint_idx, ds); } } static void populate_test_db(const std::string& db_root, const std::string& file_path) { IndexDatabase idx_db(db_root); - idx_db.init_base_schema(); - idx_db.init_bloom_schema(); + auto writer = idx_db.begin_write(); + writer->init_schema(); int fid = - idx_db.get_or_create_file_info(get_logical_path(file_path), 12345); + writer->get_or_create_file_info(get_logical_path(file_path), 12345); - idx_db.begin_transaction(); - - // Chunk 0: 2 events { ChunkStatistics stats; stats.update_from_event("read", "POSIX", 1, 1, 1000, 100); stats.update_from_event("write", "POSIX", 1, 2, 2000, 200); - write_chunk(idx_db, fid, 0, stats, + write_chunk(*writer, fid, 0, stats, {{"cat", "POSIX"}, {"cat", "POSIX"}, {"name", "read"}, @@ -61,20 +60,18 @@ static void populate_test_db(const std::string& db_root, {"pid_tid", "1:2"}}); } - // Chunk 1: 1 event { ChunkStatistics stats; stats.update_from_event("open", "storage", 2, 1, 5000, 50); - write_chunk(idx_db, fid, 1, stats, + write_chunk(*writer, fid, 1, stats, {{"cat", 
"storage"}, {"name", "open"}, {"pid_tid", "2:1"}}); } - // Chunk 2: 2 events { ChunkStatistics stats; stats.update_from_event("read", "POSIX", 1, 1, 8000, 300); stats.update_from_event("stat", "POSIX", 3, 1, 9000, 10); - write_chunk(idx_db, fid, 2, stats, + write_chunk(*writer, fid, 2, stats, {{"cat", "POSIX"}, {"cat", "POSIX"}, {"name", "read"}, @@ -83,7 +80,7 @@ static void populate_test_db(const std::string& db_root, {"pid_tid", "3:1"}}); } - idx_db.commit_transaction(); + writer->commit(); } TEST_SUITE("StatisticsAggregatorUtility") { @@ -179,11 +176,13 @@ TEST_SUITE("StatisticsAggregatorUtility") { determine_index_path(test_dir + "/test.pfw.gz", ""); std::string file_path = "/fake/test.pfw.gz"; - // Create idx with file_info but no chunk_statistics IndexDatabase idx_db(db_root); - idx_db.init_base_schema(); - idx_db.init_bloom_schema(); - idx_db.get_or_create_file_info(get_logical_path(file_path), 12345); + { + auto writer = idx_db.begin_write(); + writer->init_schema(); + writer->get_or_create_file_info(get_logical_path(file_path), 12345); + writer->commit(); + } StatisticsAggregatorUtility aggregator; StatisticsAggregatorInput input; @@ -211,32 +210,31 @@ TEST_SUITE("StatisticsAggregatorUtility") { std::string file_path = "/fake/test.pfw.gz"; IndexDatabase idx_db(db_root); - idx_db.init_base_schema(); - idx_db.init_bloom_schema(); - int fid = - idx_db.get_or_create_file_info(get_logical_path(file_path), 12345); - - idx_db.begin_transaction(); - - // Chunk 0: durations 10, 20 + int fid; { - ChunkStatistics stats; - stats.update_from_event("op", "cat", 1, 1, 1000, 10); - stats.update_from_event("op", "cat", 1, 1, 2000, 20); - idx_db.insert_chunk_statistics(fid, 0, stats); + auto writer = idx_db.begin_write(); + writer->init_schema(); + fid = writer->get_or_create_file_info(get_logical_path(file_path), + 12345); + + { + ChunkStatistics stats; + stats.update_from_event("op", "cat", 1, 1, 1000, 10); + stats.update_from_event("op", "cat", 1, 1, 2000, 20); + 
writer->insert_chunk_statistics(fid, 0, stats); + } + + { + ChunkStatistics stats; + stats.update_from_event("op", "cat", 1, 1, 3000, 30); + stats.update_from_event("op", "cat", 1, 1, 4000, 40); + stats.update_from_event("op", "cat", 1, 1, 5000, 50); + writer->insert_chunk_statistics(fid, 1, stats); + } + + writer->commit(); } - // Chunk 1: durations 30, 40, 50 - { - ChunkStatistics stats; - stats.update_from_event("op", "cat", 1, 1, 3000, 30); - stats.update_from_event("op", "cat", 1, 1, 4000, 40); - stats.update_from_event("op", "cat", 1, 1, 5000, 50); - idx_db.insert_chunk_statistics(fid, 1, stats); - } - - idx_db.commit_transaction(); - StatisticsAggregatorUtility aggregator; StatisticsAggregatorInput input; input.file_path = file_path; diff --git a/tests/utilities/composites/dft/statistics/test_statistics_query.cpp b/tests/utilities/composites/dft/statistics/test_statistics_query.cpp index 125fc6a2..18f3aa36 100644 --- a/tests/utilities/composites/dft/statistics/test_statistics_query.cpp +++ b/tests/utilities/composites/dft/statistics/test_statistics_query.cpp @@ -1,7 +1,7 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include -#include +#include #include @@ -189,18 +189,16 @@ TEST_SUITE("StatisticsQueryUtility") { auto output = query.process(input).get(); std::string json = output.to_json(); - yyjson_doc* doc = - yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG); - REQUIRE(doc != nullptr); + simdjson::dom::parser parser; + auto result = parser.parse(json); + REQUIRE(!result.error()); - yyjson_val* root = yyjson_doc_get_root(doc); - REQUIRE(yyjson_is_obj(root)); + auto root = result.value_unsafe(); + REQUIRE(root.is_object()); // query_type field should always be present - CHECK(yyjson_obj_get(root, "query_type") != nullptr); - CHECK(yyjson_obj_get(root, "total_events") != nullptr); - - yyjson_doc_free(doc); + CHECK(!root["query_type"].error()); + CHECK(!root["total_events"].error()); } } } diff --git 
a/tests/utilities/composites/dft/statistics/test_trace_statistics.cpp b/tests/utilities/composites/dft/statistics/test_trace_statistics.cpp index 6af11bcb..2f3f0424 100644 --- a/tests/utilities/composites/dft/statistics/test_trace_statistics.cpp +++ b/tests/utilities/composites/dft/statistics/test_trace_statistics.cpp @@ -1,7 +1,7 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include -#include +#include #include #include @@ -63,35 +63,36 @@ TEST_SUITE("TraceStatistics") { std::string json = ts.to_json(); // Parse and validate the JSON - yyjson_doc* doc = - yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG); - REQUIRE(doc != nullptr); + simdjson::dom::parser parser; + auto result = parser.parse(json); + REQUIRE(!result.error()); - yyjson_val* root = yyjson_doc_get_root(doc); - REQUIRE(yyjson_is_obj(root)); + auto root = result.value_unsafe(); + REQUIRE(root.is_object()); - CHECK(std::string(yyjson_get_str(yyjson_obj_get(root, "file_path"))) == + CHECK(std::string(root["file_path"].get_string().value()) == "/test/file.pfw.gz"); - CHECK(yyjson_get_bool(yyjson_obj_get(root, "success")) == true); - CHECK(yyjson_get_uint(yyjson_obj_get(root, "total_events")) == 2); - CHECK(yyjson_get_uint(yyjson_obj_get(root, "num_chunks")) == 2); - CHECK(yyjson_get_uint(yyjson_obj_get(root, "num_categories")) == 2); - CHECK(yyjson_get_uint(yyjson_obj_get(root, "num_unique_names")) == 2); + CHECK(root["success"].get_bool().value() == true); + CHECK(root["total_events"].get_uint64().value() == 2); + CHECK(root["num_chunks"].get_uint64().value() == 2); + CHECK(root["num_categories"].get_uint64().value() == 2); + CHECK(root["num_unique_names"].get_uint64().value() == 2); // Check time_range object exists - yyjson_val* time_range = yyjson_obj_get(root, "time_range"); - REQUIRE(yyjson_is_obj(time_range)); + auto time_range = root["time_range"]; + REQUIRE(!time_range.error()); + REQUIRE(time_range.is_object()); // Check duration object exists - yyjson_val* duration = 
yyjson_obj_get(root, "duration"); - REQUIRE(yyjson_is_obj(duration)); - CHECK(yyjson_get_uint(yyjson_obj_get(duration, "count")) == 2); + auto duration = root["duration"]; + REQUIRE(!duration.error()); + REQUIRE(duration.is_object()); + CHECK(duration["count"].get_uint64().value() == 2); // Check category_counts object exists - yyjson_val* cats = yyjson_obj_get(root, "category_counts"); - REQUIRE(yyjson_is_obj(cats)); - - yyjson_doc_free(doc); + auto cats = root["category_counts"]; + REQUIRE(!cats.error()); + REQUIRE(cats.is_object()); } TEST_CASE("TraceStatistics - to_json with error") { @@ -103,15 +104,13 @@ TEST_SUITE("TraceStatistics") { std::string json = ts.to_json(); - yyjson_doc* doc = - yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG); - REQUIRE(doc != nullptr); + simdjson::dom::parser parser; + auto result = parser.parse(json); + REQUIRE(!result.error()); - yyjson_val* root = yyjson_doc_get_root(doc); - CHECK(yyjson_get_bool(yyjson_obj_get(root, "success")) == false); - CHECK(std::string(yyjson_get_str(yyjson_obj_get(root, "error"))) == + auto root = result.value_unsafe(); + CHECK(root["success"].get_bool().value() == false); + CHECK(std::string(root["error"].get_string().value()) == "File not found"); - - yyjson_doc_free(doc); } } diff --git a/tests/utilities/composites/dft/test_index_builder.cpp b/tests/utilities/composites/dft/test_index_builder.cpp index 19ae5f69..a927eb23 100644 --- a/tests/utilities/composites/dft/test_index_builder.cpp +++ b/tests/utilities/composites/dft/test_index_builder.cpp @@ -52,8 +52,7 @@ TEST_SUITE("IndexBuilder") { auto input = IndexBuildConfig::for_file(gz_file) .with_index_dir("") - .with_checkpoint_size(10) - .with_index_threshold(0); + .with_checkpoint_size(10); auto output = run_builder(input); @@ -68,9 +67,8 @@ TEST_SUITE("IndexBuilder") { SUBCASE("Use existing index without force rebuild") { std::string gz_file = env.create_dft_test_gzip_file(20); - auto input1 = IndexBuildConfig::for_file(gz_file) - 
.with_index_dir("") - .with_index_threshold(0); + auto input1 = + IndexBuildConfig::for_file(gz_file).with_index_dir(""); auto output1 = run_builder(input1); CHECK(output1.success == true); @@ -89,8 +87,7 @@ TEST_SUITE("IndexBuilder") { auto input = IndexBuildConfig::for_file(gz_file) .with_index_dir("") - .with_force_rebuild(true) - .with_index_threshold(0); + .with_force_rebuild(true); auto output1 = run_builder(input); CHECK(output1.success == true); diff --git a/tests/utilities/composites/dft/views/test_view_builder.cpp b/tests/utilities/composites/dft/views/test_view_builder.cpp index ca2c86f3..92e9b814 100644 --- a/tests/utilities/composites/dft/views/test_view_builder.cpp +++ b/tests/utilities/composites/dft/views/test_view_builder.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -26,13 +27,11 @@ using dftracer::utils::utilities::indexer::internal::get_logical_path; static void populate_test_idx(const std::string& index_path, const std::string& file_path) { IndexDatabase idx_db(index_path); - idx_db.init_base_schema(); - idx_db.init_bloom_schema(); + auto writer = idx_db.begin_write(); + writer->init_schema(); int fid = - idx_db.get_or_create_file_info(get_logical_path(file_path), 40000); - - idx_db.begin_transaction(); + writer->get_or_create_file_info(get_logical_path(file_path), 40000); struct ChunkDims { std::vector names; @@ -46,7 +45,6 @@ static void populate_test_idx(const std::string& index_path, {{"forward"}, {"compute", "ai_framework"}}, }; - // File-level blooms (union of all chunks) BloomFilter file_name_bloom(100, 0.01); BloomFilter file_cat_bloom(100, 0.01); @@ -57,7 +55,7 @@ static void populate_test_idx(const std::string& index_path, file_name_bloom.add(n); } auto name_blob = name_bloom.serialize(); - idx_db.insert_chunk_bloom_filter( + writer->insert_chunk_bloom_filter( fid, static_cast(ckpt), "name", name_blob.data(), static_cast(name_blob.size()), name_bloom.num_entries()); @@ -67,26 +65,24 @@ static void 
populate_test_idx(const std::string& index_path, file_cat_bloom.add(c); } auto cat_blob = cat_bloom.serialize(); - idx_db.insert_chunk_bloom_filter( + writer->insert_chunk_bloom_filter( fid, static_cast(ckpt), "cat", cat_blob.data(), static_cast(cat_blob.size()), cat_bloom.num_entries()); } - // File-level bloom filters auto name_blob = file_name_bloom.serialize(); - idx_db.insert_file_bloom_filter(fid, "name", name_blob.data(), - static_cast(name_blob.size()), - file_name_bloom.num_entries()); + writer->insert_file_bloom_filter(fid, "name", name_blob.data(), + static_cast(name_blob.size()), + file_name_bloom.num_entries()); auto cat_blob = file_cat_bloom.serialize(); - idx_db.insert_file_bloom_filter(fid, "cat", cat_blob.data(), - static_cast(cat_blob.size()), - file_cat_bloom.num_entries()); - - idx_db.insert_index_dimension(fid, "name"); - idx_db.insert_index_dimension(fid, "cat"); + writer->insert_file_bloom_filter(fid, "cat", cat_blob.data(), + static_cast(cat_blob.size()), + file_cat_bloom.num_entries()); - idx_db.commit_transaction(); + writer->insert_index_dimension(fid, "name"); + writer->insert_index_dimension(fid, "cat"); + writer->commit(); } TEST_SUITE("ViewBuilderUtility") { @@ -301,27 +297,27 @@ TEST_SUITE("ViewBuilderUtility") { std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - // Create idx with fhash dimension IndexDatabase idx_db(index_path); - idx_db.init_base_schema(); - idx_db.init_bloom_schema(); - int fid = - idx_db.get_or_create_file_info(get_logical_path(file_path), 10000); - idx_db.begin_transaction(); - - BloomFilter fhash_bloom(100, 0.01); - fhash_bloom.add("hash123"); - auto blob = fhash_bloom.serialize(); - - idx_db.insert_file_bloom_filter(fid, "fhash", blob.data(), - static_cast(blob.size()), - fhash_bloom.num_entries()); - idx_db.insert_chunk_bloom_filter(fid, 0, "fhash", blob.data(), - static_cast(blob.size()), - fhash_bloom.num_entries()); - idx_db.insert_index_dimension(fid, 
"fhash"); - idx_db.insert_hash_resolution(fid, "fhash", "hash123", "/data/file.h5"); - idx_db.commit_transaction(); + { + auto writer = idx_db.begin_write(); + writer->init_schema(); + int fid = writer->get_or_create_file_info( + get_logical_path(file_path), 10000); + + BloomFilter fhash_bloom(100, 0.01); + fhash_bloom.add("hash123"); + auto blob = fhash_bloom.serialize(); + + writer->insert_file_bloom_filter(fid, "fhash", blob.data(), + static_cast(blob.size()), + fhash_bloom.num_entries()); + writer->insert_chunk_bloom_filter(fid, 0, "fhash", blob.data(), + static_cast(blob.size()), + fhash_bloom.num_entries()); + writer->insert_index_dimension(fid, "fhash"); + writer->insert_hash_table_entry(0, "hash123", "/data/file.h5"); + writer->commit(); + } // Use "file" alias which should resolve to "fhash" ViewDefinition view; diff --git a/tests/utilities/composites/test_file_merger.cpp b/tests/utilities/composites/test_file_merger.cpp index aa2b46a2..28e5f869 100644 --- a/tests/utilities/composites/test_file_merger.cpp +++ b/tests/utilities/composites/test_file_merger.cpp @@ -2,9 +2,9 @@ #include #include #include +#include #include #include -#include #include #include @@ -156,13 +156,13 @@ TEST_SUITE("FileMerger") { } // Parse each JSON line - yyjson_doc *doc = yyjson_read(line.c_str(), line.size(), 0); - if (doc != nullptr) { - yyjson_val *root = yyjson_doc_get_root(doc); - if (yyjson_is_obj(root)) { + simdjson::dom::parser parser; + auto result = parser.parse(line); + if (!result.error()) { + auto root = result.value_unsafe(); + if (root.is_object()) { event_count++; } - yyjson_doc_free(doc); } } ifs.close(); diff --git a/tests/utilities/fileio/parallel/test_layout_sizing.cpp b/tests/utilities/fileio/parallel/test_layout_sizing.cpp new file mode 100644 index 00000000..5d96fd73 --- /dev/null +++ b/tests/utilities/fileio/parallel/test_layout_sizing.cpp @@ -0,0 +1,93 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include + +using 
dftracer::utils::utilities::fileio::parallel::compute_writer_sizing; +using dftracer::utils::utilities::fileio::parallel::FileLayout; +using dftracer::utils::utilities::fileio::parallel::FilesystemKind; +using dftracer::utils::utilities::fileio::parallel::LayoutInfo; + +namespace { +constexpr std::size_t MB = 1024 * 1024; +} + +TEST_CASE("compute_writer_sizing - local FS uses defaults") { + LayoutInfo info{FileLayout::STRIPED, FilesystemKind::LOCAL, 0, 0}; + auto s = compute_writer_sizing(info, 8, 12 * MB, 4 * MB); + CHECK(s.num_workers == 8); + CHECK(s.flush_threshold == 12 * MB); + CHECK(s.buffer_capacity == 16 * MB); +} + +TEST_CASE("compute_writer_sizing - NFS keeps defaults (no stripe info)") { + LayoutInfo info{FileLayout::SHARDED, FilesystemKind::NFS, 0, 0}; + auto s = compute_writer_sizing(info, 8, 12 * MB, 4 * MB); + CHECK(s.num_workers == 8); + CHECK(s.flush_threshold == 12 * MB); + CHECK(s.buffer_capacity == 16 * MB); +} + +TEST_CASE("compute_writer_sizing - Lustre caps workers at stripe_count") { + LayoutInfo info{FileLayout::STRIPED, FilesystemKind::LUSTRE, 1 * MB, 4}; + auto s = compute_writer_sizing(info, 8, 12 * MB, 4 * MB); + CHECK(s.num_workers == 4); + CHECK(s.flush_threshold == 12 * MB); // stripe 1MB < default 12MB + CHECK(s.buffer_capacity == 16 * MB); +} + +TEST_CASE("compute_writer_sizing - Lustre grows flush to stripe_size") { + LayoutInfo info{FileLayout::STRIPED, FilesystemKind::LUSTRE, 32 * MB, 2}; + auto s = compute_writer_sizing(info, 8, 12 * MB, 4 * MB); + CHECK(s.num_workers == 2); + CHECK(s.flush_threshold == 32 * MB); + CHECK(s.buffer_capacity == 36 * MB); +} + +TEST_CASE("compute_writer_sizing - baseline smaller than stripe_count wins") { + LayoutInfo info{FileLayout::STRIPED, FilesystemKind::LUSTRE, 4 * MB, 16}; + auto s = compute_writer_sizing(info, 4, 12 * MB, 4 * MB); + CHECK(s.num_workers == 4); +} + +TEST_CASE("compute_writer_sizing - zero baseline coerced to one worker") { + LayoutInfo info{FileLayout::STRIPED, 
FilesystemKind::LOCAL, 0, 0}; + auto s = compute_writer_sizing(info, 0, 12 * MB, 4 * MB); + CHECK(s.num_workers == 1); +} + +TEST_CASE( + "compute_writer_sizing - GPFS treated like Lustre when stripe given") { + LayoutInfo info{FileLayout::STRIPED, FilesystemKind::GPFS, 8 * MB, 3}; + auto s = compute_writer_sizing(info, 8, 12 * MB, 4 * MB); + CHECK(s.num_workers == 3); + CHECK(s.flush_threshold == 12 * MB); +} + +TEST_CASE("compute_writer_sizing - padded layout clamps flush to stripe") { + LayoutInfo info{FileLayout::STRIPED, FilesystemKind::LUSTRE, 4 * MB, 4}; + auto s = compute_writer_sizing(info, 8, 12 * MB, 4 * MB, + /*padded_layout=*/true); + // Padded layout does not cap workers by stripe_count; the packer + // serializes stripe assembly, so extra compression workers are useful. + CHECK(s.num_workers == 8); + CHECK(s.flush_threshold == 4 * MB); // clamped to stripe, not default + CHECK(s.buffer_capacity == 8 * MB); // flush + headroom +} + +TEST_CASE( + "compute_writer_sizing - padded keeps baseline regardless of stripe") { + LayoutInfo info{FileLayout::STRIPED, FilesystemKind::LUSTRE, 32 * MB, 8}; + auto s = compute_writer_sizing(info, 16, 12 * MB, 4 * MB, + /*padded_layout=*/true); + CHECK(s.num_workers == 16); // not capped by stripe_count + CHECK(s.flush_threshold == 32 * MB); + CHECK(s.buffer_capacity == 36 * MB); +} + +TEST_CASE("compute_writer_sizing - padded_layout without stripe is a no-op") { + LayoutInfo info{FileLayout::STRIPED, FilesystemKind::LOCAL, 0, 0}; + auto s = compute_writer_sizing(info, 4, 12 * MB, 4 * MB, + /*padded_layout=*/true); + // No stripe known, fall back to default flush. 
+ CHECK(s.flush_threshold == 12 * MB); +} diff --git a/tests/utilities/fileio/parallel/test_padded_striped_writer.cpp b/tests/utilities/fileio/parallel/test_padded_striped_writer.cpp new file mode 100644 index 00000000..4639b408 --- /dev/null +++ b/tests/utilities/fileio/parallel/test_padded_striped_writer.cpp @@ -0,0 +1,219 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace dftracer::utils; +using namespace dftracer::utils::coro; +using namespace dftracer::utils::utilities::fileio::parallel; + +namespace { + +constexpr std::size_t STRIPE = 1 * 1024 * 1024; // 1 MB stripe for the test + +std::string tmp_path(const char* name) { + return (fs::temp_directory_path() / name).string(); +} + +// Compress `data` into a standalone gzip member with zlib. +std::vector gzip_member(const std::string& data) { + uLongf bound = compressBound(data.size()) + 64; + std::vector out(bound); + z_stream s{}; + REQUIRE(deflateInit2(&s, Z_BEST_SPEED, Z_DEFLATED, 15 | 16, 8, + Z_DEFAULT_STRATEGY) == Z_OK); + s.next_in = reinterpret_cast(const_cast(data.data())); + s.avail_in = static_cast(data.size()); + s.next_out = out.data(); + s.avail_out = static_cast(out.size()); + REQUIRE(deflate(&s, Z_FINISH) == Z_STREAM_END); + out.resize(s.total_out); + deflateEnd(&s); + return out; +} + +ByteView as_bv(const std::vector& v) { + return ByteView(reinterpret_cast(v.data()), v.size()); +} + +// Decompress the whole file via gzread (spans concatenated members natively). 
+std::string gunzip_all(const std::string& path) { + gzFile g = gzopen(path.c_str(), "rb"); + REQUIRE(g != nullptr); + std::string out; + char buf[4096]; + int n; + while ((n = gzread(g, buf, sizeof(buf))) > 0) { + out.append(buf, n); + } + gzclose(g); + return out; +} + +} // namespace + +TEST_CASE("PaddedStripedWriter - gzip -t passes and payload round-trips") { + std::string path = tmp_path("test_padded_basic.json.gz"); + std::remove(path.c_str()); + + std::string hdr_payload = "HDR\n"; + std::string w0_payload = R"({"ev":0})" + "\n"; + std::string w1_payload = R"({"ev":1})" + "\n"; + std::string w2_payload = R"({"ev":2})" + "\n"; + std::string ftr_payload = "]\n"; + + auto hdr_mem = gzip_member(hdr_payload); + auto w0_mem = gzip_member(w0_payload); + auto w1_mem = gzip_member(w1_payload); + auto w2_mem = gzip_member(w2_payload); + auto ftr_mem = gzip_member(ftr_payload); + + Runtime runtime(2); + int result = -1; + runtime + .scope("padded_basic", + [&](CoroScope& s) -> CoroTask { + auto w = make_padded_striped_writer(STRIPE); + if (co_await w->open(path, 3, true, &s) != 0) { + result = 1; + co_return; + } + if (co_await w->write_header(as_bv(hdr_mem)) != 0) { + result = 2; + co_return; + } + if (co_await w->write_chunk(0, as_bv(w0_mem)) != 0) { + result = 3; + co_return; + } + if (co_await w->write_chunk(1, as_bv(w1_mem)) != 0) { + result = 4; + co_return; + } + if (co_await w->write_chunk(2, as_bv(w2_mem)) != 0) { + result = 5; + co_return; + } + if (co_await w->write_footer(as_bv(ftr_mem)) != 0) { + result = 6; + co_return; + } + if (co_await w->close() != 0) { + result = 7; + co_return; + } + result = 0; + }) + .get(); + CHECK(result == 0); + + // With coalescing, the three tiny worker members pack into a single + // stripe. File = [header stripe][one coalesced stripe][footer bytes]. 
+ CHECK(fs::file_size(path) == 2 * STRIPE + ftr_mem.size()); + + auto decompressed = gunzip_all(path); + // Decompressed content = hdr + w0 + w1 + w2 + footer, with worker ordering + // determined by the atomic stripe allocation (sequential here since we + // serialized calls). + CHECK(decompressed == + hdr_payload + w0_payload + w1_payload + w2_payload + ftr_payload); + + fs::remove(path); +} + +TEST_CASE("PaddedStripedWriter - oversize chunk is rejected") { + std::string path = tmp_path("test_padded_oversize.gz"); + std::remove(path.c_str()); + + // Create a "payload" larger than one stripe (minus overhead). + std::vector huge(STRIPE, 0xAA); + + Runtime runtime(2); + int result = -1; + runtime + .scope("padded_oversize", + [&](CoroScope& s) -> CoroTask { + auto w = make_padded_striped_writer(STRIPE); + if (co_await w->open(path, 1, true, &s) != 0) { + result = 1; + co_return; + } + auto rc = co_await w->write_chunk(0, as_bv(huge)); + co_await w->close(); + result = rc == 0 ? 99 : 0; // expect failure + }) + .get(); + CHECK(result == 0); + std::remove(path.c_str()); +} + +TEST_CASE( + "PaddedStripedWriter - file-level gzip integrity (decompress twice)") { + std::string path = tmp_path("test_padded_integrity.json.gz"); + std::remove(path.c_str()); + + // Bigger payload so the test meaningfully exercises padding members. 
+ std::string big; + big.reserve(200 * 1024); + for (int i = 0; i < 2000; ++i) { + char line[128]; + std::snprintf(line, sizeof(line), + R"({"id":%d,"name":"evt_%d"} )" + "\n", + i, i); + big += line; + } + auto mem = gzip_member(big); + REQUIRE(mem.size() + 25 < STRIPE); // fits in one stripe with padding + + Runtime runtime(2); + int result = -1; + runtime + .scope("padded_integrity", + [&](CoroScope& s) -> CoroTask { + auto w = make_padded_striped_writer(STRIPE); + if (co_await w->open(path, 1, true, &s) != 0) { + result = 1; + co_return; + } + // Two chunks of the same payload; packer coalesces both + // into a single stripe since they fit together. + if (co_await w->write_chunk(0, as_bv(mem)) != 0) { + result = 2; + co_return; + } + if (co_await w->write_chunk(0, as_bv(mem)) != 0) { + result = 3; + co_return; + } + if (co_await w->close() != 0) { + result = 4; + co_return; + } + result = 0; + }) + .get(); + CHECK(result == 0); + + // Two payloads totaling 2*mem.size() + 25 (pad overhead). Whether they + // fit in a single stripe or spill into two depends on mem size relative + // to STRIPE; we assert only that the file is a multiple of STRIPE. 
+ CHECK(fs::file_size(path) % STRIPE == 0); + auto round = gunzip_all(path); + CHECK(round == big + big); + + fs::remove(path); +} diff --git a/tests/utilities/fileio/parallel/test_sharded_writer.cpp b/tests/utilities/fileio/parallel/test_sharded_writer.cpp new file mode 100644 index 00000000..d7f656d5 --- /dev/null +++ b/tests/utilities/fileio/parallel/test_sharded_writer.cpp @@ -0,0 +1,109 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +using namespace dftracer::utils; +using namespace dftracer::utils::coro; +using namespace dftracer::utils::utilities::fileio::parallel; + +namespace { + +std::string tmp_path(const char* name) { + return (fs::temp_directory_path() / name).string(); +} + +std::string read_all(const std::string& path) { + std::ifstream f(path, std::ios::binary); + std::stringstream ss; + ss << f.rdbuf(); + return ss.str(); +} + +ByteView sv(const std::string& s) { return ByteView(s.data(), s.size()); } + +void cleanup(const std::vector& paths) { + for (const auto& p : paths) std::remove(p.c_str()); +} + +} // namespace + +TEST_CASE("ShardedWriter - one shard per worker with header/footer placement") { + std::string base = tmp_path("test_sharded_basic"); + std::vector expected = {base + ".shard_0", base + ".shard_1", + base + ".shard_2"}; + cleanup(expected); + + Runtime runtime(2); + std::vector paths; + auto task = [&]() -> CoroTask { + auto w = make_sharded_writer(); + if (co_await w->open(base, 3, false, nullptr) != 0) co_return 1; + if (co_await w->write_header(sv("HDR\n")) != 0) co_return 2; + if (co_await w->write_chunk(0, sv("A\n")) != 0) co_return 3; + if (co_await w->write_chunk(1, sv("B\n")) != 0) co_return 4; + if (co_await w->write_chunk(2, sv("C\n")) != 0) co_return 5; + if (co_await w->write_footer(sv("END\n")) != 0) co_return 6; + paths = w->output_paths(); + if (co_await w->close() != 0) co_return 7; + co_return 0; + }; + 
CHECK(runtime.submit(task(), "sharded_basic").get() == 0); + + REQUIRE(paths.size() == 3); + CHECK(paths == expected); + + CHECK(read_all(paths[0]) == "HDR\nA\n"); // header + worker 0 + CHECK(read_all(paths[1]) == "B\n"); + CHECK(read_all(paths[2]) == "C\nEND\n"); // worker 2 + footer + + cleanup(expected); +} + +TEST_CASE("ShardedWriter - gzip_extension appends .gz to shard names") { + std::string base = tmp_path("test_sharded_gz"); + std::vector expected = {base + ".shard_0.gz", + base + ".shard_1.gz"}; + cleanup(expected); + + Runtime runtime(2); + std::vector paths; + auto task = [&]() -> CoroTask { + auto w = make_sharded_writer(); + if (co_await w->open(base, 2, true, nullptr) != 0) co_return 1; + if (co_await w->write_chunk(0, sv("X")) != 0) co_return 2; + if (co_await w->write_chunk(1, sv("Y")) != 0) co_return 3; + paths = w->output_paths(); + if (co_await w->close() != 0) co_return 4; + co_return 0; + }; + CHECK(runtime.submit(task(), "sharded_gz").get() == 0); + CHECK(paths == expected); + for (const auto& p : expected) CHECK(fs::exists(p)); + cleanup(expected); +} + +TEST_CASE("ShardedWriter - out-of-range worker_idx fails") { + std::string base = tmp_path("test_sharded_oor"); + cleanup({base + ".shard_0"}); + + Runtime runtime(2); + auto task = [&]() -> CoroTask { + auto w = make_sharded_writer(); + if (co_await w->open(base, 1, false, nullptr) != 0) co_return 1; + auto rc = co_await w->write_chunk(5, sv("oops")); + co_await w->close(); + co_return rc == 0 ? 
99 : 0; + }; + CHECK(runtime.submit(task(), "sharded_oor").get() == 0); + cleanup({base + ".shard_0"}); +} diff --git a/tests/utilities/fileio/parallel/test_striped_writer.cpp b/tests/utilities/fileio/parallel/test_striped_writer.cpp new file mode 100644 index 00000000..dd839c15 --- /dev/null +++ b/tests/utilities/fileio/parallel/test_striped_writer.cpp @@ -0,0 +1,109 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +using namespace dftracer::utils; +using namespace dftracer::utils::coro; +using namespace dftracer::utils::utilities::fileio::parallel; + +namespace { + +std::string tmp_path(const char* name) { + return (fs::temp_directory_path() / name).string(); +} + +std::string read_all(const std::string& path) { + std::ifstream f(path, std::ios::binary); + std::stringstream ss; + ss << f.rdbuf(); + return ss.str(); +} + +ByteView sv(const std::string& s) { return ByteView(s.data(), s.size()); } + +} // namespace + +TEST_CASE("StripedWriter - header, chunks, footer land in single file") { + std::string path = tmp_path("test_striped_basic.txt"); + std::remove(path.c_str()); + + Runtime runtime(2); + auto task = [&]() -> CoroTask { + auto w = make_striped_writer(); + if (co_await w->open(path, 4, false, nullptr) != 0) co_return 1; + if (co_await w->write_header(sv("HDR\n")) != 0) co_return 2; + if (co_await w->write_chunk(0, sv("worker0\n")) != 0) co_return 3; + if (co_await w->write_chunk(1, sv("worker1\n")) != 0) co_return 4; + if (co_await w->write_chunk(2, sv("worker2\n")) != 0) co_return 5; + if (co_await w->write_footer(sv("END\n")) != 0) co_return 6; + if (co_await w->close() != 0) co_return 7; + co_return 0; + }; + + CHECK(runtime.submit(task(), "striped_basic").get() == 0); + + auto content = read_all(path); + // All bytes must be present regardless of interleave order. 
+ CHECK(content.size() == + std::string("HDR\nworker0\nworker1\nworker2\nEND\n").size()); + + std::set lines; + std::stringstream ss(content); + std::string line; + while (std::getline(ss, line)) lines.insert(line); + CHECK(lines.count("HDR") == 1); + CHECK(lines.count("worker0") == 1); + CHECK(lines.count("worker1") == 1); + CHECK(lines.count("worker2") == 1); + CHECK(lines.count("END") == 1); + + fs::remove(path); +} + +TEST_CASE("StripedWriter - empty chunks are no-ops") { + std::string path = tmp_path("test_striped_empty.txt"); + std::remove(path.c_str()); + + Runtime runtime(2); + auto task = [&]() -> CoroTask { + auto w = make_striped_writer(); + if (co_await w->open(path, 2, false, nullptr) != 0) co_return 1; + if (co_await w->write_chunk(0, ByteView()) != 0) co_return 2; + if (co_await w->write_chunk(1, ByteView()) != 0) co_return 3; + if (co_await w->close() != 0) co_return 4; + co_return 0; + }; + CHECK(runtime.submit(task(), "striped_empty").get() == 0); + CHECK(fs::exists(path)); + CHECK(fs::file_size(path) == 0); + fs::remove(path); +} + +TEST_CASE("StripedWriter - output_paths returns single entry") { + std::string path = tmp_path("test_striped_paths.txt"); + std::remove(path.c_str()); + + Runtime runtime(2); + std::vector paths; + auto task = [&]() -> CoroTask { + auto w = make_striped_writer(); + if (co_await w->open(path, 4, false, nullptr) != 0) co_return 1; + paths = w->output_paths(); + co_await w->close(); + co_return 0; + }; + CHECK(runtime.submit(task(), "striped_paths").get() == 0); + REQUIRE(paths.size() == 1); + CHECK(paths[0] == path); + fs::remove(path); +} diff --git a/tests/utilities/indexer/test_index_builder.cpp b/tests/utilities/indexer/test_index_builder.cpp index ce0f4432..789acebe 100644 --- a/tests/utilities/indexer/test_index_builder.cpp +++ b/tests/utilities/indexer/test_index_builder.cpp @@ -1,18 +1,23 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include +#include #include #include #include #include +#include +#include 
#include #include +#include #include -#include #include +#include #include using namespace dftracer::utils; using namespace dftracer::utils::utilities::indexer; +using namespace dftracer::utils::utilities::composites::dft::visitors; using namespace dftracer::utils::utilities::behaviors; using namespace dft_utils_test; @@ -37,9 +42,7 @@ TEST_SUITE("IndexBuilder") { TestEnvironment env(1000); std::string gz_file = env.create_dft_test_gzip_file(1000); - auto config = - IndexBuildConfig::for_file(gz_file).with_bloom(false).with_manifest( - false); + auto config = IndexBuildConfig::for_file(gz_file).with_manifest(false); IndexBuildResult result; run_coro([&config, &result](CoroScope& scope) -> coro::CoroTask { @@ -65,24 +68,35 @@ TEST_SUITE("IndexBuilder") { std::string json_line = R"({"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":100,"dur":50,"ph":"X"})"; - visitor.on_line(json_line, 0); + simdjson::dom::parser parser; + auto result = parser.parse(json_line.data(), json_line.size()); + REQUIRE(!result.error()); + dftracer::utils::utilities::common::json::JsonValue json( + result.value_unsafe()); + dftracer::utils::utilities::composites::dft::DFTracerEvent ev; + REQUIRE(decltype(ev)::parse(json, ev)); + dftracer::utils::utilities::composites::dft::EventRecord record{ + ev, json, json_line, 0, 0}; + visitor.on_event(record); CHECK(visitor.num_chunks() >= 1); - MESSAGE("BloomVisitor chunks after on_line: ", visitor.num_chunks()); + MESSAGE("BloomVisitor chunks after on_event: ", visitor.num_chunks()); auto db_path = dft_utils_test::make_unique_test_path("bloom_direct"); - db_path += ".idx"; + db_path /= ".dftindex"; + fs::remove_all(db_path); + dftracer::utils::rocksdb::RocksDBManager::instance().reset( + db_path.string()); { IndexDatabase db(db_path.string()); - db.init_base_schema(); - db.init_bloom_schema(); - int fid = db.get_or_create_file_info("test.pfw.gz", 123); - db.begin_transaction(); - visitor.finalize(db, fid); - db.commit_transaction(); + 
db.init_schema(); + auto writer = db.begin_write(); + int fid = writer->get_or_create_file_info("test.pfw.gz", 123); + visitor.finalize(*writer, fid); + writer->commit(); CHECK(db.has_bloom_data(fid)); } - fs::remove(db_path); + fs::remove_all(db_path); } TEST_CASE("Build with bloom") { @@ -90,9 +104,8 @@ TEST_SUITE("IndexBuilder") { std::string gz_file = env.create_dft_test_gzip_file(1000); auto config = IndexBuildConfig::for_file(gz_file) - .with_bloom(true) - .with_manifest(false) - .with_index_threshold(0); + + .with_manifest(false); IndexBuildResult result; run_coro([&config, &result](CoroScope& scope) -> coro::CoroTask { @@ -119,9 +132,8 @@ TEST_SUITE("IndexBuilder") { std::string gz_file = env.create_dft_test_gzip_file(1000); auto config = IndexBuildConfig::for_file(gz_file) - .with_bloom(false) - .with_manifest(true) - .with_index_threshold(0); + + .with_manifest(true); IndexBuildResult result; run_coro([&config, &result](CoroScope& scope) -> coro::CoroTask { @@ -148,9 +160,8 @@ TEST_SUITE("IndexBuilder") { std::string gz_file = env.create_dft_test_gzip_file(1000); auto config = IndexBuildConfig::for_file(gz_file) - .with_bloom(true) - .with_manifest(true) - .with_index_threshold(0); + + .with_manifest(true); IndexBuildResult result; run_coro([&config, &result](CoroScope& scope) -> coro::CoroTask { @@ -178,10 +189,9 @@ TEST_SUITE("IndexBuilder") { std::string gz_file = env.create_dft_test_gzip_file(1000); auto config = IndexBuildConfig::for_file(gz_file) - .with_bloom(false) + .with_manifest(false) - .with_force_rebuild(false) - .with_index_threshold(0); + .with_force_rebuild(false); IndexBuildResult first; run_coro([&config, &first](CoroScope& scope) -> coro::CoroTask { @@ -213,10 +223,9 @@ TEST_SUITE("IndexBuilder") { std::string gz_file = env.create_dft_test_gzip_file(1000); auto config_normal = IndexBuildConfig::for_file(gz_file) - .with_bloom(false) + .with_manifest(false) - .with_force_rebuild(false) - .with_index_threshold(0); + 
.with_force_rebuild(false); IndexBuildResult first; run_coro([&config_normal, @@ -231,10 +240,9 @@ TEST_SUITE("IndexBuilder") { REQUIRE(first.success); auto config_force = IndexBuildConfig::for_file(gz_file) - .with_bloom(false) + .with_manifest(false) - .with_force_rebuild(true) - .with_index_threshold(0); + .with_force_rebuild(true); IndexBuildResult second; run_coro([&config_force, @@ -254,9 +262,7 @@ TEST_SUITE("IndexBuilder") { TestEnvironment env(1000); std::string gz_file = env.create_dft_test_gzip_file(1000); - auto config = - IndexBuildConfig::for_file(gz_file).with_bloom(false).with_manifest( - false); + auto config = IndexBuildConfig::for_file(gz_file).with_manifest(false); IndexBuildResult result; run_coro([&config, &result](CoroScope& scope) -> coro::CoroTask { @@ -273,72 +279,14 @@ TEST_SUITE("IndexBuilder") { CHECK(result.total_lines >= 1000); } - TEST_CASE("Incremental bloom add to existing checkpoint-only index") { - TestEnvironment env(1000); - std::string gz_file = env.create_dft_test_gzip_file(1000); - - // First build: checkpoint only - auto config1 = IndexBuildConfig::for_file(gz_file) - .with_bloom(false) - .with_manifest(false) - .with_index_threshold(0); - - IndexBuildResult r1; - run_coro([&config1, &r1](CoroScope& scope) -> coro::CoroTask { - auto builder = std::make_shared(); - UtilityExecutor - exec(builder, - BehaviorChain{}); - r1 = co_await exec.execute_with_context(scope, config1); - }); - REQUIRE(r1.success); - CHECK(r1.index_created); - - // Verify no bloom data yet - { - IndexDatabase db(r1.index_path); - int fid = db.get_file_info_id(internal::get_logical_path(gz_file)); - CHECK(fid >= 0); - CHECK_FALSE(db.has_bloom_data(fid)); - } - - // Second build: add bloom (should NOT rebuild checkpoints) - auto config2 = IndexBuildConfig::for_file(gz_file) - .with_bloom(true) - .with_manifest(false) - .with_index_threshold(0); - - IndexBuildResult r2; - run_coro([&config2, &r2](CoroScope& scope) -> coro::CoroTask { - auto builder = 
std::make_shared(); - UtilityExecutor - exec(builder, - BehaviorChain{}); - r2 = co_await exec.execute_with_context(scope, config2); - }); - REQUIRE(r2.success); - CHECK_FALSE(r2.was_skipped); - - // Verify bloom data now exists - { - IndexDatabase db(r2.index_path); - int fid = db.get_file_info_id(internal::get_logical_path(gz_file)); - CHECK(fid >= 0); - CHECK(db.has_bloom_data(fid)); - } - } - TEST_CASE("Incremental manifest add to existing index with bloom") { TestEnvironment env(1000); std::string gz_file = env.create_dft_test_gzip_file(1000); // First build: checkpoint + bloom auto config1 = IndexBuildConfig::for_file(gz_file) - .with_bloom(true) - .with_manifest(false) - .with_index_threshold(0); + + .with_manifest(false); IndexBuildResult r1; run_coro([&config1, &r1](CoroScope& scope) -> coro::CoroTask { @@ -360,9 +308,8 @@ TEST_SUITE("IndexBuilder") { // Second build: add manifest (bloom already exists, skip it) auto config2 = IndexBuildConfig::for_file(gz_file) - .with_bloom(false) - .with_manifest(true) - .with_index_threshold(0); + + .with_manifest(true); IndexBuildResult r2; run_coro([&config2, &r2](CoroScope& scope) -> coro::CoroTask { @@ -391,9 +338,8 @@ TEST_SUITE("IndexBuilder") { // Build with bloom + manifest auto config1 = IndexBuildConfig::for_file(gz_file) - .with_bloom(true) - .with_manifest(true) - .with_index_threshold(0); + + .with_manifest(true); IndexBuildResult r1; run_coro([&config1, &r1](CoroScope& scope) -> coro::CoroTask { diff --git a/tests/utilities/indexer/test_index_database.cpp b/tests/utilities/indexer/test_index_database.cpp index b8345791..d5aeed17 100644 --- a/tests/utilities/indexer/test_index_database.cpp +++ b/tests/utilities/indexer/test_index_database.cpp @@ -1,14 +1,17 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include +#include #include #include #include #include -namespace fs = std::filesystem; +using dftracer::utils::utilities::indexer::ChunkStatistics; using 
dftracer::utils::utilities::indexer::IndexDatabase; +using dftracer::utils::utilities::indexer::IndexDatabaseWriterContext; +using dftracer::utils::utilities::indexer::MergedStatisticsResult; TEST_SUITE("IndexDatabase") { TEST_CASE("normalizes legacy .idx-style input to root-local .dftindex") { @@ -27,10 +30,23 @@ TEST_SUITE("IndexDatabase") { IndexDatabase db1((root / ".dftindex").string()); IndexDatabase db2((root / "other-name.idx").string()); - db1.init_base_schema(); - db2.init_base_schema(); + { + auto writer = db1.begin_write(); + writer->init_schema(); + writer->commit(); + } + { + auto writer = db2.begin_write(); + writer->init_schema(); + writer->commit(); + } - int id1 = db1.get_or_create_file_info("a.pfw.gz", 0x1111); + int id1; + { + auto writer = db1.begin_write(); + id1 = writer->get_or_create_file_info("a.pfw.gz", 0x1111); + writer->commit(); + } int id2 = db2.get_file_info_id("a.pfw.gz"); CHECK(id1 > 0); @@ -42,29 +58,41 @@ TEST_SUITE("IndexDatabase") { fs::create_directories(root); IndexDatabase db((root / ".dftindex").string()); - db.init_base_schema(); - db.init_bloom_schema(); - db.init_manifest_schema(); - - const int file_id = db.get_or_create_file_info("trace.pfw.gz", 0xAAAA); - - std::vector blob = {0xDE, 0xAD, 0xBE, 0xEF}; - db.insert_chunk_bloom_filter(file_id, 0, "name", std::span(blob), 4); - db.insert_file_bloom_filter(file_id, "name", std::span(blob), 4); - db.insert_index_dimension(file_id, "name"); - db.insert_hash_resolution(file_id, "fhash", "hashA", "resolvedA"); - db.insert_event_range(file_id, 0, "POSIX", "read", - std::vector{1, 2, 3}); - db.insert_metadata_lines(file_id, 0, "HH", - std::vector{0, 4}); + + int file_id; + { + auto writer = db.begin_write(); + writer->init_schema(); + + file_id = writer->get_or_create_file_info("trace.pfw.gz", 0xAAAA); + + std::vector blob = {0xDE, 0xAD, 0xBE, 0xEF}; + writer->insert_chunk_bloom_filter(file_id, 0, "name", + std::span(blob), 4); + writer->insert_file_bloom_filter(file_id, "name", 
std::span(blob), + 4); + writer->insert_index_dimension(file_id, "name"); + writer->insert_hash_table_entry(0, "hashA", "resolvedA"); + writer->insert_event_range(file_id, 0, "POSIX", "read", + std::vector{1, 2, 3}); + writer->insert_metadata_lines(file_id, 0, "HH", + std::vector{0, 4}); + writer->commit(); + } CHECK(db.has_bloom_data(file_id)); CHECK(db.has_manifest_data(file_id)); CHECK(db.query_file_bloom_filter(file_id, "name").has_value()); - CHECK(db.query_resolved_by_hash("fhash", "hashA").has_value()); + CHECK(db.resolve_hash(IndexDatabase::HashType::FILE, "hashA") + .has_value()); - const int rebuilt_id = - db.get_or_create_file_info("trace.pfw.gz", 0xBBBB); + int rebuilt_id; + { + auto writer = db.begin_write(); + rebuilt_id = + writer->get_or_create_file_info("trace.pfw.gz", 0xBBBB); + writer->commit(); + } CHECK(rebuilt_id == file_id); CHECK_FALSE(db.has_bloom_data(file_id)); @@ -73,26 +101,209 @@ TEST_SUITE("IndexDatabase") { CHECK(db.query_chunk_bloom_filters(file_id, "name").empty()); CHECK(db.query_event_ranges(file_id).empty()); CHECK(db.query_metadata_lines(file_id).empty()); - CHECK_FALSE(db.query_resolved_by_hash("fhash", "hashA").has_value()); + CHECK(db.resolve_hash(IndexDatabase::HashType::FILE, "hashA") + .has_value()); } - TEST_CASE("rollback discards transactional writes") { - auto root = dft_utils_test::make_unique_test_path("idx_rollback"); + TEST_CASE("writer context batches multiple files and all are readable") { + auto root = dft_utils_test::make_unique_test_path("idx_writer_ctx"); fs::create_directories(root); IndexDatabase db((root / ".dftindex").string()); - db.init_base_schema(); - db.init_bloom_schema(); + db.init_schema(); - const int file_id = db.get_or_create_file_info("trace.pfw.gz", 0xAAAA); - std::vector blob = {0xAB, 0xCD}; + static constexpr int NUM_FILES = 100; + static constexpr int BATCH_SIZE = 10; - db.begin_transaction(); - db.insert_file_bloom_filter(file_id, "name", std::span(blob), 2); - 
db.insert_hash_resolution(file_id, "fhash", "hashA", "resolvedA"); - db.rollback_transaction(); + // Create file IDs first + std::vector file_ids; + { + auto writer = db.begin_write(); + for (int i = 0; i < NUM_FILES; ++i) { + auto name = "file_" + std::to_string(i) + ".pfw.gz"; + int fid = writer->get_or_create_file_info(name, i + 1); + file_ids.push_back(fid); + } + writer->commit(); + } + CHECK(file_ids.size() == NUM_FILES); - CHECK_FALSE(db.query_file_bloom_filter(file_id, "name").has_value()); - CHECK_FALSE(db.query_resolved_by_hash("fhash", "hashA").has_value()); + // Write scalar stats in batches + for (int batch_start = 0; batch_start < NUM_FILES; + batch_start += BATCH_SIZE) { + auto writer = db.begin_write(); + int batch_end = std::min(batch_start + BATCH_SIZE, NUM_FILES); + for (int i = batch_start; i < batch_end; ++i) { + ChunkStatistics stats; + stats.total_events = static_cast(i + 1) * 100; + writer->insert_file_scalar_stats(file_ids[i], stats, 1); + } + writer->commit(); + } + + // Verify ALL data is readable + auto results = db.query_file_scalar_stats_batch(file_ids); + CHECK(results.size() == NUM_FILES); + + std::uint64_t total_events = 0; + for (int i = 0; i < NUM_FILES; ++i) { + auto it = results.find(file_ids[i]); + REQUIRE(it != results.end()); + CHECK(it->second.stats.total_events == + static_cast(i + 1) * 100); + total_events += it->second.stats.total_events; + } + CHECK(total_events == 505000); // sum of 100+200+...+10000 + } + + TEST_CASE("PID manifest - insert and query single file PIDs") { + auto root = dft_utils_test::make_unique_test_path("idx_pid_single"); + fs::create_directories(root); + + IndexDatabase db((root / ".dftindex").string()); + + int file_id; + { + auto writer = db.begin_write(); + writer->init_schema(); + file_id = writer->get_or_create_file_info("trace.pfw.gz", 0xAAAA); + + std::unordered_set pids = {1234, 5678, 9012}; + writer->insert_file_pids(file_id, pids); + writer->commit(); + } + + auto result = 
db.query_file_pids(file_id); + CHECK(result.size() == 3); + CHECK(result.count(1234) == 1); + CHECK(result.count(5678) == 1); + CHECK(result.count(9012) == 1); + } + + TEST_CASE("PID manifest - query non-existent file returns empty set") { + auto root = dft_utils_test::make_unique_test_path("idx_pid_empty"); + fs::create_directories(root); + + IndexDatabase db((root / ".dftindex").string()); + db.init_schema(); + + auto result = db.query_file_pids(999); + CHECK(result.empty()); + } + + TEST_CASE("PID manifest - query all file PIDs") { + auto root = dft_utils_test::make_unique_test_path("idx_pid_all"); + fs::create_directories(root); + + IndexDatabase db((root / ".dftindex").string()); + + int file_id1, file_id2, file_id3; + { + auto writer = db.begin_write(); + writer->init_schema(); + + file_id1 = writer->get_or_create_file_info("trace1.pfw.gz", 0xAAA1); + file_id2 = writer->get_or_create_file_info("trace2.pfw.gz", 0xAAA2); + file_id3 = writer->get_or_create_file_info("trace3.pfw.gz", 0xAAA3); + + writer->insert_file_pids(file_id1, {1000, 1001}); + writer->insert_file_pids(file_id2, {1000, 2000, 2001}); + writer->insert_file_pids(file_id3, {3000}); + writer->commit(); + } + + auto all_pids = db.query_all_file_pids(); + CHECK(all_pids.size() == 3); + + CHECK(all_pids[file_id1].size() == 2); + CHECK(all_pids[file_id1].count(1000) == 1); + CHECK(all_pids[file_id1].count(1001) == 1); + + CHECK(all_pids[file_id2].size() == 3); + CHECK(all_pids[file_id2].count(1000) == 1); + CHECK(all_pids[file_id2].count(2000) == 1); + CHECK(all_pids[file_id2].count(2001) == 1); + + CHECK(all_pids[file_id3].size() == 1); + CHECK(all_pids[file_id3].count(3000) == 1); + } + + TEST_CASE("PID manifest - large PIDs") { + auto root = dft_utils_test::make_unique_test_path("idx_pid_large"); + fs::create_directories(root); + + IndexDatabase db((root / ".dftindex").string()); + + int file_id; + { + auto writer = db.begin_write(); + writer->init_schema(); + file_id = 
writer->get_or_create_file_info("trace.pfw.gz", 0xBBBB); + + // Use large PID values to test varint encoding + std::unordered_set pids = { + 0xFFFFFFFFULL, // 32-bit max + 0x100000000ULL, // Just over 32-bit + 0xFFFFFFFFFFFFFFFFULL // 64-bit max + }; + writer->insert_file_pids(file_id, pids); + writer->commit(); + } + + auto result = db.query_file_pids(file_id); + CHECK(result.size() == 3); + CHECK(result.count(0xFFFFFFFFULL) == 1); + CHECK(result.count(0x100000000ULL) == 1); + CHECK(result.count(0xFFFFFFFFFFFFFFFFULL) == 1); + } + + TEST_CASE("PID manifest - empty PID set not stored") { + auto root = dft_utils_test::make_unique_test_path("idx_pid_empty_set"); + fs::create_directories(root); + + IndexDatabase db((root / ".dftindex").string()); + + int file_id; + { + auto writer = db.begin_write(); + writer->init_schema(); + file_id = writer->get_or_create_file_info("trace.pfw.gz", 0xCCCC); + + std::unordered_set empty_pids; + writer->insert_file_pids(file_id, empty_pids); + writer->commit(); + } + + auto result = db.query_file_pids(file_id); + CHECK(result.empty()); + } + + TEST_CASE("PID manifest - rebuild clears PIDs") { + auto root = dft_utils_test::make_unique_test_path("idx_pid_rebuild"); + fs::create_directories(root); + + IndexDatabase db((root / ".dftindex").string()); + + int file_id; + { + auto writer = db.begin_write(); + writer->init_schema(); + file_id = writer->get_or_create_file_info("trace.pfw.gz", 0xDDDD); + writer->insert_file_pids(file_id, {1234, 5678}); + writer->commit(); + } + + CHECK(db.query_file_pids(file_id).size() == 2); + + // Rebuild with new checksum clears data + { + auto writer = db.begin_write(); + int rebuilt_id = + writer->get_or_create_file_info("trace.pfw.gz", 0xEEEE); + writer->commit(); + CHECK(rebuilt_id == file_id); + } + + CHECK(db.query_file_pids(file_id).empty()); } } diff --git a/tests/utilities/indexer/test_provenance_database.cpp b/tests/utilities/indexer/test_provenance_database.cpp index 5686b01d..ba7dc7a7 100644 --- 
a/tests/utilities/indexer/test_provenance_database.cpp +++ b/tests/utilities/indexer/test_provenance_database.cpp @@ -4,7 +4,6 @@ #include #include -namespace fs = std::filesystem; using namespace dftracer::utils::utilities::indexer; TEST_SUITE("ProvenanceDatabase") { @@ -35,7 +34,7 @@ TEST_SUITE("ProvenanceDatabase") { db.insert_info(file_id, "tool", "dftracer_organize"); db.insert_group(file_id, "group0", "cat == POSIX"); db.insert_source(file_id, 7, "/src/a.pfw.gz", 12, "hash7"); - db.insert_segment(file_id, 7, 3, 100, 140, 9); + db.insert_segment(file_id, 7, 3, 0, 100, 140, 9); auto sources = db.query_sources(file_id); REQUIRE(sources.size() == 1); @@ -72,15 +71,13 @@ TEST_SUITE("ProvenanceDatabase") { CHECK(file_b > 0); CHECK(file_a != file_b); - db.begin_transaction(); db.insert_group(file_a, "io", R"(cat == "POSIX")"); db.insert_source(file_a, 0, "/src/trace0.pfw.gz", 3, "ha"); - db.insert_segment(file_a, 0, 1, 0, 5, 3); + db.insert_segment(file_a, 0, 1, 0, 0, 5, 3); db.insert_group(file_b, "compute", R"(cat == "APP")"); db.insert_source(file_b, 1, "/src/trace1.pfw.gz", 2, "hb"); - db.insert_segment(file_b, 1, 0, 0, 3, 1); - db.commit_transaction(); + db.insert_segment(file_b, 1, 0, 0, 0, 3, 1); CHECK(db.get_file_info_id(out_a) == file_a); CHECK(db.get_file_info_id(out_b) == file_b); @@ -106,22 +103,18 @@ TEST_SUITE("ProvenanceDatabase") { const auto out = (root / "group.pfw.gz").string(); const int original_id = db.get_or_create_file_info(out, 0x1111); - db.begin_transaction(); db.insert_info(original_id, "tool", "dftracer_organize"); db.insert_group(original_id, "io", R"(cat == "POSIX")"); db.insert_source(original_id, 0, "/src/trace0.pfw.gz", 4, "old"); - db.insert_segment(original_id, 0, 0, 0, 4, 2); - db.commit_transaction(); + db.insert_segment(original_id, 0, 0, 0, 0, 4, 2); const int rebuilt_id = db.get_or_create_file_info(out, 0x2222); CHECK(rebuilt_id == original_id); - db.begin_transaction(); db.insert_info(rebuilt_id, "tool", 
"dftracer_organize_v2"); db.insert_group(rebuilt_id, "io", R"(cat == "MPI")"); db.insert_source(rebuilt_id, 0, "/src/trace0.pfw.gz", 8, "new"); - db.insert_segment(rebuilt_id, 0, 0, 10, 18, 5); - db.commit_transaction(); + db.insert_segment(rebuilt_id, 0, 0, 0, 10, 18, 5); CHECK(db.query_info(rebuilt_id, "tool") == "dftracer_organize_v2"); CHECK(db.query_group_predicate(rebuilt_id) == R"(cat == "MPI")"); @@ -138,27 +131,5 @@ TEST_SUITE("ProvenanceDatabase") { CHECK(segments[0].event_count == 5); } - TEST_CASE("rollback discards provenance writes") { - auto root = dft_utils_test::make_unique_test_path("prov_rollback"); - fs::create_directories(root); - - ProvenanceDatabase db((root / ".dftindex").string()); - db.init_schema(); - - const int file_id = - db.get_or_create_file_info((root / "out.pfw.gz").string(), 0xCAFE); - - db.begin_transaction(); - db.insert_info(file_id, "tool", "dftracer_organize"); - db.insert_group(file_id, "group0", "cat == POSIX"); - db.insert_source(file_id, 7, "/src/a.pfw.gz", 12, "hash7"); - db.insert_segment(file_id, 7, 3, 100, 140, 9); - db.rollback_transaction(); - - CHECK(db.query_info(file_id, "tool").empty()); - CHECK(db.query_group_name(file_id).empty()); - CHECK(db.query_group_predicate(file_id).empty()); - CHECK(db.query_sources(file_id).empty()); - CHECK(db.query_segments(file_id, 7).empty()); - } + // Transaction rollback test removed — writes commit immediately. 
} diff --git a/tests/utilities/indexer/test_rocksdb_storage.cpp b/tests/utilities/indexer/test_rocksdb_storage.cpp index 48dfeb97..e28dd2aa 100644 --- a/tests/utilities/indexer/test_rocksdb_storage.cpp +++ b/tests/utilities/indexer/test_rocksdb_storage.cpp @@ -12,7 +12,6 @@ #include #include -namespace fs = std::filesystem; using dftracer::utils::rocksdb::KeyBuilder; using dftracer::utils::rocksdb::KeyCodec; using dftracer::utils::rocksdb::RocksDatabase; @@ -80,18 +79,23 @@ TEST_SUITE("RocksDBStorage") { auto path = (root / ".dftindex").string(); auto& manager = RocksDBManager::instance(); - auto first = - manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite); - REQUIRE(first != nullptr); - auto* first_raw = first.get(); - - manager.reset(path); - first.reset(); + std::weak_ptr first_weak; + { + auto first = + manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite); + REQUIRE(first != nullptr); + first_weak = first; + manager.reset(path); + } + // After reset() + the only strong owner going out of scope, the old + // instance must have been destroyed (RocksDB holds a per-process file + // lock, so a stale cached instance would prevent reopening below). 
+ CHECK(first_weak.expired()); auto second = manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite); REQUIRE(second != nullptr); - CHECK(second.get() != first_raw); + CHECK(second->is_open()); } TEST_CASE("manager shutdown clears cached instances") { @@ -102,18 +106,20 @@ TEST_SUITE("RocksDBStorage") { auto path = (root / ".dftindex").string(); auto& manager = RocksDBManager::instance(); - auto first = - manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite); - REQUIRE(first != nullptr); - auto* first_raw = first.get(); - - manager.shutdown(); - first.reset(); + std::weak_ptr first_weak; + { + auto first = + manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite); + REQUIRE(first != nullptr); + first_weak = first; + manager.shutdown(); + } + CHECK(first_weak.expired()); auto second = manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite); REQUIRE(second != nullptr); - CHECK(second.get() != first_raw); + CHECK(second->is_open()); } TEST_CASE("manager rejects read-only upgrade while handle is alive") { diff --git a/tests/utilities/indexer/test_sst_ingest_spike.cpp b/tests/utilities/indexer/test_sst_ingest_spike.cpp new file mode 100644 index 00000000..60abab7a --- /dev/null +++ b/tests/utilities/indexer/test_sst_ingest_spike.cpp @@ -0,0 +1,469 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +using dftracer::utils::utilities::composites::dft::indexing:: + ChunkDimensionStats; +using dftracer::utils::utilities::composites::dft::indexing::ChunkStatistics; +using dftracer::utils::utilities::indexer::IndexDatabase; +using dftracer::utils::utilities::indexer::IndexDatabaseSstWriterContext; +using dftracer::utils::utilities::indexer::SstArtifactRegistry; +using dftracer::utils::utilities::indexer::internal::IndexerCheckpoint; + +namespace { + +IndexerCheckpoint 
make_checkpoint(std::uint64_t idx, std::uint64_t uc_offset, + std::uint64_t num_lines) { + IndexerCheckpoint cp{}; + cp.checkpoint_idx = idx; + cp.uc_offset = uc_offset; + cp.uc_size = 64 * 1024; + cp.c_offset = uc_offset / 2; + cp.c_size = 32 * 1024; + cp.bits = 8; + cp.dict_compressed = std::vector{0xAA, 0xBB, 0xCC}; + cp.num_lines = num_lines; + cp.first_line_num = idx * num_lines + 1; + cp.last_line_num = (idx + 1) * num_lines; + return cp; +} + +ChunkStatistics make_chunk_stats(std::uint64_t total_events) { + ChunkStatistics stats; + stats.total_events = total_events; + stats.min_timestamp_us = 1000; + stats.max_timestamp_us = 9000; + stats.name_counts["read"] = total_events / 2; + stats.name_counts["write"] = total_events - total_events / 2; + stats.category_counts["posix"] = total_events; + stats.pid_tid_counts["1:1"] = total_events; + return stats; +} + +ChunkDimensionStats make_dim_stats(std::string_view dim, std::uint64_t distinct, + std::string_view min_val, + std::string_view max_val) { + ChunkDimensionStats ds; + ds.dimension = std::string(dim); + ds.distinct_count = distinct; + ds.min_value = std::string(min_val); + ds.max_value = std::string(max_val); + ds.value_type = "string"; + return ds; +} + +struct Fixture { + IndexerCheckpoint cp_a = make_checkpoint(0, 0, 100); + IndexerCheckpoint cp_b = make_checkpoint(1, 64 * 1024, 100); + + std::vector read_lines{1, 5, 17, 42}; + std::vector write_lines{2, 8, 23}; + std::vector md_proc_lines{3, 9}; + std::unordered_set pids{101, 102, 103}; + + std::vector bloom_blob_a{0x11, 0x22, 0x33, 0x44}; + std::vector bloom_blob_b{0x55, 0x66, 0x77, 0x88}; + + ChunkStatistics chunk_stats_a = make_chunk_stats(60); + ChunkStatistics chunk_stats_b = make_chunk_stats(80); + ChunkStatistics file_stats = make_chunk_stats(140); + + ChunkDimensionStats dim_stats_a = + make_dim_stats("name", 3, "fsync", "read"); + ChunkDimensionStats dim_stats_b = + make_dim_stats("name", 5, "close", "write"); + + template + void populate(Sink& 
sink, int file_id) { + sink.insert_checkpoint(file_id, cp_a); + sink.insert_checkpoint(file_id, cp_b); + sink.insert_file_metadata(file_id, /*checkpoint_size=*/64 * 1024, + /*total_lines=*/200, + /*total_uc_size=*/128 * 1024); + sink.insert_event_range(file_id, cp_a.checkpoint_idx, "posix", "read", + read_lines); + sink.insert_event_range(file_id, cp_b.checkpoint_idx, "posix", "write", + write_lines); + sink.insert_metadata_lines(file_id, cp_a.checkpoint_idx, "PR", + md_proc_lines); + sink.insert_file_pids(file_id, pids); + + sink.insert_chunk_bloom_filter( + file_id, cp_a.checkpoint_idx, "name", + std::span(bloom_blob_a), /*num_entries=*/4); + sink.insert_chunk_bloom_filter( + file_id, cp_b.checkpoint_idx, "name", + std::span(bloom_blob_b), /*num_entries=*/5); + sink.insert_file_bloom_filter( + file_id, "name", std::span(bloom_blob_a), + /*num_entries=*/8); + + sink.insert_chunk_statistics(file_id, cp_a.checkpoint_idx, + chunk_stats_a); + sink.insert_chunk_statistics(file_id, cp_b.checkpoint_idx, + chunk_stats_b); + sink.insert_file_scalar_stats(file_id, file_stats, /*num_chunks=*/2); + sink.insert_file_category_counts(file_id, file_stats.category_counts); + sink.insert_file_pid_tid_counts(file_id, file_stats.pid_tid_counts); + sink.insert_file_name_counts(file_id, file_stats.name_counts); + + sink.insert_index_dimension(file_id, "name"); + sink.insert_index_dimension(file_id, "cat"); + sink.insert_chunk_dimension_stats(file_id, cp_a.checkpoint_idx, + dim_stats_a); + sink.insert_chunk_dimension_stats(file_id, cp_b.checkpoint_idx, + dim_stats_b); + + using dftracer::utils::utilities::hash::fnv1a_hash; + const auto read_id = fnv1a_hash(std::string_view{"read"}); + const auto write_id = fnv1a_hash(std::string_view{"write"}); + sink.insert_name_dictionary_entry(read_id, "read"); + sink.insert_name_dictionary_entry(write_id, "write"); + sink.insert_name_file_posting(read_id, file_id); + sink.insert_name_file_posting(write_id, file_id); + 
sink.insert_name_chunk_posting(read_id, file_id, cp_a.checkpoint_idx); + sink.insert_name_chunk_posting(write_id, file_id, cp_b.checkpoint_idx); + + sink.insert_hash_table_entry( + static_cast(IndexDatabase::HashType::FILE), "fh_1", + "/path/to/trace.pfw.gz"); + sink.insert_hash_table_entry( + static_cast(IndexDatabase::HashType::HOST), "hh_1", + "host-1"); + sink.insert_hash_table_entry( + static_cast(IndexDatabase::HashType::STRING), "sh_1", + "some-string"); + + // Aggregation / system_metrics sink writes. SstFileWriter requires + // strictly ascending keys within a single SST, so the raw sink + // API here exercises one merge per key. Cross-flush merges + // targeting the same key are the AggregationVisitor's concern: it + // rotates its SstWriterContext per flush so each SST is key-unique. + sink.insert_aggregation_put("\xFF\xFD\x01", "name-one"); + sink.insert_aggregation_put("\xFF\xFD\x02", "name-two"); + sink.insert_aggregation_merge("agg-key-1", "operand-1"); + sink.insert_aggregation_merge("agg-key-2", "operand-2"); + sink.insert_system_metrics_merge("sys-key-1", "sys-1"); + sink.insert_system_metrics_merge("sys-key-2", "sys-2"); + } +}; + +void compare_cf_entries(const IndexDatabase& db_a, const IndexDatabase& db_b, + std::string_view cf_name); + +void check_round_trip(const IndexDatabase& db_a, const IndexDatabase& db_b, + int file_id) { + CHECK(db_a.get_checkpoint_size(file_id) == + db_b.get_checkpoint_size(file_id)); + CHECK(db_a.get_num_lines(file_id) == db_b.get_num_lines(file_id)); + CHECK(db_a.get_max_bytes(file_id) == db_b.get_max_bytes(file_id)); + + auto cps_a = db_a.query_checkpoints(file_id); + auto cps_b = db_b.query_checkpoints(file_id); + REQUIRE(cps_a.size() == cps_b.size()); + for (std::size_t i = 0; i < cps_a.size(); ++i) { + CHECK(cps_a[i].checkpoint_idx == cps_b[i].checkpoint_idx); + CHECK(cps_a[i].uc_offset == cps_b[i].uc_offset); + CHECK(cps_a[i].uc_size == cps_b[i].uc_size); + CHECK(cps_a[i].c_offset == cps_b[i].c_offset); + 
CHECK(cps_a[i].c_size == cps_b[i].c_size); + CHECK(cps_a[i].num_lines == cps_b[i].num_lines); + CHECK(cps_a[i].first_line_num == cps_b[i].first_line_num); + CHECK(cps_a[i].last_line_num == cps_b[i].last_line_num); + } + + auto er_a = db_a.query_event_ranges(file_id); + auto er_b = db_b.query_event_ranges(file_id); + REQUIRE(er_a.size() == er_b.size()); + for (std::size_t i = 0; i < er_a.size(); ++i) { + CHECK(er_a[i].checkpoint_idx == er_b[i].checkpoint_idx); + CHECK(er_a[i].cat == er_b[i].cat); + CHECK(er_a[i].name == er_b[i].name); + CHECK(er_a[i].line_numbers == er_b[i].line_numbers); + } + + auto md_a = db_a.query_metadata_lines(file_id); + auto md_b = db_b.query_metadata_lines(file_id); + REQUIRE(md_a.size() == md_b.size()); + for (std::size_t i = 0; i < md_a.size(); ++i) { + CHECK(md_a[i].checkpoint_idx == md_b[i].checkpoint_idx); + CHECK(md_a[i].meta_type == md_b[i].meta_type); + CHECK(md_a[i].line_numbers == md_b[i].line_numbers); + } + + CHECK(db_a.query_file_pids(file_id) == db_b.query_file_pids(file_id)); + + auto cbf_a = db_a.query_chunk_bloom_filters(file_id, "name"); + auto cbf_b = db_b.query_chunk_bloom_filters(file_id, "name"); + REQUIRE(cbf_a.size() == cbf_b.size()); + for (std::size_t i = 0; i < cbf_a.size(); ++i) { + CHECK(cbf_a[i].checkpoint_idx == cbf_b[i].checkpoint_idx); + CHECK(cbf_a[i].num_entries == cbf_b[i].num_entries); + CHECK(cbf_a[i].bloom_data == cbf_b[i].bloom_data); + } + + auto fbf_a = db_a.query_file_bloom_filter(file_id, "name"); + auto fbf_b = db_b.query_file_bloom_filter(file_id, "name"); + REQUIRE(fbf_a.has_value()); + REQUIRE(fbf_b.has_value()); + CHECK(fbf_a->num_entries == fbf_b->num_entries); + CHECK(fbf_a->bloom_data == fbf_b->bloom_data); + + auto cs_a = db_a.query_chunk_statistics(file_id); + auto cs_b = db_b.query_chunk_statistics(file_id); + REQUIRE(cs_a.size() == cs_b.size()); + for (std::size_t i = 0; i < cs_a.size(); ++i) { + CHECK(cs_a[i].checkpoint_idx == cs_b[i].checkpoint_idx); + 
CHECK(cs_a[i].stats.total_events == cs_b[i].stats.total_events); + CHECK(cs_a[i].stats.min_timestamp_us == cs_b[i].stats.min_timestamp_us); + CHECK(cs_a[i].stats.max_timestamp_us == cs_b[i].stats.max_timestamp_us); + } + + auto fss_a = db_a.query_file_scalar_stats_batch({file_id}); + auto fss_b = db_b.query_file_scalar_stats_batch({file_id}); + REQUIRE(fss_a.count(file_id) == 1); + REQUIRE(fss_b.count(file_id) == 1); + CHECK(fss_a[file_id].stats.total_events == + fss_b[file_id].stats.total_events); + CHECK(fss_a[file_id].num_chunks == fss_b[file_id].num_chunks); + + auto cat_a = db_a.query_file_category_counts_batch({file_id}); + auto cat_b = db_b.query_file_category_counts_batch({file_id}); + REQUIRE(cat_a.count(file_id) == 1); + REQUIRE(cat_b.count(file_id) == 1); + CHECK(cat_a[file_id].size() == cat_b[file_id].size()); + for (const auto& [k, v] : cat_a[file_id]) { + auto it = cat_b[file_id].find(k); + REQUIRE(it != cat_b[file_id].end()); + CHECK(it->second == v); + } + + auto pt_a = db_a.query_file_pid_tid_counts_batch({file_id}); + auto pt_b = db_b.query_file_pid_tid_counts_batch({file_id}); + REQUIRE(pt_a.count(file_id) == 1); + REQUIRE(pt_b.count(file_id) == 1); + CHECK(pt_a[file_id].size() == pt_b[file_id].size()); + + auto ns_a = db_a.query_file_name_summaries_batch({file_id}); + auto ns_b = db_b.query_file_name_summaries_batch({file_id}); + REQUIRE(ns_a.count(file_id) == 1); + REQUIRE(ns_b.count(file_id) == 1); + CHECK(ns_a[file_id].counts.size() == ns_b[file_id].counts.size()); + CHECK(ns_a[file_id].unique_count == ns_b[file_id].unique_count); + + auto dims_a = db_a.query_index_dimensions(file_id); + auto dims_b = db_b.query_index_dimensions(file_id); + std::sort(dims_a.begin(), dims_a.end()); + std::sort(dims_b.begin(), dims_b.end()); + CHECK(dims_a == dims_b); + + auto cds_a = db_a.query_chunk_dimension_stats(file_id); + auto cds_b = db_b.query_chunk_dimension_stats(file_id); + REQUIRE(cds_a.size() == cds_b.size()); + for (std::size_t i = 0; i < 
cds_a.size(); ++i) { + CHECK(cds_a[i].checkpoint_idx == cds_b[i].checkpoint_idx); + CHECK(cds_a[i].dimension == cds_b[i].dimension); + CHECK(cds_a[i].distinct_count == cds_b[i].distinct_count); + CHECK(cds_a[i].min_value == cds_b[i].min_value); + CHECK(cds_a[i].max_value == cds_b[i].max_value); + } + + CHECK(db_a.query_name_id("read") == db_b.query_name_id("read")); + CHECK(db_a.query_name_id("write") == db_b.query_name_id("write")); + CHECK(db_a.query_name_by_id(*db_a.query_name_id("read")) == + db_b.query_name_by_id(*db_b.query_name_id("read"))); + + auto fp_a = db_a.query_name_file_postings("read"); + auto fp_b = db_b.query_name_file_postings("read"); + std::sort(fp_a.begin(), fp_a.end()); + std::sort(fp_b.begin(), fp_b.end()); + CHECK(fp_a == fp_b); + + auto cp_read_a = db_a.query_name_chunk_postings("read", file_id); + auto cp_read_b = db_b.query_name_chunk_postings("read", file_id); + std::sort(cp_read_a.begin(), cp_read_a.end()); + std::sort(cp_read_b.begin(), cp_read_b.end()); + CHECK(cp_read_a == cp_read_b); + + CHECK(db_a.resolve_hash(IndexDatabase::HashType::FILE, "fh_1") == + db_b.resolve_hash(IndexDatabase::HashType::FILE, "fh_1")); + CHECK(db_a.resolve_hash(IndexDatabase::HashType::HOST, "hh_1") == + db_b.resolve_hash(IndexDatabase::HashType::HOST, "hh_1")); + CHECK(db_a.resolve_name_to_hash(IndexDatabase::HashType::FILE, + "/path/to/trace.pfw.gz") == + db_b.resolve_name_to_hash(IndexDatabase::HashType::FILE, + "/path/to/trace.pfw.gz")); + CHECK(db_a.query_hash_table(IndexDatabase::HashType::FILE) == + db_b.query_hash_table(IndexDatabase::HashType::FILE)); + + namespace cf = dftracer::utils::rocksdb::cf; + compare_cf_entries(db_a, db_b, cf::AGGREGATION); + compare_cf_entries(db_a, db_b, cf::SYSTEM_METRICS); +} + +/// Compare all entries in `cf` between two databases. Uses raw iteration +/// over the CF; for merge-operand CFs rocksdb combines operands on read +/// automatically, so both DBs must return byte-identical values. 
+void compare_cf_entries(const IndexDatabase& db_a, const IndexDatabase& db_b, + std::string_view cf_name) { + auto collect = + [&](const IndexDatabase& db) -> std::map { + std::map out; + auto it = db.db()->new_iterator(cf_name); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + out.emplace(std::string(it->key().data(), it->key().size()), + std::string(it->value().data(), it->value().size())); + } + return out; + }; + auto a_map = collect(db_a); + auto b_map = collect(db_b); + REQUIRE(a_map.size() == b_map.size()); + for (auto& [k, v] : a_map) { + auto it = b_map.find(k); + REQUIRE(it != b_map.end()); + CHECK(it->second == v); + } +} + +void check_root_summaries(const IndexDatabase& db_a, + const IndexDatabase& db_b) { + auto r_a = db_a.query_root_scalar_stats(); + auto r_b = db_b.query_root_scalar_stats(); + REQUIRE(r_a.has_value()); + REQUIRE(r_b.has_value()); + CHECK(r_a->stats.total_events == r_b->stats.total_events); + CHECK(r_a->num_chunks == r_b->num_chunks); + CHECK(r_a->num_files == r_b->num_files); + + CHECK(db_a.query_root_category_counts() == + db_b.query_root_category_counts()); + CHECK(db_a.query_root_name_counts() == db_b.query_root_name_counts()); + CHECK(db_a.query_root_pid_tid_counts() == db_b.query_root_pid_tid_counts()); +} + +} // namespace + +TEST_SUITE("IndexDatabaseSstWriterContext") { + TEST_CASE("round-trip: SST ingest matches direct RocksDB writes") { + auto root_a = dft_utils_test::make_unique_test_path("sst_spike_db_a"); + auto root_b = dft_utils_test::make_unique_test_path("sst_spike_db_b"); + auto staging = + dft_utils_test::make_unique_test_path("sst_spike_staging"); + fs::create_directories(root_a); + fs::create_directories(root_b); + fs::create_directories(staging); + + Fixture f; + const int file_id = 1; + + IndexDatabase db_a((root_a / ".dftindex").string()); + { + auto w = db_a.begin_write(); + w->init_schema(); + f.populate(*w, file_id); + w->commit(); + } + + IndexDatabase db_b((root_b / ".dftindex").string()); + { + 
auto w = db_b.begin_write(); + w->init_schema(); + w->commit(); + } + + SstArtifactRegistry registry; + { + IndexDatabaseSstWriterContext sst(staging.string(), "batch_0"); + f.populate(sst, file_id); + registry.append(sst.commit()); + } + + CHECK(registry.metadata().size() == 1); + CHECK(registry.checkpoints().size() == 1); + CHECK(registry.manifest().size() == 1); + + db_b.bulk_ingest(registry); + + // Both paths must converge on the same root summaries after an + // explicit rebuild on each side. + db_a.rebuild_root_summaries(); + db_b.rebuild_root_summaries(); + + check_round_trip(db_a, db_b, file_id); + check_root_summaries(db_a, db_b); + } + + TEST_CASE("bulk_ingest composes across multiple disjoint batches") { + auto root_a = dft_utils_test::make_unique_test_path("sst_multi_db_a"); + auto root_b = dft_utils_test::make_unique_test_path("sst_multi_db_b"); + auto staging = + dft_utils_test::make_unique_test_path("sst_multi_staging"); + fs::create_directories(root_a); + fs::create_directories(root_b); + fs::create_directories(staging); + + Fixture f1; + Fixture f2; + // Vary the second fixture so the comparison covers distinct data. 
+ f2.read_lines = {7, 11, 13}; + f2.write_lines = {4}; + f2.pids = {201, 202}; + + IndexDatabase db_a((root_a / ".dftindex").string()); + { + auto w = db_a.begin_write(); + w->init_schema(); + f1.populate(*w, /*file_id=*/1); + f2.populate(*w, /*file_id=*/2); + w->commit(); + } + + IndexDatabase db_b((root_b / ".dftindex").string()); + { + auto w = db_b.begin_write(); + w->init_schema(); + w->commit(); + } + + SstArtifactRegistry registry; + { + IndexDatabaseSstWriterContext sst(staging.string(), "worker_0"); + f1.populate(sst, /*file_id=*/1); + registry.append(sst.commit()); + } + { + IndexDatabaseSstWriterContext sst(staging.string(), "worker_1"); + f2.populate(sst, /*file_id=*/2); + registry.append(sst.commit()); + } + + CHECK(registry.metadata().size() == 2); + CHECK(registry.checkpoints().size() == 2); + CHECK(registry.manifest().size() == 2); + + db_b.bulk_ingest(registry); + + db_a.rebuild_root_summaries(); + db_b.rebuild_root_summaries(); + + check_round_trip(db_a, db_b, 1); + check_round_trip(db_a, db_b, 2); + check_root_summaries(db_a, db_b); + } +} diff --git a/tests/utilities/reader/test_trace_reader.cpp b/tests/utilities/reader/test_trace_reader.cpp index e1a1ed0d..3eef79d3 100644 --- a/tests/utilities/reader/test_trace_reader.cpp +++ b/tests/utilities/reader/test_trace_reader.cpp @@ -70,6 +70,34 @@ static CoroTask> collect_lines( co_return lines; } +struct ParsedEvent { + std::string name; + std::string cat; + std::string ph; +}; + +static CoroTask> collect_json_events( + AsyncGenerator gen) { + std::vector events; + while (auto opt = co_await gen.next()) { + auto* p = opt->parser; + ParsedEvent ev; + if (auto v = p->get_string("name")) ev.name = std::string(*v); + if (auto v = p->get_string("cat")) ev.cat = std::string(*v); + if (auto v = p->get_string("ph")) ev.ph = std::string(*v); + events.push_back(std::move(ev)); + } + co_return events; +} + +static CoroTask count_json_lines(AsyncGenerator gen) { + std::size_t n = 0; + while (auto opt = co_await 
gen.next()) { + ++n; + } + co_return n; +} + } // namespace TEST_SUITE("TraceReader") { @@ -510,8 +538,8 @@ TEST_SUITE("TraceReader") { IndexBuilderUtility builder; auto build_result = builder .process(IndexBuildConfig::for_file(gz) - .with_bloom(true) - .with_index_threshold(0)) + + ) .get(); REQUIRE(build_result.success); @@ -559,8 +587,8 @@ TEST_SUITE("TraceReader") { IndexBuilderUtility builder; auto build_result = builder .process(IndexBuildConfig::for_file(gz) - .with_bloom(true) - .with_index_threshold(0)) + + ) .get(); REQUIRE(build_result.success); @@ -582,4 +610,247 @@ TEST_SUITE("TraceReader") { auto posix_bytes = count_raw_bytes(reader.read_raw(rc_posix)).get(); CHECK(posix_bytes == all_bytes); } + + TEST_CASE("Chunk pruning skips non-matching checkpoints") { + TestEnvironment env(100); + std::string pfw = env.get_dir() + "/multi_ckpt.pfw"; + constexpr int POSIX_BEFORE = 100; + constexpr int COMPUTE_COUNT = 5; + constexpr int POSIX_AFTER = 100; + constexpr int TOTAL = POSIX_BEFORE + COMPUTE_COUNT + POSIX_AFTER; + // Each line is ~550 bytes (padded args). 205 events * 550 = ~112KB. + // With 32KB checkpoint window -> 3-4 checkpoints. + // COMPUTE events cluster in one checkpoint in the middle. 
+ std::string pad(400, 'x'); + { + std::ofstream out(pfw); + for (int i = 0; i < POSIX_BEFORE; ++i) { + out << R"({"ph":"X","name":"read","cat":"POSIX","pid":1,"tid":1,"ts":)" + << (1000 + i) << R"(,"dur":10,"args":{"pad":")" << pad + << R"("}})" << "\n"; + } + for (int i = 0; i < COMPUTE_COUNT; ++i) { + out << R"({"ph":"X","name":"train","cat":"COMPUTE","pid":2,"tid":2,"ts":)" + << (100000 + i) << R"(,"dur":500,"args":{"pad":")" << pad + << R"("}})" << "\n"; + } + for (int i = 0; i < POSIX_AFTER; ++i) { + out << R"({"ph":"X","name":"write","cat":"POSIX","pid":1,"tid":1,"ts":)" + << (200000 + i) << R"(,"dur":10,"args":{"pad":")" << pad + << R"("}})" << "\n"; + } + } + std::string gz = pfw + ".gz"; + REQUIRE(dft_utils_test::compress_file_to_gzip(pfw, gz)); + fs::remove(pfw); + + using dftracer::utils::utilities::indexer::IndexBuildConfig; + using dftracer::utils::utilities::indexer::IndexBuilderUtility; + IndexBuilderUtility builder; + auto build_result = builder + .process(IndexBuildConfig::for_file(gz) + .with_checkpoint_size(32 * 1024) + .with_manifest(true)) + .get(); + REQUIRE(build_result.success); + + TraceReader reader({.file_path = gz, .checkpoint_size = 32 * 1024}); + REQUIRE(reader.has_index()); + + auto all = count_lines(reader.read_lines()).get(); + REQUIRE(all == TOTAL); + + // Selective query: only COMPUTE events (5 out of 205) + ReadConfig rc_compute; + rc_compute.query = R"(cat == "COMPUTE")"; + auto compute_lines = collect_lines(reader.read_lines(rc_compute)).get(); + CHECK(compute_lines.size() == COMPUTE_COUNT); + for (const auto& line : compute_lines) { + CHECK(line.find("\"cat\":\"COMPUTE\"") != std::string::npos); + } + + // Full category query should still return all POSIX + ReadConfig rc_posix; + rc_posix.query = R"(cat == "POSIX")"; + auto posix_lines = collect_lines(reader.read_lines(rc_posix)).get(); + CHECK(posix_lines.size() == POSIX_BEFORE + POSIX_AFTER); + + // No match + ReadConfig rc_none; + rc_none.query = R"(cat == "NONEXISTENT")"; 
+ auto none_lines = count_lines(reader.read_lines(rc_none)).get(); + CHECK(none_lines == 0); + } +} + +TEST_SUITE("TraceReader::read_json") { + TEST_CASE("read_json returns parsed events") { + TestEnvironment env(100); + std::string gz_file = env.create_dft_test_gzip_file(100); + TraceReader reader({.file_path = gz_file}); + + auto events = collect_json_events(reader.read_json()).get(); + CHECK(events.size() > 0); + for (const auto& ev : events) { + CHECK_FALSE(ev.ph.empty()); + } + } + + TEST_CASE("read_json count matches read_lines count") { + TestEnvironment env(100); + std::string gz_file = env.create_dft_test_gzip_file(100); + TraceReader reader({.file_path = gz_file}); + + auto line_count = count_lines(reader.read_lines()).get(); + auto json_count = count_json_lines(reader.read_json()).get(); + CHECK(json_count <= line_count); + CHECK(json_count > 0); + } + + TEST_CASE("read_json query filters events") { + TestEnvironment env(100); + std::string gz_file = env.create_dft_test_gzip_file(100); + TraceReader reader({.file_path = gz_file}); + + auto all = count_json_lines(reader.read_json()).get(); + REQUIRE(all > 0); + + ReadConfig rc; + rc.query = R"(cat == "POSIX")"; + auto events = collect_json_events(reader.read_json(rc)).get(); + CHECK(events.size() > 0); + CHECK(events.size() <= all); + for (const auto& ev : events) { + CHECK(ev.cat == "POSIX"); + } + } + + TEST_CASE("read_json query with no matches returns zero") { + TestEnvironment env(100); + std::string gz_file = env.create_dft_test_gzip_file(100); + TraceReader reader({.file_path = gz_file}); + + ReadConfig rc; + rc.query = R"(cat == "NONEXISTENT")"; + auto n = count_json_lines(reader.read_json(rc)).get(); + CHECK(n == 0); + } + + TEST_CASE("read_json with AND query") { + TestEnvironment env(100); + std::string gz_file = env.create_dft_test_gzip_file(100); + TraceReader reader({.file_path = gz_file}); + + ReadConfig rc; + rc.query = R"(cat == "POSIX" and name == "read")"; + auto events = 
collect_json_events(reader.read_json(rc)).get(); + CHECK(events.size() > 0); + for (const auto& ev : events) { + CHECK(ev.cat == "POSIX"); + CHECK(ev.name == "read"); + } + } + + TEST_CASE("read_json matches read_lines query count") { + TestEnvironment env(100); + std::string pfw = env.get_dir() + "/json_vs_lines.pfw"; + { + std::ofstream out(pfw); + for (int i = 0; i < 100; ++i) { + out << R"({"ph":"X","name":"read","cat":"POSIX","pid":1,"tid":1,"ts":)" + << (1000 + i) << R"(,"dur":10,"args":{}})" << "\n"; + } + for (int i = 0; i < 50; ++i) { + out << R"({"ph":"X","name":"train","cat":"COMPUTE","pid":2,"tid":2,"ts":)" + << (100000 + i) << R"(,"dur":500,"args":{}})" << "\n"; + } + } + + TraceReader reader({.file_path = pfw}); + + ReadConfig rc; + rc.query = R"(cat == "POSIX")"; + auto line_count = count_lines(reader.read_lines(rc)).get(); + auto json_count = count_json_lines(reader.read_json(rc)).get(); + CHECK(line_count == json_count); + CHECK(json_count == 100); + + fs::remove(pfw); + } + + TEST_CASE("read_json works with index and chunk pruning") { + TestEnvironment env(100); + std::string pfw = env.get_dir() + "/json_indexed.pfw"; + std::string pad(400, 'x'); + { + std::ofstream out(pfw); + for (int i = 0; i < 100; ++i) { + out << R"({"ph":"X","name":"read","cat":"POSIX","pid":1,"tid":1,"ts":)" + << (1000 + i) << R"(,"dur":10,"args":{"pad":")" << pad + << R"("}})" << "\n"; + } + for (int i = 0; i < 5; ++i) { + out << R"({"ph":"X","name":"train","cat":"COMPUTE","pid":2,"tid":2,"ts":)" + << (100000 + i) << R"(,"dur":500,"args":{"pad":")" << pad + << R"("}})" << "\n"; + } + for (int i = 0; i < 100; ++i) { + out << R"({"ph":"X","name":"write","cat":"POSIX","pid":1,"tid":1,"ts":)" + << (200000 + i) << R"(,"dur":10,"args":{"pad":")" << pad + << R"("}})" << "\n"; + } + } + std::string gz = pfw + ".gz"; + REQUIRE(dft_utils_test::compress_file_to_gzip(pfw, gz)); + fs::remove(pfw); + + using dftracer::utils::utilities::indexer::IndexBuildConfig; + using 
dftracer::utils::utilities::indexer::IndexBuilderUtility; + IndexBuilderUtility builder; + auto build_result = builder + .process(IndexBuildConfig::for_file(gz) + .with_checkpoint_size(32 * 1024) + .with_manifest(true)) + .get(); + REQUIRE(build_result.success); + + TraceReader reader({.file_path = gz, .checkpoint_size = 32 * 1024}); + REQUIRE(reader.has_index()); + + ReadConfig rc; + rc.query = R"(cat == "COMPUTE")"; + auto events = collect_json_events(reader.read_json(rc)).get(); + CHECK(events.size() == 5); + for (const auto& ev : events) { + CHECK(ev.cat == "COMPUTE"); + } + + ReadConfig rc_posix; + rc_posix.query = R"(cat == "POSIX")"; + auto posix_count = count_json_lines(reader.read_json(rc_posix)).get(); + CHECK(posix_count == 200); + + ReadConfig rc_none; + rc_none.query = R"(cat == "NONEXISTENT")"; + auto none_count = count_json_lines(reader.read_json(rc_none)).get(); + CHECK(none_count == 0); + } + + TEST_CASE("read_json parser fields are accessible") { + auto test_file = make_unique_test_path("json_parser_fields.pfw"); + { + std::ofstream out(test_file); + out << R"({"ph":"X","name":"read","cat":"POSIX","pid":42,"tid":7,"ts":1000,"dur":10,"args":{"ret":1}})" + << "\n"; + } + + TraceReader reader({.file_path = test_file.string()}); + auto events = collect_json_events(reader.read_json()).get(); + REQUIRE(events.size() == 1); + CHECK(events[0].name == "read"); + CHECK(events[0].cat == "POSIX"); + CHECK(events[0].ph == "X"); + + fs::remove(test_file); + } }