From 3fdd0834d599529f5c764679d2102cbda478a566 Mon Sep 17 00:00:00 2001 From: Ray Andrew Date: Mon, 6 Apr 2026 00:31:20 -0500 Subject: [PATCH] feat(rocksdb): migrate SQLite indexing to RocksDB Replace SQLite-backed indexing and provenance storage with RocksDB-backed stores. Key changes: - add RocksDB async/database/db-manager/filesystem/key-codec layers - migrate index and provenance databases from SQLite to RocksDB - update index builder, trace reader, reorganize, view, stats, and comparator paths for RocksDB - harden transaction atomicity and rollback behavior with TransactionScope - add iterator status checking for prefix scans - harden gzip/tar indexer cache state and metadata handling - capture executor context in RocksDB awaitables - clean up failed RocksDB open paths and manager lifecycle behavior - vendor CPM 0.42.1 and update CI/build integration - refresh docs, Python bindings, and C++/Python test coverage for the new backend Validation: - full test suite passed - Ubuntu 22.04 Docker run passed - focused RocksDB/indexer regression tests passed. 
--- .github/workflows/ci.yml | 44 +- .github/workflows/format-check.yaml | 10 +- .github/workflows/python-publish.yaml | 14 +- .readthedocs.yaml | 8 - CMakeLists.txt | 12 + Makefile | 11 +- cmake/modules/CPM.cmake | 26 +- cmake/modules/Dependencies.cmake | 510 +++-- cmake/modules/InstallHelpers.cmake | 36 - cmake/modules/LibraryHelpers.cmake | 4 + cmake/vendor/CPM_0.42.1.cmake | 1363 ++++++++++++++ docs/Makefile | 2 + docs/scripts/generate_api_index.py | 167 +- docs/source/_static/custom.css | 19 + docs/source/api/indexer.rst | 5 +- docs/source/api/trace_reader.rst | 5 +- docs/source/conf.py | 837 ++++++++- docs/source/cpp_api/coro.rst | 24 +- docs/source/cpp_api/index.rst | 9 +- docs/source/cpp_api/pipeline/executors.rst | 7 +- docs/source/cpp_api/rocksdb.rst | 35 + docs/source/cpp_api/sqlite.rst | 337 ---- docs/source/installation.rst | 5 +- docs/source/quickstart.rst | 2 +- docs/source/utilities/indexer.rst | 7 +- .../dftracer/utils/core/common/constants.h | 4 +- .../dftracer/utils/core/common/scoped_fd.h | 44 + include/dftracer/utils/core/env.h | 35 + include/dftracer/utils/core/io/io_backend.h | 8 + .../dftracer/utils/core/pipeline/executor.h | 12 +- .../utils/core/pipeline/pipeline_config.h | 8 +- include/dftracer/utils/core/rocksdb/async.h | 130 ++ .../dftracer/utils/core/rocksdb/database.h | 82 + .../dftracer/utils/core/rocksdb/db_manager.h | 40 + .../dftracer/utils/core/rocksdb/filesystem.h | 19 + .../dftracer/utils/core/rocksdb/key_codec.h | 39 + include/dftracer/utils/core/runtime.h | 4 + include/dftracer/utils/core/sqlite/async.h | 112 -- include/dftracer/utils/core/sqlite/database.h | 36 - include/dftracer/utils/core/sqlite/error.h | 31 - .../dftracer/utils/core/sqlite/statement.h | 65 - include/dftracer/utils/core/sqlite/vfs.h | 44 - include/dftracer/utils/server/trace_index.h | 6 +- .../aggregators/chunk_aggregator_utility.h | 6 +- .../composites/dft/chunk_extractor_utility.h | 2 +- .../dft/comparator/comparison_config.h | 2 +- 
.../composites/dft/indexing/bloom_filter.h | 2 +- .../dft/indexing/bloom_filter_cache.h | 18 +- .../dft/indexing/chunk_dimension_stats.h | 2 +- .../dft/indexing/chunk_indexer_utility.h | 6 +- .../dft/indexing/chunk_pruner_utility.h | 2 +- .../dft/indexing/chunk_statistics.h | 3 +- .../composites/dft/internal/chunk_spec.h | 2 +- .../utilities/composites/dft/internal/utils.h | 21 +- .../dft/metadata_collector_utility.h | 15 +- .../dft/reorganize/provenance_tracker.h | 10 +- .../dft/reorganize/reorganization_planner.h | 2 +- .../statistics/chunk_detail_scanner_utility.h | 2 +- .../composites/dft/statistics/statistics.h | 3 +- .../statistics_aggregator_utility.h | 2 +- .../dft/statistics/trace_statistics.h | 2 +- .../dft/views/view_builder_utility.h | 6 +- .../dft/views/view_reader_utility.h | 4 +- .../composites/file_merger_utility.h | 5 +- .../composites/indexed_file_reader_utility.h | 43 +- .../composites/line_batch_processor_utility.h | 10 +- .../utils/utilities/composites/types.h | 13 +- .../utilities/fileio/lines/line_bytes_range.h | 2 +- .../utils/utilities/fileio/lines/line_types.h | 13 +- .../async_plain_file_bytes_generator.h | 14 +- .../sources/async_plain_file_line_generator.h | 9 +- .../async_streaming_gz_line_generator.h | 9 +- .../fileio/lines/streaming_line_reader.h | 47 +- .../utils/utilities/fileio/types/chunk_spec.h | 8 +- .../utilities/indexer/index_builder_utility.h | 2 +- .../utils/utilities/indexer/index_database.h | 71 +- .../utilities/indexer/internal/indexer.h | 4 +- .../indexer/internal/indexer_factory.h | 6 +- .../utilities/indexer/internal/scan_prefix.h | 38 + .../utilities/indexer/provenance_database.h | 44 +- .../utils/utilities/reader/internal/reader.h | 5 +- .../reader/internal/reader_factory.h | 2 +- .../utils/utilities/reader/trace_reader.h | 6 +- python/dftracer/utils/dftracer_utils_ext.pyi | 35 +- setup.py | 1 - src/CMakeLists.txt | 80 +- .../utils/binaries/dftracer_aggregator.cpp | 22 +- .../utils/binaries/dftracer_comparator.cpp | 
19 +- .../utils/binaries/dftracer_event_count.cpp | 9 +- .../binaries/dftracer_gen_fake_trace.cpp | 6 +- .../utils/binaries/dftracer_index.cpp | 7 +- src/dftracer/utils/binaries/dftracer_info.cpp | 23 +- .../utils/binaries/dftracer_organize.cpp | 14 +- .../utils/binaries/dftracer_reader.cpp | 55 +- .../utils/binaries/dftracer_reconstruct.cpp | 6 +- .../utils/binaries/dftracer_server.cpp | 4 +- .../utils/binaries/dftracer_split.cpp | 22 +- .../utils/binaries/dftracer_stats.cpp | 41 +- src/dftracer/utils/binaries/dftracer_tar.cpp | 10 +- src/dftracer/utils/binaries/dftracer_view.cpp | 29 +- src/dftracer/utils/core/env.cpp | 57 + .../core/io/epoll_thread_pool_backend.cpp | 25 +- .../utils/core/io/epoll_thread_pool_backend.h | 3 + .../utils/core/io/io_backend_sync.cpp | 4 +- .../utils/core/io/io_uring_backend.cpp | 66 +- src/dftracer/utils/core/io/io_uring_backend.h | 7 + .../core/io/kqueue_thread_pool_backend.cpp | 25 +- .../core/io/kqueue_thread_pool_backend.h | 3 + .../utils/core/io/thread_pool_backend.cpp | 26 +- .../utils/core/io/thread_pool_backend.h | 5 + src/dftracer/utils/core/pipeline/executor.cpp | 23 +- src/dftracer/utils/core/pipeline/pipeline.cpp | 2 +- src/dftracer/utils/core/rocksdb/async.cpp | 32 + src/dftracer/utils/core/rocksdb/database.cpp | 275 +++ .../utils/core/rocksdb/db_manager.cpp | 143 ++ .../utils/core/rocksdb/filesystem.cpp | 849 +++++++++ src/dftracer/utils/core/rocksdb/key_codec.cpp | 88 + src/dftracer/utils/core/runtime.cpp | 4 + src/dftracer/utils/core/sqlite/async.cpp | 32 - src/dftracer/utils/core/sqlite/database.cpp | 85 - src/dftracer/utils/core/sqlite/error.cpp | 25 - src/dftracer/utils/core/sqlite/statement.cpp | 175 -- src/dftracer/utils/core/sqlite/vfs.cpp | 620 ------- src/dftracer/utils/python/indexer.cpp | 95 +- src/dftracer/utils/python/indexer.h | 2 +- src/dftracer/utils/python/trace_reader.cpp | 31 +- .../utils/python/trace_reader_iterator.cpp | 24 +- .../utils/python/trace_reader_iterator.h | 3 + 
.../utils/python/utilities/aggregator.cpp | 4 +- .../utils/python/utilities/comparator.cpp | 20 +- .../python/utilities/metadata_collector.cpp | 11 +- .../utilities/reconstruction_planner.cpp | 4 +- .../utilities/reorganization_planner.cpp | 6 +- .../utilities/statistics_aggregator.cpp | 6 +- .../python/utilities/statistics_query.cpp | 6 +- src/dftracer/utils/server/trace_api.cpp | 11 +- src/dftracer/utils/server/trace_index.cpp | 63 +- src/dftracer/utils/server/viz_api.cpp | 16 +- .../call_tree/call_tree_internal.cpp | 11 +- .../utilities/call_tree/call_tree_mpi.cpp | 17 +- .../dft/aggregators/aggregator_utility.cpp | 13 +- .../aggregators/chunk_aggregator_utility.cpp | 4 +- .../dft/aggregators/chunk_mapper_utility.cpp | 4 +- .../dft/chunk_extractor_utility.cpp | 6 +- .../dft/chunk_manifest_mapper_utility.cpp | 2 +- .../dft/event_collector_utility.cpp | 4 +- .../dft/indexing/chunk_indexer_utility.cpp | 2 +- .../dft/indexing/chunk_pruner_utility.cpp | 30 +- .../dft/indexing/chunk_statistics.cpp | 12 +- .../queries/delete_chunk_bloom_filters.cpp | 29 - .../queries/delete_chunk_dimension_stats.cpp | 23 - .../queries/delete_chunk_statistics.cpp | 23 - .../indexing/queries/delete_event_ranges.cpp | 24 - .../queries/delete_file_bloom_filter.cpp | 29 - .../queries/delete_hash_resolutions.cpp | 23 - .../queries/delete_metadata_lines.cpp | 24 - .../queries/insert_chunk_bloom_filter.cpp | 75 - .../queries/insert_chunk_dimension_stats.cpp | 84 - .../queries/insert_chunk_statistics.cpp | 86 - .../indexing/queries/insert_event_range.cpp | 65 - .../queries/insert_file_bloom_filter.cpp | 42 - .../queries/insert_hash_resolution.cpp | 58 - .../queries/insert_index_dimension.cpp | 29 - .../queries/insert_metadata_lines.cpp | 47 - .../indexing/queries/insert_provenance.cpp | 95 - .../dft/indexing/queries/manifest_queries.h | 103 +- .../composites/dft/indexing/queries/queries.h | 130 -- .../queries/query_chunk_bloom_filters.cpp | 45 - .../query_chunk_bloom_filters_batch.cpp | 58 - 
.../queries/query_chunk_dimension_stats.cpp | 119 -- .../queries/query_chunk_statistics.cpp | 142 -- .../indexing/queries/query_event_ranges.cpp | 72 - .../queries/query_file_bloom_filter.cpp | 40 - .../query_file_bloom_filters_batch.cpp | 55 - .../queries/query_hash_by_resolved.cpp | 32 - .../queries/query_index_dimensions.cpp | 41 - .../indexing/queries/query_metadata_lines.cpp | 66 - .../dft/indexing/queries/query_provenance.cpp | 117 -- .../queries/query_resolved_by_hash.cpp | 32 - .../indexing/queries/query_time_bounds.cpp | 36 - .../composites/dft/internal/utils.cpp | 20 +- .../dft/metadata_collector_utility.cpp | 36 +- .../dft/reorganize/event_router.cpp | 12 +- .../dft/reorganize/provenance_tracker.cpp | 70 +- .../dft/reorganize/reconstruction_planner.cpp | 13 +- .../dft/reorganize/reorganization_planner.cpp | 126 +- .../chunk_detail_scanner_utility.cpp | 2 +- .../statistics_aggregator_utility.cpp | 27 +- .../dft/statistics/trace_statistics.cpp | 2 +- .../dft/views/view_builder_utility.cpp | 20 +- .../dft/views/view_reader_utility.cpp | 6 +- .../composites/file_merger_utility.cpp | 8 +- .../indexer/index_builder_utility.cpp | 69 +- .../utilities/indexer/index_database.cpp | 1645 +++++++++++++---- .../indexer/internal/checkpoint_size.h | 2 +- .../indexer/internal/gzip/gzip_indexer.cpp | 434 +++-- .../indexer/internal/gzip/gzip_indexer.h | 14 +- .../gzip/queries/delete_file_record.cpp | 37 - .../gzip/queries/insert_checkpoint_record.cpp | 36 - .../queries/insert_file_metadata_record.cpp | 33 - .../gzip/queries/insert_file_record.cpp | 35 - .../indexer/internal/gzip/queries/queries.h | 66 - .../gzip/queries/query_checkpoint.cpp | 76 - .../gzip/queries/query_checkpoint_size.cpp | 19 - .../gzip/queries/query_checkpoints.cpp | 105 -- .../internal/gzip/queries/query_file_id.cpp | 19 - .../internal/gzip/queries/query_max_bytes.cpp | 37 - .../internal/gzip/queries/query_num_lines.cpp | 21 - .../gzip/queries/query_schema_validity.cpp | 19 - 
.../gzip/queries/query_stored_file_info.cpp | 29 - .../utilities/indexer/internal/helpers.cpp | 21 +- .../utilities/indexer/internal/helpers.h | 3 +- .../utilities/indexer/internal/indexer_c.cpp | 6 +- .../indexer/internal/indexer_factory.cpp | 20 +- .../indexer/internal/sqlite/database.h | 13 - .../indexer/internal/sqlite/statement.h | 13 - .../insert_archive_metadata_record.cpp | 32 - .../tar/queries/insert_archive_record.cpp | 35 - .../tar/queries/insert_file_record.cpp | 35 - .../queries/insert_tar_checkpoint_record.cpp | 37 - .../tar/queries/insert_tar_file_record.cpp | 29 - .../indexer/internal/tar/queries/queries.h | 108 -- .../internal/tar/queries/query_archive_id.cpp | 28 - .../internal/tar/queries/query_metadata.cpp | 143 -- .../tar/queries/query_tar_checkpoints.cpp | 165 -- .../internal/tar/queries/query_tar_files.cpp | 119 -- .../indexer/internal/tar/tar_indexer.cpp | 675 ++++--- .../indexer/internal/tar/tar_indexer.h | 26 +- .../indexer/internal/transaction_scope.h | 39 + .../utilities/indexer/provenance_database.cpp | 496 +++-- .../indexer/visitors/bloom_visitor.cpp | 34 +- .../indexer/visitors/manifest_visitor.cpp | 32 +- .../utilities/reader/internal/gzip_reader.cpp | 16 +- .../utilities/reader/internal/gzip_reader.h | 6 +- .../utilities/reader/internal/reader_c.cpp | 10 +- .../reader/internal/reader_factory.cpp | 6 +- .../utilities/reader/internal/tar_reader.cpp | 36 +- .../utilities/reader/internal/tar_reader.h | 6 +- .../utils/utilities/reader/trace_reader.cpp | 20 +- .../utils/utilities/replay/replay.cpp | 4 +- tests/CMakeLists.txt | 3 + tests/binaries/test_dftracer_index.cpp | 44 +- tests/binaries/test_dftracer_info.cpp | 10 + tests/binaries/test_dftracer_organize.cpp | 26 +- tests/binaries/test_dftracer_server.cpp | 35 +- tests/binaries/test_dftracer_tar.cpp | 26 +- tests/python/common.py | 36 +- tests/python/test_dask.py | 89 +- tests/python/test_indexer.py | 139 +- tests/python/test_reorganization_planner.py | 16 +- 
tests/python/test_statistics_aggregator.py | 28 +- tests/python/test_statistics_query.py | 32 +- tests/python/test_trace_reader.py | 22 +- tests/python/test_trace_reader_arrow.py | 52 +- tests/reader/test_basic_factory.cpp | 12 +- tests/reader/test_reader.c | 4 +- tests/reader/test_reader.cpp | 4 +- tests/reader/test_reader_formats.cpp | 3 +- tests/reader/test_reader_stream.cpp | 2 +- .../reader/test_reader_tar_comprehensive.cpp | 8 +- tests/testing_utilities.cpp | 11 +- tests/testing_utilities.h | 2 +- tests/utilities/CMakeLists.txt | 2 + .../dft/indexing/test_bloom_query.cpp | 101 +- .../dft/indexing/test_chunk_indexer.cpp | 6 +- .../dft/indexing/test_chunk_pruner.cpp | 94 +- .../indexing/test_manifest_index_builder.cpp | 8 +- .../dft/indexing/test_manifest_indexer.cpp | 6 +- .../dft/indexing/test_manifest_queries.cpp | 66 +- .../test_reconstruct_integration.cpp | 206 ++- .../test_reconstruction_planner.cpp | 50 +- .../test_reorganization_planner.cpp | 26 +- .../test_reorganize_integration.cpp | 22 +- .../statistics/test_statistics_aggregator.cpp | 53 +- .../dft/statistics/test_statistics_query.cpp | 2 +- .../dft/statistics/test_trace_statistics.cpp | 4 +- .../composites/dft/test_index_builder.cpp | 10 +- .../dft/test_metadata_collector.cpp | 19 +- .../dft/views/test_view_builder.cpp | 88 +- .../composites/dft/views/test_view_reader.cpp | 11 +- .../composites/test_indexed_file_reader.cpp | 91 +- .../composites/test_line_batch_processor.cpp | 12 +- .../lines/test_streaming_line_reader.cpp | 94 +- .../utilities/indexer/test_index_builder.cpp | 22 +- .../utilities/indexer/test_index_database.cpp | 648 +------ .../indexer/test_provenance_database.cpp | 244 +-- .../indexer/test_rocksdb_storage.cpp | 224 +++ tests/utilities/indexer/test_scan_prefix.cpp | 123 ++ tests/utilities/reader/test_trace_reader.cpp | 34 +- 289 files changed, 9809 insertions(+), 8201 deletions(-) create mode 100644 cmake/vendor/CPM_0.42.1.cmake create mode 100644 docs/source/_static/custom.css 
create mode 100644 docs/source/cpp_api/rocksdb.rst delete mode 100644 docs/source/cpp_api/sqlite.rst create mode 100644 include/dftracer/utils/core/common/scoped_fd.h create mode 100644 include/dftracer/utils/core/env.h create mode 100644 include/dftracer/utils/core/rocksdb/async.h create mode 100644 include/dftracer/utils/core/rocksdb/database.h create mode 100644 include/dftracer/utils/core/rocksdb/db_manager.h create mode 100644 include/dftracer/utils/core/rocksdb/filesystem.h create mode 100644 include/dftracer/utils/core/rocksdb/key_codec.h delete mode 100644 include/dftracer/utils/core/sqlite/async.h delete mode 100644 include/dftracer/utils/core/sqlite/database.h delete mode 100644 include/dftracer/utils/core/sqlite/error.h delete mode 100644 include/dftracer/utils/core/sqlite/statement.h delete mode 100644 include/dftracer/utils/core/sqlite/vfs.h create mode 100644 include/dftracer/utils/utilities/indexer/internal/scan_prefix.h create mode 100644 src/dftracer/utils/core/env.cpp create mode 100644 src/dftracer/utils/core/rocksdb/async.cpp create mode 100644 src/dftracer/utils/core/rocksdb/database.cpp create mode 100644 src/dftracer/utils/core/rocksdb/db_manager.cpp create mode 100644 src/dftracer/utils/core/rocksdb/filesystem.cpp create mode 100644 src/dftracer/utils/core/rocksdb/key_codec.cpp delete mode 100644 src/dftracer/utils/core/sqlite/async.cpp delete mode 100644 src/dftracer/utils/core/sqlite/database.cpp delete mode 100644 src/dftracer/utils/core/sqlite/error.cpp delete mode 100644 src/dftracer/utils/core/sqlite/statement.cpp delete mode 100644 src/dftracer/utils/core/sqlite/vfs.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_chunk_bloom_filters.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_chunk_dimension_stats.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_chunk_statistics.cpp delete mode 100644 
src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_event_ranges.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_file_bloom_filter.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_hash_resolutions.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_metadata_lines.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_chunk_bloom_filter.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_chunk_dimension_stats.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_chunk_statistics.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_event_range.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_file_bloom_filter.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_hash_resolution.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_index_dimension.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_metadata_lines.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_provenance.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_bloom_filters.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_bloom_filters_batch.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_dimension_stats.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_statistics.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/query_event_ranges.cpp delete mode 100644 
src/dftracer/utils/utilities/composites/dft/indexing/queries/query_file_bloom_filter.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/query_file_bloom_filters_batch.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/query_hash_by_resolved.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/query_index_dimensions.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/query_metadata_lines.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/query_provenance.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/query_resolved_by_hash.cpp delete mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/queries/query_time_bounds.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/gzip/queries/delete_file_record.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/gzip/queries/insert_checkpoint_record.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/gzip/queries/insert_file_metadata_record.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/gzip/queries/insert_file_record.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/gzip/queries/queries.h delete mode 100644 src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_checkpoint.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_checkpoint_size.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_checkpoints.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_file_id.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_max_bytes.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_num_lines.cpp delete mode 100644 
src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_schema_validity.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_stored_file_info.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/sqlite/database.h delete mode 100644 src/dftracer/utils/utilities/indexer/internal/sqlite/statement.h delete mode 100644 src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_archive_metadata_record.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_archive_record.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_file_record.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_tar_checkpoint_record.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_tar_file_record.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/tar/queries/queries.h delete mode 100644 src/dftracer/utils/utilities/indexer/internal/tar/queries/query_archive_id.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/tar/queries/query_metadata.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/tar/queries/query_tar_checkpoints.cpp delete mode 100644 src/dftracer/utils/utilities/indexer/internal/tar/queries/query_tar_files.cpp create mode 100644 src/dftracer/utils/utilities/indexer/internal/transaction_scope.h create mode 100644 tests/utilities/indexer/test_rocksdb_storage.cpp create mode 100644 tests/utilities/indexer/test_scan_prefix.cpp diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 41c000e6..4ef4b04e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,8 +13,8 @@ jobs: outputs: code: ${{ steps.filter.outputs.code }} steps: - - uses: actions/checkout@v4 - - uses: dorny/paths-filter@v3 + - uses: actions/checkout@v6 + - uses: dorny/paths-filter@v3.0.2 id: filter with: filters: | @@ -44,24 +44,33 @@ 
jobs: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v6.1.0 with: python-version: ${{ matrix.python-version }} + + - name: Cache ccache + uses: actions/cache@v5 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('CMakeLists.txt', 'pyproject.toml', '.github/workflows/ci.yml') }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.python-version }}- + ccache-${{ runner.os }}- - name: Install dependencies (Ubuntu) if: runner.os == 'Linux' run: | sudo apt-get update - sudo apt-get install -y build-essential cmake lcov zlib1g-dev libsqlite3-dev pkg-config ninja-build + sudo apt-get install -y build-essential cmake ccache lcov zlib1g-dev libsqlite3-dev pkg-config ninja-build - name: Install dependencies (macOS) if: runner.os == 'macOS' run: | brew update - for f in cmake lcov zlib sqlite pkg-config ninja; do + for f in cmake ccache lcov zlib sqlite pkg-config ninja; do if brew list --versions "$f" >/dev/null; then echo "$f already installed" else @@ -80,24 +89,17 @@ jobs: make test - name: Run Python tests (with venv) + if: "!((matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12')" run: | - make test-py - - - name: Run Python tests (without venv) - run: | - pip install --upgrade pip setuptools wheel - pip install -e ".[dev]" - pytest tests/python -v - - - name: Type check (ty) - if: matrix.python-version == '3.12' && runner.os == 'Linux' - run: | - pip install ty - ty check --python "$(which python)" python/ + if [ "${{ runner.os }}" = "Linux" ] && [ "${{ matrix.python-version }}" = "3.12" ]; then + make test-py RUN_TY=1 + else + make test-py + fi - name: Upload coverage reports to Coveralls if: (matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12' - 
uses: coverallsapp/github-action@v2 + uses: coverallsapp/github-action@v2.3.6 continue-on-error: true with: file: coverage/coverage_filtered.info @@ -113,7 +115,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Coveralls finished - uses: coverallsapp/github-action@v2 + uses: coverallsapp/github-action@v2.3.6 continue-on-error: true with: parallel-finished: true diff --git a/.github/workflows/format-check.yaml b/.github/workflows/format-check.yaml index 5e2c4a64..1e93e8d8 100644 --- a/.github/workflows/format-check.yaml +++ b/.github/workflows/format-check.yaml @@ -13,8 +13,8 @@ jobs: cpp: ${{ steps.filter.outputs.cpp }} python: ${{ steps.filter.outputs.python }} steps: - - uses: actions/checkout@v4 - - uses: dorny/paths-filter@v3 + - uses: actions/checkout@v6 + - uses: dorny/paths-filter@v3.0.2 id: filter with: filters: | @@ -35,7 +35,7 @@ jobs: runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Install clang-format run: | @@ -54,10 +54,10 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v8.0.0 - name: Ruff check run: uvx ruff check python/ tests/python/ diff --git a/.github/workflows/python-publish.yaml b/.github/workflows/python-publish.yaml index 93bcd90b..59e2388e 100644 --- a/.github/workflows/python-publish.yaml +++ b/.github/workflows/python-publish.yaml @@ -24,15 +24,15 @@ jobs: os: [ubuntu-22.04, macos-14] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v6 - name: Build wheels - uses: pypa/cibuildwheel@v3.2.1 + uses: pypa/cibuildwheel@v3.3.0 env: CIBW_BUILD: cp38-* cp39-* cp310-* cp311-* cp312-* cp313-* cp314-* CIBW_SKIP: "*-win32 *-manylinux_i686 *-musllinux_* *-manylinux_aarch64 *-manylinux_ppc64le *-manylinux_s390x" - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v6.0.0 with: name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} path: 
./wheelhouse/*.whl @@ -41,12 +41,12 @@ jobs: name: Build source distribution runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6.1.0 with: python-version: '3.12' @@ -58,7 +58,7 @@ jobs: - name: Build sdist run: python -m build --sdist - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v6.0.0 with: name: cibw-sdist path: dist/*.tar.gz @@ -70,7 +70,7 @@ jobs: if: github.event_name == 'release' && github.event.action == 'published' steps: - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v8 with: pattern: cibw-* path: dist diff --git a/.readthedocs.yaml b/.readthedocs.yaml index ac5df96f..d1b42af6 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,14 +10,8 @@ build: python: "3.11" apt_packages: - doxygen - - cmake - - build-essential - - zlib1g-dev - - libsqlite3-dev - - pkg-config jobs: pre_build: - # Run Doxygen to generate C++ API documentation - cd docs && doxygen Doxyfile # Build documentation in the "docs/" directory with Sphinx @@ -35,5 +29,3 @@ formats: python: install: - requirements: docs/requirements.txt - - method: pip - path: . 
diff --git a/CMakeLists.txt b/CMakeLists.txt index f8fb7e3d..238c0f41 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,6 +26,18 @@ project( VERSION ${DFTRACER_UTILS_VERSION} LANGUAGES C CXX) +find_program(CCACHE_EXECUTABLE ccache) +if(CCACHE_EXECUTABLE) + foreach(lang C CXX ASM) + if(NOT CMAKE_${lang}_COMPILER_LAUNCHER) + set(CMAKE_${lang}_COMPILER_LAUNCHER + "${CCACHE_EXECUTABLE}" + CACHE STRING "Compiler launcher for ${lang}" FORCE) + endif() + endforeach() + message(STATUS "Using ccache: ${CCACHE_EXECUTABLE}") +endif() + set(DFTRACER_UTILS_PACKAGE ${PROJECT_NAME}) set(DFTRACER_UTILS_PACKAGE_NAME ${PROJECT_NAME}) set(DFTRACER_UTILS_PACKAGE_VERSION "${PROJECT_VERSION}") diff --git a/Makefile b/Makefile index 4e3504f5..e9c85b81 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,7 @@ .PHONY: coverage coverage-clean coverage-view coverage-open test test-coverage test-py build clean format check-format cmake-format lint typecheck help +RUN_TY ?= 0 + # Detect build system BUILD_GENERATOR := $(shell command -v ninja >/dev/null 2>&1 && echo "Ninja" || echo "Unix Makefiles") BUILD_TOOL := $(shell command -v ninja >/dev/null 2>&1 && echo "ninja" || echo "make") @@ -65,8 +67,15 @@ test-py: @rm -rf .venv_test_py @python3 -m venv .venv_test_py @.venv_test_py/bin/pip install --upgrade pip setuptools wheel - @.venv_test_py/bin/pip install -e .[dev] + @if [ "$(RUN_TY)" = "1" ]; then \ + .venv_test_py/bin/pip install -e .[dev] ty; \ + else \ + .venv_test_py/bin/pip install -e .[dev]; \ + fi @.venv_test_py/bin/pytest tests/python -v + @if [ "$(RUN_TY)" = "1" ]; then \ + .venv_test_py/bin/ty check --python "$$(pwd)/.venv_test_py/bin/python" python/; \ + fi @rm -rf .venv_test_py @echo "Python tests completed successfully!" 
diff --git a/cmake/modules/CPM.cmake b/cmake/modules/CPM.cmake index e88873c8..97442324 100644 --- a/cmake/modules/CPM.cmake +++ b/cmake/modules/CPM.cmake @@ -2,9 +2,11 @@ # # SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors -set(CPM_DOWNLOAD_VERSION 0.42.0) +set(CPM_DOWNLOAD_VERSION 0.42.1) set(CPM_HASH_SUM - "2020b4fc42dba44817983e06342e682ecfc3d2f484a581f11cc5731fbe4dce8a") + "f3a6dcc6a04ce9e7f51a127307fa4f699fb2bade357a8eb4c5b45df76e1dc6a5") +set(CPM_VENDORED_LOCATION + "${CMAKE_CURRENT_LIST_DIR}/../vendor/CPM_${CPM_DOWNLOAD_VERSION}.cmake") if(CPM_SOURCE_CACHE) set(CPM_DOWNLOAD_LOCATION @@ -21,10 +23,28 @@ endif() # (~) get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE) +if(EXISTS "${CPM_VENDORED_LOCATION}") + get_filename_component(CPM_DOWNLOAD_DIR "${CPM_DOWNLOAD_LOCATION}" DIRECTORY) + file(MAKE_DIRECTORY "${CPM_DOWNLOAD_DIR}") + file(COPY_FILE "${CPM_VENDORED_LOCATION}" "${CPM_DOWNLOAD_LOCATION}" ONLY_IF_DIFFERENT) + include("${CPM_DOWNLOAD_LOCATION}") + return() +endif() + file( DOWNLOAD https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake ${CPM_DOWNLOAD_LOCATION} - EXPECTED_HASH SHA256=${CPM_HASH_SUM}) + EXPECTED_HASH SHA256=${CPM_HASH_SUM} + STATUS CPM_DOWNLOAD_STATUS) + +list(GET CPM_DOWNLOAD_STATUS 0 CPM_DOWNLOAD_STATUS_CODE) +if(NOT CPM_DOWNLOAD_STATUS_CODE EQUAL 0) + list(GET CPM_DOWNLOAD_STATUS 1 CPM_DOWNLOAD_STATUS_MESSAGE) + message( + FATAL_ERROR + "Failed to download CPM.cmake v${CPM_DOWNLOAD_VERSION}: ${CPM_DOWNLOAD_STATUS_MESSAGE}. 
" + "Either restore network access or vendor the file at ${CPM_VENDORED_LOCATION}.") +endif() include(${CPM_DOWNLOAD_LOCATION}) diff --git a/cmake/modules/Dependencies.cmake b/cmake/modules/Dependencies.cmake index b99680eb..2c5ccbfc 100644 --- a/cmake/modules/Dependencies.cmake +++ b/cmake/modules/Dependencies.cmake @@ -430,216 +430,225 @@ function(need_yyjson) endif() endfunction() -# ============================================================================== -# Database Dependencies -# ============================================================================== - -function(need_sqlite3) - find_package(SQLite3 3.35 QUIET) - - if(SQLite3_FOUND) - message(STATUS "Found system SQLite3: ${SQLite3_LIBRARIES}") +# Function to find or build RocksDB +function(need_rocksdb) + find_package(RocksDB 10.10.1 QUIET CONFIG) + if(NOT RocksDB_FOUND) + find_package(rocksdb 10.10.1 QUIET CONFIG) + endif() + if(NOT RocksDB_FOUND AND rocksdb_FOUND) + set(RocksDB_FOUND TRUE) + endif() + if(NOT RocksDB_FOUND) + find_package(RocksDB 10.10.1 QUIET) + endif() - # Prefer the modern target name (SQLite3::SQLite3). - # Older CMake versions only provide SQLite::SQLite3 (now deprecated). 
- if(NOT TARGET SQLite3::SQLite3) - if(TARGET SQLite::SQLite3) - # Wrap the deprecated target - add_library(SQLite3::SQLite3 ALIAS SQLite::SQLite3) - else() - add_library(SQLite3::SQLite3 UNKNOWN IMPORTED) + if(RocksDB_FOUND) + message(STATUS "Found system RocksDB") + + if(NOT TARGET RocksDB::rocksdb) + if(TARGET rocksdb) + add_library(RocksDB::rocksdb ALIAS rocksdb) + elseif(TARGET rocksdb-shared) + add_library(RocksDB::rocksdb ALIAS rocksdb-shared) + elseif(TARGET RocksDB::RocksDB) + add_library(RocksDB::rocksdb ALIAS RocksDB::RocksDB) + elseif(DEFINED RocksDB_LIBRARY AND DEFINED RocksDB_INCLUDE_DIR) + add_library(RocksDB::rocksdb UNKNOWN IMPORTED) + set_target_properties( + RocksDB::rocksdb + PROPERTIES IMPORTED_LOCATION "${RocksDB_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${RocksDB_INCLUDE_DIR}") + elseif(DEFINED ROCKSDB_LIBRARIES AND DEFINED ROCKSDB_INCLUDE_DIRS) + # Library list only (no single file): use an INTERFACE import. + add_library(RocksDB::rocksdb INTERFACE IMPORTED) set_target_properties( - SQLite3::SQLite3 - PROPERTIES IMPORTED_LOCATION "${SQLite3_LIBRARIES}" - INTERFACE_INCLUDE_DIRECTORIES "${SQLite3_INCLUDE_DIRS}") + RocksDB::rocksdb + PROPERTIES INTERFACE_LINK_LIBRARIES "${ROCKSDB_LIBRARIES}" + INTERFACE_INCLUDE_DIRECTORIES "${ROCKSDB_INCLUDE_DIRS}") endif() endif() - # Set variables in parent scope so they persist outside the function - set(SQLite3_FOUND - ${SQLite3_FOUND} - PARENT_SCOPE) - set(SQLite3_LIBRARIES - ${SQLite3_LIBRARIES} - PARENT_SCOPE) - set(SQLite3_INCLUDE_DIRS - ${SQLite3_INCLUDE_DIRS} + if(NOT TARGET RocksDB::rocksdb) + message( + FATAL_ERROR + "need_rocksdb: RocksDB was found but no usable target could be created." 
+ ) + endif() + + set(RocksDB_FOUND + ${RocksDB_FOUND} PARENT_SCOPE) - set(SQLite3_CPM + set(RocksDB_CPM FALSE PARENT_SCOPE) else() - # Build with CPM - if(NOT SQLite3_ADDED) + if(NOT rocksdb_ADDED) cpmaddpackage( NAME - SQLite3 - URL - https://www.sqlite.org/2024/sqlite-amalgamation-3460100.zip + rocksdb + GITHUB_REPOSITORY + facebook/rocksdb VERSION - 3.46.1 - DOWNLOAD_ONLY + 10.10.1 + GIT_TAG + v10.10.1 + OPTIONS + "ROCKSDB_BUILD_SHARED ${DFTRACER_UTILS_BUILD_SHARED}" + "WITH_TESTS OFF" + "WITH_TOOLS OFF" + "WITH_CORE_TOOLS OFF" + "WITH_BENCHMARK_TOOLS OFF" + "WITH_GFLAGS OFF" + "WITH_SNAPPY OFF" + "WITH_LZ4 ON" + "WITH_ZLIB ON" + "WITH_ZSTD OFF" + "WITH_BZ2 OFF" + "USE_RTTI ON" + "FAIL_ON_WARNINGS OFF" + FORCE YES) endif() - if(SQLite3_ADDED) - message(STATUS "Built SQLite3 with CPM") - - set(SQLITE3_TARGETS) - - # Create sqlite3 library from amalgamation - if(DFTRACER_UTILS_BUILD_SHARED) - add_library(sqlite3_shared SHARED ${SQLite3_SOURCE_DIR}/sqlite3.c) - target_include_directories( - sqlite3_shared - PUBLIC $ - $) - - # Enable common SQLite features - target_compile_definitions( - sqlite3_shared PUBLIC SQLITE_ENABLE_FTS5 SQLITE_ENABLE_JSON1 - SQLITE_ENABLE_RTREE SQLITE_THREADSAFE=1) - - if(NOT WIN32) - target_link_libraries(sqlite3_shared PRIVATE pthread dl m) - endif() - - set_target_properties( - sqlite3_shared - PROPERTIES OUTPUT_NAME sqlite3 - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib - ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) - add_library(SQLite::SQLite3 ALIAS sqlite3_shared) - list(APPEND SQLITE3_TARGETS sqlite3_shared) - message(STATUS "Added SQLite3 shared library") + if(TARGET rocksdb AND NOT TARGET RocksDB::rocksdb_static) + add_library(RocksDB::rocksdb_static ALIAS rocksdb) + endif() + if(TARGET rocksdb-shared AND NOT TARGET RocksDB::rocksdb_shared) + add_library(RocksDB::rocksdb_shared ALIAS rocksdb-shared) + endif() + if(NOT TARGET RocksDB::rocksdb) + if(TARGET RocksDB::rocksdb_shared) + add_library(RocksDB::rocksdb ALIAS 
rocksdb-shared) + elseif(TARGET RocksDB::rocksdb_static) + add_library(RocksDB::rocksdb ALIAS rocksdb) endif() + endif() - if(DFTRACER_UTILS_BUILD_STATIC) - add_library(sqlite3_static STATIC ${SQLite3_SOURCE_DIR}/sqlite3.c) - target_include_directories( - sqlite3_static - PUBLIC $ - $) - - # Enable common SQLite features - target_compile_definitions( - sqlite3_static PUBLIC SQLITE_ENABLE_FTS5 SQLITE_ENABLE_JSON1 - SQLITE_ENABLE_RTREE SQLITE_THREADSAFE=1) + if(rocksdb_ADDED OR TARGET rocksdb OR TARGET rocksdb-shared) + message(STATUS "Built RocksDB with CPM") - if(NOT WIN32) - target_link_libraries(sqlite3_static PRIVATE pthread dl m) - endif() + set(ROCKSDB_LIBRARY_DIR "${CMAKE_BINARY_DIR}/lib") + if(TARGET rocksdb) set_target_properties( - sqlite3_static - PROPERTIES OUTPUT_NAME sqlite3 - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib - ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) - add_library(SQLite::SQLite3_static ALIAS sqlite3_static) - list(APPEND SQLITE3_TARGETS sqlite3_static) - message(STATUS "Added SQLite3 static library") - - # If only static is built, make it the default alias - if(NOT DFTRACER_UTILS_BUILD_SHARED) - add_library(SQLite::SQLite3 ALIAS sqlite3_static) + rocksdb + PROPERTIES POSITION_INDEPENDENT_CODE ON + ARCHIVE_OUTPUT_DIRECTORY "${ROCKSDB_LIBRARY_DIR}" + LIBRARY_OUTPUT_DIRECTORY "${ROCKSDB_LIBRARY_DIR}" + RUNTIME_OUTPUT_DIRECTORY "${ROCKSDB_LIBRARY_DIR}") + target_compile_definitions(rocksdb PUBLIC ROCKSDB_USE_RTTI) + if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang|AppleClang") + target_compile_options(rocksdb PRIVATE -frtti) + target_compile_options(rocksdb PUBLIC -Wno-conversion) endif() + install( + TARGETS rocksdb + EXPORT rocksdbTargets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() - - # Make sqlite3 installable - if(SQLITE3_TARGETS) + if(TARGET rocksdb-shared) + set_target_properties( + rocksdb-shared + PROPERTIES 
POSITION_INDEPENDENT_CODE ON + ARCHIVE_OUTPUT_DIRECTORY "${ROCKSDB_LIBRARY_DIR}" + LIBRARY_OUTPUT_DIRECTORY "${ROCKSDB_LIBRARY_DIR}" + RUNTIME_OUTPUT_DIRECTORY "${ROCKSDB_LIBRARY_DIR}") + target_compile_definitions(rocksdb-shared PUBLIC ROCKSDB_USE_RTTI) + if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang|AppleClang") + target_compile_options(rocksdb-shared PRIVATE -frtti) + target_compile_options(rocksdb-shared PUBLIC -Wno-conversion) + endif() install( - TARGETS ${SQLITE3_TARGETS} - EXPORT sqlite3Targets + TARGETS rocksdb-shared + EXPORT rocksdbTargets ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() - # Install sqlite3 header - install(FILES ${SQLite3_SOURCE_DIR}/sqlite3.h - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + list(APPEND DEPENDENCY_LIBRARY_DIRS "${ROCKSDB_LIBRARY_DIR}") + list(REMOVE_DUPLICATES DEPENDENCY_LIBRARY_DIRS) + set(DEPENDENCY_LIBRARY_DIRS + "${DEPENDENCY_LIBRARY_DIRS}" + PARENT_SCOPE) - # Install the export set - install( - EXPORT sqlite3Targets - FILE sqlite3Targets.cmake - NAMESPACE SQLite:: - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/sqlite3) + list(APPEND CMAKE_BUILD_RPATH "${ROCKSDB_LIBRARY_DIR}") + list(REMOVE_DUPLICATES CMAKE_BUILD_RPATH) + set(CMAKE_BUILD_RPATH + "${CMAKE_BUILD_RPATH}" + PARENT_SCOPE) - set(SQLite3_CPM + list(APPEND CMAKE_INSTALL_RPATH "${ROCKSDB_LIBRARY_DIR}") + list(REMOVE_DUPLICATES CMAKE_INSTALL_RPATH) + set(CMAKE_INSTALL_RPATH + "${CMAKE_INSTALL_RPATH}" + PARENT_SCOPE) + + set(RocksDB_FOUND + TRUE + PARENT_SCOPE) + set(RocksDB_CPM TRUE PARENT_SCOPE) endif() endif() endfunction() -# Function to link SQLite3 to a target Parameters: TARGET_NAME - name of the -# target to link SQLite3 to -function(link_sqlite3 TARGET_NAME LIBRARY_TYPE) - # Validate parameters +function(link_rocksdb TARGET_NAME LIBRARY_TYPE) if(NOT TARGET_NAME) - message(FATAL_ERROR "link_sqlite3: TARGET_NAME is required") + message(FATAL_ERROR "link_rocksdb: 
TARGET_NAME is required") endif() - if(NOT TARGET ${TARGET_NAME}) - message(FATAL_ERROR "link_sqlite3: Target '${TARGET_NAME}' does not exist") - endif() - - # Check if any SQLite3 variant is available - set(SQLITE3_AVAILABLE FALSE) - - # Check for CPM-built SQLite3 - if(TARGET sqlite3_shared OR TARGET sqlite3_static) - set(SQLITE3_AVAILABLE TRUE) + if(NOT LIBRARY_TYPE MATCHES "^(STATIC|SHARED)$") + message( + FATAL_ERROR "link_rocksdb: LIBRARY_TYPE must be either STATIC or SHARED") endif() - # Check for system SQLite3 - if(TARGET SQLite::SQLite3) - set(SQLITE3_AVAILABLE TRUE) + if(NOT TARGET ${TARGET_NAME}) + message(FATAL_ERROR "link_rocksdb: Target '${TARGET_NAME}' does not exist") endif() - if(NOT SQLITE3_AVAILABLE) + if(NOT TARGET RocksDB::rocksdb AND NOT TARGET RocksDB::rocksdb_static + AND NOT TARGET RocksDB::rocksdb_shared AND NOT TARGET rocksdb + AND NOT TARGET rocksdb-shared) message( FATAL_ERROR - "link_sqlite3: No SQLite3 found! Call need_sqlite3() first or ensure system SQLite3 is available." + "link_rocksdb: No RocksDB found! Call need_rocksdb() first or ensure system RocksDB is available." 
) endif() - # Link appropriate SQLite3 variant Use PUBLIC linkage since sqlite3.h is - # included in public headers if(LIBRARY_TYPE STREQUAL "STATIC") - # For static libraries, prefer static SQLite3 - if(TARGET sqlite3_static) - target_link_libraries(${TARGET_NAME} PUBLIC sqlite3_static) - message( - STATUS "Linked ${TARGET_NAME} to CPM-built sqlite3_static") - elseif(TARGET sqlite3_shared) - target_link_libraries(${TARGET_NAME} PUBLIC sqlite3_shared) - message( - STATUS "Linked ${TARGET_NAME} to CPM-built sqlite3_shared") - elseif(TARGET SQLite3::SQLite3) - target_link_libraries(${TARGET_NAME} PUBLIC SQLite3::SQLite3) - message(STATUS "Linked ${TARGET_NAME} to SQLite3::SQLite3") - elseif(TARGET SQLite::SQLite3) - target_link_libraries(${TARGET_NAME} PUBLIC SQLite::SQLite3) - message(STATUS "Linked ${TARGET_NAME} to SQLite::SQLite3") + if(TARGET RocksDB::rocksdb_static) + target_link_libraries(${TARGET_NAME} PUBLIC RocksDB::rocksdb_static) + message(STATUS "Linked ${TARGET_NAME} to RocksDB::rocksdb_static") + elseif(TARGET rocksdb) + target_link_libraries(${TARGET_NAME} PUBLIC rocksdb) + message(STATUS "Linked ${TARGET_NAME} to rocksdb") + elseif(TARGET RocksDB::rocksdb) + target_link_libraries(${TARGET_NAME} PUBLIC RocksDB::rocksdb) + message(STATUS "Linked ${TARGET_NAME} to RocksDB::rocksdb") + else() + message(FATAL_ERROR "Static RocksDB requested for ${TARGET_NAME}, but no static RocksDB target is available") endif() else() - # For shared libraries, prefer shared SQLite3 - if(TARGET sqlite3_shared) - target_link_libraries(${TARGET_NAME} PUBLIC sqlite3_shared) - message( - STATUS "Linked ${TARGET_NAME} to CPM-built sqlite3_shared") - elseif(TARGET sqlite3_static) - target_link_libraries(${TARGET_NAME} PUBLIC sqlite3_static) - message( - STATUS "Linked ${TARGET_NAME} to CPM-built sqlite3_static") - elseif(TARGET SQLite3::SQLite3) - target_link_libraries(${TARGET_NAME} PUBLIC SQLite3::SQLite3) - message(STATUS "Linked ${TARGET_NAME} to SQLite3::SQLite3") - 
elseif(TARGET SQLite::SQLite3) - target_link_libraries(${TARGET_NAME} PUBLIC SQLite::SQLite3) - message(STATUS "Linked ${TARGET_NAME} to SQLite::SQLite3") + if(TARGET RocksDB::rocksdb_shared) + target_link_libraries(${TARGET_NAME} PUBLIC RocksDB::rocksdb_shared) + message(STATUS "Linked ${TARGET_NAME} to RocksDB::rocksdb_shared") + elseif(TARGET rocksdb-shared) + target_link_libraries(${TARGET_NAME} PUBLIC rocksdb-shared) + message(STATUS "Linked ${TARGET_NAME} to rocksdb-shared") + elseif(TARGET RocksDB::rocksdb) + target_link_libraries(${TARGET_NAME} PUBLIC RocksDB::rocksdb) + message(STATUS "Linked ${TARGET_NAME} to RocksDB::rocksdb") + elseif(TARGET RocksDB::rocksdb_static) + target_link_libraries(${TARGET_NAME} PUBLIC RocksDB::rocksdb_static) + message(STATUS "Linked ${TARGET_NAME} to RocksDB::rocksdb_static") + elseif(TARGET rocksdb) + target_link_libraries(${TARGET_NAME} PUBLIC rocksdb) + message(STATUS "Linked ${TARGET_NAME} to rocksdb") endif() endif() endfunction() @@ -647,6 +656,196 @@ endfunction() # ============================================================================== # Compression Dependencies # ============================================================================== +function(need_lz4) + # The CPM fallback below seeds these cache entries with build-tree paths; + # drop them so a reconfigure cannot mistake the bundled lz4 for a system one + string(FIND "${lz4_LIBRARIES}" "${CMAKE_BINARY_DIR}/" lz4_seeded_from_build) + if(lz4_seeded_from_build EQUAL 0) + unset(lz4_LIBRARIES CACHE) + unset(lz4_INCLUDE_DIRS CACHE) + endif() + if(DEFINED CACHE{lz4_LIBRARIES} AND NOT EXISTS "${lz4_LIBRARIES}") + unset(lz4_LIBRARIES CACHE) + endif() + if(DEFINED CACHE{lz4_INCLUDE_DIRS} AND NOT EXISTS "${lz4_INCLUDE_DIRS}") + unset(lz4_INCLUDE_DIRS CACHE) + endif() + + find_path(lz4_INCLUDE_DIRS NAMES lz4.h) + find_library(lz4_LIBRARIES NAMES lz4) + + if(lz4_INCLUDE_DIRS AND lz4_LIBRARIES AND EXISTS "${lz4_LIBRARIES}") + message(STATUS "Found system lz4: ${lz4_LIBRARIES}") + + if(NOT TARGET lz4::lz4) + add_library(lz4::lz4 UNKNOWN IMPORTED) + set_target_properties( + lz4::lz4 + PROPERTIES IMPORTED_LOCATION "${lz4_LIBRARIES}" + INTERFACE_INCLUDE_DIRECTORIES "${lz4_INCLUDE_DIRS}") + endif() + + set(lz4_FOUND + TRUE + PARENT_SCOPE) + set(lz4_INCLUDE_DIRS + ${lz4_INCLUDE_DIRS} + PARENT_SCOPE) + 
set(lz4_LIBRARIES + ${lz4_LIBRARIES} + PARENT_SCOPE) + set(lz4_CPM + FALSE + PARENT_SCOPE) + set(lz4_FOUND + TRUE + CACHE BOOL "lz4 availability" FORCE) + set(lz4_INCLUDE_DIRS + "${lz4_INCLUDE_DIRS}" + CACHE PATH "lz4 include directories" FORCE) + set(lz4_LIBRARIES + "${lz4_LIBRARIES}" + CACHE STRING "lz4 libraries" FORCE) + else() + if(NOT lz4_ADDED) + cpmaddpackage( + NAME + lz4 + GITHUB_REPOSITORY + lz4/lz4 + VERSION + 1.10.0 + GIT_TAG + v1.10.0 + DOWNLOAD_ONLY + YES) + endif() + + if(lz4_ADDED) + message(STATUS "Built lz4 with CPM") + + set(LZ4_TARGETS) + set(LZ4_SOURCES + ${lz4_SOURCE_DIR}/lib/lz4.c + ${lz4_SOURCE_DIR}/lib/lz4frame.c + ${lz4_SOURCE_DIR}/lib/lz4hc.c + ${lz4_SOURCE_DIR}/lib/xxhash.c) + set(LZ4_SHARED_OUTPUT + "${CMAKE_BINARY_DIR}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}lz4${CMAKE_SHARED_LIBRARY_SUFFIX}" + ) + set(LZ4_STATIC_OUTPUT + "${CMAKE_BINARY_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}lz4${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(LZ4_PREFERRED_OUTPUT "${LZ4_STATIC_OUTPUT}") + if(DFTRACER_UTILS_BUILD_SHARED) + set(LZ4_PREFERRED_OUTPUT "${LZ4_SHARED_OUTPUT}") + endif() + + if(DFTRACER_UTILS_BUILD_STATIC) + add_library(lz4_static STATIC ${LZ4_SOURCES}) + target_include_directories( + lz4_static + PUBLIC $ + $) + set_target_properties( + lz4_static + PROPERTIES OUTPUT_NAME lz4 + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) + list(APPEND LZ4_TARGETS lz4_static) + endif() + + if(DFTRACER_UTILS_BUILD_SHARED) + add_library(lz4_shared SHARED ${LZ4_SOURCES}) + target_include_directories( + lz4_shared + PUBLIC $ + $) + set_target_properties( + lz4_shared + PROPERTIES OUTPUT_NAME lz4 + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) + list(APPEND LZ4_TARGETS lz4_shared) + endif() + + if(TARGET lz4_static AND NOT TARGET lz4::lz4_static) + add_library(lz4::lz4_static UNKNOWN IMPORTED GLOBAL) + set_target_properties( + lz4::lz4_static + PROPERTIES 
IMPORTED_LOCATION "${LZ4_STATIC_OUTPUT}" + INTERFACE_INCLUDE_DIRECTORIES "${lz4_SOURCE_DIR}/lib") + add_dependencies(lz4::lz4_static lz4_static) + endif() + if(TARGET lz4_shared AND NOT TARGET lz4::lz4_shared) + add_library(lz4::lz4_shared UNKNOWN IMPORTED GLOBAL) + set_target_properties( + lz4::lz4_shared + PROPERTIES IMPORTED_LOCATION "${LZ4_SHARED_OUTPUT}" + INTERFACE_INCLUDE_DIRECTORIES "${lz4_SOURCE_DIR}/lib") + add_dependencies(lz4::lz4_shared lz4_shared) + endif() + if(NOT TARGET lz4::lz4) + add_library(lz4::lz4 UNKNOWN IMPORTED GLOBAL) + if(TARGET lz4::lz4_shared) + set_target_properties( + lz4::lz4 + PROPERTIES IMPORTED_LOCATION "${LZ4_SHARED_OUTPUT}" + INTERFACE_INCLUDE_DIRECTORIES "${lz4_SOURCE_DIR}/lib") + add_dependencies(lz4::lz4 lz4_shared) + elseif(TARGET lz4::lz4_static) + set_target_properties( + lz4::lz4 + PROPERTIES IMPORTED_LOCATION "${LZ4_STATIC_OUTPUT}" + INTERFACE_INCLUDE_DIRECTORIES "${lz4_SOURCE_DIR}/lib") + add_dependencies(lz4::lz4 lz4_static) + endif() + endif() + + install(FILES ${lz4_SOURCE_DIR}/lib/lz4.h ${lz4_SOURCE_DIR}/lib/lz4frame.h + ${lz4_SOURCE_DIR}/lib/lz4hc.h ${lz4_SOURCE_DIR}/lib/xxhash.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + + if(LZ4_TARGETS) + install( + TARGETS ${LZ4_TARGETS} + EXPORT lz4Targets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + install( + EXPORT lz4Targets + FILE lz4Targets.cmake + NAMESPACE lz4:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lz4) + endif() + + set(lz4_FOUND + TRUE + PARENT_SCOPE) + set(lz4_INCLUDE_DIRS + ${lz4_SOURCE_DIR}/lib + PARENT_SCOPE) + set(lz4_LIBRARIES + ${LZ4_PREFERRED_OUTPUT} + PARENT_SCOPE) + set(lz4_CPM + TRUE + PARENT_SCOPE) + + # Seed the variables RocksDB's bundled Findlz4.cmake checks. 
+ set(lz4_FOUND + TRUE + CACHE BOOL "lz4 availability" FORCE) + set(lz4_INCLUDE_DIRS + "${lz4_SOURCE_DIR}/lib" + CACHE PATH "lz4 include directories" FORCE) + set(lz4_LIBRARIES + "${LZ4_PREFERRED_OUTPUT}" + CACHE STRING "lz4 libraries" FORCE) + endif() + endif() +endfunction() function(need_zlib) find_package(ZLIB 1.2 QUIET) diff --git a/cmake/modules/InstallHelpers.cmake b/cmake/modules/InstallHelpers.cmake index 363b3ae1..4c776c93 100644 --- a/cmake/modules/InstallHelpers.cmake +++ b/cmake/modules/InstallHelpers.cmake @@ -196,42 +196,6 @@ else() endif() endif() -# SQLITE3 dependency -find_library(SQLITE3_LIBRARY_BUNDLED - NAMES sqlite3 libsqlite3 - PATHS \${_IMPORT_PREFIX}/lib - NO_DEFAULT_PATH -) - -if(SQLITE3_LIBRARY_BUNDLED) - # Found sqlite3 that was built with this package - find_path(SQLITE3_INCLUDE_DIR_BUNDLED - NAMES sqlite3.h - PATHS \${_IMPORT_PREFIX}/include - NO_DEFAULT_PATH - ) - - if(SQLITE3_INCLUDE_DIR_BUNDLED AND NOT TARGET SQLite::SQLite3) - add_library(SQLite::SQLite3 UNKNOWN IMPORTED) - set_target_properties(SQLite::SQLite3 PROPERTIES - IMPORTED_LOCATION \"\${SQLITE3_LIBRARY_BUNDLED}\" - INTERFACE_INCLUDE_DIRECTORIES \"\${SQLITE3_INCLUDE_DIR_BUNDLED}\" - ) - endif() -else() - # Fall back to system sqlite3 via pkg-config (require minimum version 3.35) - find_dependency(PkgConfig REQUIRED) - pkg_check_modules(SQLITE3 REQUIRED sqlite3>=3.35) - - if(SQLITE3_FOUND AND NOT TARGET SQLite::SQLite3) - add_library(SQLite::SQLite3 UNKNOWN IMPORTED) - set_target_properties(SQLite::SQLite3 PROPERTIES - IMPORTED_LOCATION \"\${SQLITE3_LIBRARIES}\" - INTERFACE_INCLUDE_DIRECTORIES \"\${SQLITE3_INCLUDE_DIRS}\" - ) - endif() -endif() - # YYJSON dependency find_library(YYJSON_LIBRARY_BUNDLED NAMES yyjson libyyjson diff --git a/cmake/modules/LibraryHelpers.cmake b/cmake/modules/LibraryHelpers.cmake index 43b08790..787b23a9 100644 --- a/cmake/modules/LibraryHelpers.cmake +++ b/cmake/modules/LibraryHelpers.cmake @@ -213,6 +213,10 @@ function(create_library) 
set_target_properties(${TARGET_NAME} PROPERTIES ${LIB_PROPERTIES}) endif() + if(NOT ${TARGET_TYPE} STREQUAL "INTERFACE") + target_add_rpath(${TARGET_NAME}) + endif() + # Enable C++20 coroutines support target_enable_coroutine(${TARGET_NAME}) endmacro() diff --git a/cmake/vendor/CPM_0.42.1.cmake b/cmake/vendor/CPM_0.42.1.cmake new file mode 100644 index 00000000..832977c7 --- /dev/null +++ b/cmake/vendor/CPM_0.42.1.cmake @@ -0,0 +1,1363 @@ +# CPM.cmake - CMake's missing package manager +# =========================================== +# See https://github.com/cpm-cmake/CPM.cmake for usage and update instructions. +# +# MIT License +# ----------- +#[[ + Copyright (c) 2019-2023 Lars Melchior and contributors + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+]] + +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +# Initialize logging prefix +if(NOT CPM_INDENT) + set(CPM_INDENT + "CPM:" + CACHE INTERNAL "" + ) +endif() + +if(NOT COMMAND cpm_message) + function(cpm_message) + message(${ARGV}) + endfunction() +endif() + +if(DEFINED EXTRACTED_CPM_VERSION) + set(CURRENT_CPM_VERSION "${EXTRACTED_CPM_VERSION}${CPM_DEVELOPMENT}") +else() + set(CURRENT_CPM_VERSION 0.42.1) +endif() + +get_filename_component(CPM_CURRENT_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}" REALPATH) +if(CPM_DIRECTORY) + if(NOT CPM_DIRECTORY STREQUAL CPM_CURRENT_DIRECTORY) + if(CPM_VERSION VERSION_LESS CURRENT_CPM_VERSION) + message( + AUTHOR_WARNING + "${CPM_INDENT} \ +A dependency is using a more recent CPM version (${CURRENT_CPM_VERSION}) than the current project (${CPM_VERSION}). \ +It is recommended to upgrade CPM to the most recent version. \ +See https://github.com/cpm-cmake/CPM.cmake for more information." + ) + endif() + if(${CMAKE_VERSION} VERSION_LESS "3.17.0") + include(FetchContent) + endif() + return() + endif() + + get_property( + CPM_INITIALIZED GLOBAL "" + PROPERTY CPM_INITIALIZED + SET + ) + if(CPM_INITIALIZED) + return() + endif() +endif() + +if(CURRENT_CPM_VERSION MATCHES "development-version") + message( + WARNING "${CPM_INDENT} Your project is using an unstable development version of CPM.cmake. \ +Please update to a recent release if possible. \ +See https://github.com/cpm-cmake/CPM.cmake for details." + ) +endif() + +set_property(GLOBAL PROPERTY CPM_INITIALIZED true) + +macro(cpm_set_policies) + # the policy allows us to change options without caching + cmake_policy(SET CMP0077 NEW) + set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) + + # the policy allows us to change set(CACHE) without caching + if(POLICY CMP0126) + cmake_policy(SET CMP0126 NEW) + set(CMAKE_POLICY_DEFAULT_CMP0126 NEW) + endif() + + # The policy uses the download time for timestamp, instead of the timestamp in the archive. 
This + # allows for proper rebuilds when a projects url changes + if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) + set(CMAKE_POLICY_DEFAULT_CMP0135 NEW) + endif() + + # treat relative git repository paths as being relative to the parent project's remote + if(POLICY CMP0150) + cmake_policy(SET CMP0150 NEW) + set(CMAKE_POLICY_DEFAULT_CMP0150 NEW) + endif() +endmacro() +cpm_set_policies() + +option(CPM_USE_LOCAL_PACKAGES "Always try to use `find_package` to get dependencies" + $ENV{CPM_USE_LOCAL_PACKAGES} +) +option(CPM_LOCAL_PACKAGES_ONLY "Only use `find_package` to get dependencies" + $ENV{CPM_LOCAL_PACKAGES_ONLY} +) +option(CPM_DOWNLOAD_ALL "Always download dependencies from source" $ENV{CPM_DOWNLOAD_ALL}) +option(CPM_DONT_UPDATE_MODULE_PATH "Don't update the module path to allow using find_package" + $ENV{CPM_DONT_UPDATE_MODULE_PATH} +) +option(CPM_DONT_CREATE_PACKAGE_LOCK "Don't create a package lock file in the binary path" + $ENV{CPM_DONT_CREATE_PACKAGE_LOCK} +) +option(CPM_INCLUDE_ALL_IN_PACKAGE_LOCK + "Add all packages added through CPM.cmake to the package lock" + $ENV{CPM_INCLUDE_ALL_IN_PACKAGE_LOCK} +) +option(CPM_USE_NAMED_CACHE_DIRECTORIES + "Use additional directory of package name in cache on the most nested level." 
+ $ENV{CPM_USE_NAMED_CACHE_DIRECTORIES} +) + +set(CPM_VERSION + ${CURRENT_CPM_VERSION} + CACHE INTERNAL "" +) +set(CPM_DIRECTORY + ${CPM_CURRENT_DIRECTORY} + CACHE INTERNAL "" +) +set(CPM_FILE + ${CMAKE_CURRENT_LIST_FILE} + CACHE INTERNAL "" +) +set(CPM_PACKAGES + "" + CACHE INTERNAL "" +) +set(CPM_DRY_RUN + OFF + CACHE INTERNAL "Don't download or configure dependencies (for testing)" +) + +if(DEFINED ENV{CPM_SOURCE_CACHE}) + set(CPM_SOURCE_CACHE_DEFAULT $ENV{CPM_SOURCE_CACHE}) +else() + set(CPM_SOURCE_CACHE_DEFAULT OFF) +endif() + +set(CPM_SOURCE_CACHE + ${CPM_SOURCE_CACHE_DEFAULT} + CACHE PATH "Directory to download CPM dependencies" +) + +if(NOT CPM_DONT_UPDATE_MODULE_PATH AND NOT DEFINED CMAKE_FIND_PACKAGE_REDIRECTS_DIR) + set(CPM_MODULE_PATH + "${CMAKE_BINARY_DIR}/CPM_modules" + CACHE INTERNAL "" + ) + # remove old modules + file(REMOVE_RECURSE ${CPM_MODULE_PATH}) + file(MAKE_DIRECTORY ${CPM_MODULE_PATH}) + # locally added CPM modules should override global packages + set(CMAKE_MODULE_PATH "${CPM_MODULE_PATH};${CMAKE_MODULE_PATH}") +endif() + +if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) + set(CPM_PACKAGE_LOCK_FILE + "${CMAKE_BINARY_DIR}/cpm-package-lock.cmake" + CACHE INTERNAL "" + ) + file(WRITE ${CPM_PACKAGE_LOCK_FILE} + "# CPM Package Lock\n# This file should be committed to version control\n\n" + ) +endif() + +include(FetchContent) + +# Try to infer package name from git repository uri (path or url) +function(cpm_package_name_from_git_uri URI RESULT) + if("${URI}" MATCHES "([^/:]+)/?.git/?$") + set(${RESULT} + ${CMAKE_MATCH_1} + PARENT_SCOPE + ) + else() + unset(${RESULT} PARENT_SCOPE) + endif() +endfunction() + +# Find the shortest hash that can be used eg, if origin_hash is +# cccb77ae9609d2768ed80dd42cec54f77b1f1455 the following files will be checked, until one is found +# that is either empty (allowing us to assign origin_hash), or whose contents matches ${origin_hash} +# +# * .../cccb.hash +# * .../cccb77ae.hash +# * .../cccb77ae9609.hash +# * 
.../cccb77ae9609d276.hash +# * etc +# +# We will be able to use a shorter path with very high probability, but in the (rare) event that the +# first couple characters collide, we will check longer and longer substrings. +function(cpm_get_shortest_hash source_cache_dir origin_hash short_hash_output_var) + # for compatibility with caches populated by a previous version of CPM, check if a directory using + # the full hash already exists + if(EXISTS "${source_cache_dir}/${origin_hash}") + set(${short_hash_output_var} + "${origin_hash}" + PARENT_SCOPE + ) + return() + endif() + + foreach(len RANGE 4 40 4) + string(SUBSTRING "${origin_hash}" 0 ${len} short_hash) + set(hash_lock ${source_cache_dir}/${short_hash}.lock) + set(hash_fp ${source_cache_dir}/${short_hash}.hash) + # Take a lock, so we don't have a race condition with another instance of cmake. We will release + # this lock when we can, however, if there is an error, we want to ensure it gets released on + # it's own on exit from the function. 
+ file(LOCK ${hash_lock} GUARD FUNCTION) + + # Load the contents of .../${short_hash}.hash + file(TOUCH ${hash_fp}) + file(READ ${hash_fp} hash_fp_contents) + + if(hash_fp_contents STREQUAL "") + # Write the origin hash + file(WRITE ${hash_fp} ${origin_hash}) + file(LOCK ${hash_lock} RELEASE) + break() + elseif(hash_fp_contents STREQUAL origin_hash) + file(LOCK ${hash_lock} RELEASE) + break() + else() + file(LOCK ${hash_lock} RELEASE) + endif() + endforeach() + set(${short_hash_output_var} + "${short_hash}" + PARENT_SCOPE + ) +endfunction() + +# Try to infer package name and version from a url +function(cpm_package_name_and_ver_from_url url outName outVer) + if(url MATCHES "[/\\?]([a-zA-Z0-9_\\.-]+)\\.(tar|tar\\.gz|tar\\.bz2|zip|ZIP)(\\?|/|$)") + # We matched an archive + set(filename "${CMAKE_MATCH_1}") + + if(filename MATCHES "([a-zA-Z0-9_\\.-]+)[_-]v?(([0-9]+\\.)*[0-9]+[a-zA-Z0-9]*)") + # We matched - (ie foo-1.2.3) + set(${outName} + "${CMAKE_MATCH_1}" + PARENT_SCOPE + ) + set(${outVer} + "${CMAKE_MATCH_2}" + PARENT_SCOPE + ) + elseif(filename MATCHES "(([0-9]+\\.)+[0-9]+[a-zA-Z0-9]*)") + # We couldn't find a name, but we found a version + # + # In many cases (which we don't handle here) the url would look something like + # `irrelevant/ACTUAL_PACKAGE_NAME/irrelevant/1.2.3.zip`. In such a case we can't possibly + # distinguish the package name from the irrelevant bits. Moreover if we try to match the + # package name from the filename, we'd get bogus at best. + unset(${outName} PARENT_SCOPE) + set(${outVer} + "${CMAKE_MATCH_1}" + PARENT_SCOPE + ) + else() + # Boldly assume that the file name is the package name. + # + # Yes, something like `irrelevant/ACTUAL_NAME/irrelevant/download.zip` will ruin our day, but + # such cases should be quite rare. No popular service does this... we think. 
+ set(${outName} + "${filename}" + PARENT_SCOPE + ) + unset(${outVer} PARENT_SCOPE) + endif() + else() + # No ideas yet what to do with non-archives + unset(${outName} PARENT_SCOPE) + unset(${outVer} PARENT_SCOPE) + endif() +endfunction() + +function(cpm_find_package NAME VERSION) + string(REPLACE " " ";" EXTRA_ARGS "${ARGN}") + find_package(${NAME} ${VERSION} ${EXTRA_ARGS} QUIET) + if(${CPM_ARGS_NAME}_FOUND) + if(DEFINED ${CPM_ARGS_NAME}_VERSION) + set(VERSION ${${CPM_ARGS_NAME}_VERSION}) + endif() + cpm_message(STATUS "${CPM_INDENT} Using local package ${CPM_ARGS_NAME}@${VERSION}") + CPMRegisterPackage(${CPM_ARGS_NAME} "${VERSION}") + set(CPM_PACKAGE_FOUND + YES + PARENT_SCOPE + ) + else() + set(CPM_PACKAGE_FOUND + NO + PARENT_SCOPE + ) + endif() +endfunction() + +# Create a custom FindXXX.cmake module for a CPM package This prevents `find_package(NAME)` from +# finding the system library +function(cpm_create_module_file Name) + if(NOT CPM_DONT_UPDATE_MODULE_PATH) + if(DEFINED CMAKE_FIND_PACKAGE_REDIRECTS_DIR) + # Redirect find_package calls to the CPM package. This is what FetchContent does when you set + # OVERRIDE_FIND_PACKAGE. The CMAKE_FIND_PACKAGE_REDIRECTS_DIR works for find_package in CONFIG + # mode, unlike the Find${Name}.cmake fallback. CMAKE_FIND_PACKAGE_REDIRECTS_DIR is not defined + # in script mode, or in CMake < 3.24. 
+ # https://cmake.org/cmake/help/latest/module/FetchContent.html#fetchcontent-find-package-integration-examples + string(TOLOWER ${Name} NameLower) + file(WRITE ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/${NameLower}-config.cmake + "include(\"\${CMAKE_CURRENT_LIST_DIR}/${NameLower}-extra.cmake\" OPTIONAL)\n" + "include(\"\${CMAKE_CURRENT_LIST_DIR}/${Name}Extra.cmake\" OPTIONAL)\n" + ) + file(WRITE ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/${NameLower}-config-version.cmake + "set(PACKAGE_VERSION_COMPATIBLE TRUE)\n" "set(PACKAGE_VERSION_EXACT TRUE)\n" + ) + else() + file(WRITE ${CPM_MODULE_PATH}/Find${Name}.cmake + "include(\"${CPM_FILE}\")\n${ARGN}\nset(${Name}_FOUND TRUE)" + ) + endif() + endif() +endfunction() + +# Find a package locally or fallback to CPMAddPackage +function(CPMFindPackage) + set(oneValueArgs NAME VERSION GIT_TAG FIND_PACKAGE_ARGUMENTS) + + cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "" ${ARGN}) + + if(NOT DEFINED CPM_ARGS_VERSION) + if(DEFINED CPM_ARGS_GIT_TAG) + cpm_get_version_from_git_tag("${CPM_ARGS_GIT_TAG}" CPM_ARGS_VERSION) + endif() + endif() + + set(downloadPackage ${CPM_DOWNLOAD_ALL}) + if(DEFINED CPM_DOWNLOAD_${CPM_ARGS_NAME}) + set(downloadPackage ${CPM_DOWNLOAD_${CPM_ARGS_NAME}}) + elseif(DEFINED ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}}) + set(downloadPackage $ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}}) + endif() + if(downloadPackage) + CPMAddPackage(${ARGN}) + cpm_export_variables(${CPM_ARGS_NAME}) + return() + endif() + + cpm_find_package(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}" ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS}) + + if(NOT CPM_PACKAGE_FOUND) + CPMAddPackage(${ARGN}) + cpm_export_variables(${CPM_ARGS_NAME}) + endif() + +endfunction() + +# checks if a package has been added before +function(cpm_check_if_package_already_added CPM_ARGS_NAME CPM_ARGS_VERSION) + if("${CPM_ARGS_NAME}" IN_LIST CPM_PACKAGES) + CPMGetPackageVersion(${CPM_ARGS_NAME} CPM_PACKAGE_VERSION) + if("${CPM_PACKAGE_VERSION}" VERSION_LESS "${CPM_ARGS_VERSION}") + message( + WARNING + 
"${CPM_INDENT} Requires a newer version of ${CPM_ARGS_NAME} (${CPM_ARGS_VERSION}) than currently included (${CPM_PACKAGE_VERSION})." + ) + endif() + cpm_get_fetch_properties(${CPM_ARGS_NAME}) + set(${CPM_ARGS_NAME}_ADDED NO) + set(CPM_PACKAGE_ALREADY_ADDED + YES + PARENT_SCOPE + ) + cpm_export_variables(${CPM_ARGS_NAME}) + else() + set(CPM_PACKAGE_ALREADY_ADDED + NO + PARENT_SCOPE + ) + endif() +endfunction() + +# Parse the argument of CPMAddPackage in case a single one was provided and convert it to a list of +# arguments which can then be parsed idiomatically. For example gh:foo/bar@1.2.3 will be converted +# to: GITHUB_REPOSITORY;foo/bar;VERSION;1.2.3 +function(cpm_parse_add_package_single_arg arg outArgs) + # Look for a scheme + if("${arg}" MATCHES "^([a-zA-Z]+):(.+)$") + string(TOLOWER "${CMAKE_MATCH_1}" scheme) + set(uri "${CMAKE_MATCH_2}") + + # Check for CPM-specific schemes + if(scheme STREQUAL "gh") + set(out "GITHUB_REPOSITORY;${uri}") + set(packageType "git") + elseif(scheme STREQUAL "gl") + set(out "GITLAB_REPOSITORY;${uri}") + set(packageType "git") + elseif(scheme STREQUAL "bb") + set(out "BITBUCKET_REPOSITORY;${uri}") + set(packageType "git") + # A CPM-specific scheme was not found. Looks like this is a generic URL so try to determine + # type + elseif(arg MATCHES ".git/?(@|#|$)") + set(out "GIT_REPOSITORY;${arg}") + set(packageType "git") + else() + # Fall back to a URL + set(out "URL;${arg}") + set(packageType "archive") + + # We could also check for SVN since FetchContent supports it, but SVN is so rare these days. + # We just won't bother with the additional complexity it will induce in this function. SVN is + # done by multi-arg + endif() + else() + if(arg MATCHES ".git/?(@|#|$)") + set(out "GIT_REPOSITORY;${arg}") + set(packageType "git") + else() + # Give up + message(FATAL_ERROR "${CPM_INDENT} Can't determine package type of '${arg}'") + endif() + endif() + + # For all packages we interpret @... as version. Only replace the last occurrence. 
Thus URIs + # containing '@' can be used + string(REGEX REPLACE "@([^@]+)$" ";VERSION;\\1" out "${out}") + + # Parse the rest according to package type + if(packageType STREQUAL "git") + # For git repos we interpret #... as a tag or branch or commit hash + string(REGEX REPLACE "#([^#]+)$" ";GIT_TAG;\\1" out "${out}") + elseif(packageType STREQUAL "archive") + # For archives we interpret #... as a URL hash. + string(REGEX REPLACE "#([^#]+)$" ";URL_HASH;\\1" out "${out}") + # We don't try to parse the version if it's not provided explicitly. cpm_get_version_from_url + # should do this at a later point + else() + # We should never get here. This is an assertion and hitting it means there's a problem with the + # code above. A packageType was set, but not handled by this if-else. + message(FATAL_ERROR "${CPM_INDENT} Unsupported package type '${packageType}' of '${arg}'") + endif() + + set(${outArgs} + ${out} + PARENT_SCOPE + ) +endfunction() + +# Check that the working directory for a git repo is clean +function(cpm_check_git_working_dir_is_clean repoPath gitTag isClean) + + find_package(Git REQUIRED) + + if(NOT GIT_EXECUTABLE) + # No git executable, assume directory is clean + set(${isClean} + TRUE + PARENT_SCOPE + ) + return() + endif() + + # check for uncommitted changes + execute_process( + COMMAND ${GIT_EXECUTABLE} status --porcelain + RESULT_VARIABLE resultGitStatus + OUTPUT_VARIABLE repoStatus + OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET + WORKING_DIRECTORY ${repoPath} + ) + if(resultGitStatus) + # not supposed to happen, assume clean anyway + message(WARNING "${CPM_INDENT} Calling git status on folder ${repoPath} failed") + set(${isClean} + TRUE + PARENT_SCOPE + ) + return() + endif() + + if(NOT "${repoStatus}" STREQUAL "") + set(${isClean} + FALSE + PARENT_SCOPE + ) + return() + endif() + + # check for committed changes + execute_process( + COMMAND ${GIT_EXECUTABLE} diff -s --exit-code ${gitTag} + RESULT_VARIABLE resultGitDiff + 
OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_QUIET + WORKING_DIRECTORY ${repoPath} + ) + + if(${resultGitDiff} EQUAL 0) + set(${isClean} + TRUE + PARENT_SCOPE + ) + else() + set(${isClean} + FALSE + PARENT_SCOPE + ) + endif() + +endfunction() + +# Add PATCH_COMMAND to CPM_ARGS_UNPARSED_ARGUMENTS. This method consumes a list of files in ARGN +# then generates a `PATCH_COMMAND` appropriate for `ExternalProject_Add()`. This command is appended +# to the parent scope's `CPM_ARGS_UNPARSED_ARGUMENTS`. +function(cpm_add_patches) + # Return if no patch files are supplied. + if(NOT ARGN) + return() + endif() + + # Find the patch program. + find_program(PATCH_EXECUTABLE patch) + if(CMAKE_HOST_WIN32 AND NOT PATCH_EXECUTABLE) + # The Windows git executable is distributed with patch.exe. Find the path to the executable, if + # it exists, then search `../usr/bin` and `../../usr/bin` for patch.exe. + find_package(Git QUIET) + if(GIT_EXECUTABLE) + get_filename_component(extra_search_path ${GIT_EXECUTABLE} DIRECTORY) + get_filename_component(extra_search_path_1up ${extra_search_path} DIRECTORY) + get_filename_component(extra_search_path_2up ${extra_search_path_1up} DIRECTORY) + find_program( + PATCH_EXECUTABLE patch HINTS "${extra_search_path_1up}/usr/bin" + "${extra_search_path_2up}/usr/bin" + ) + endif() + endif() + if(NOT PATCH_EXECUTABLE) + message(FATAL_ERROR "Couldn't find `patch` executable to use with PATCHES keyword.") + endif() + + # Create a temporary + set(temp_list ${CPM_ARGS_UNPARSED_ARGUMENTS}) + + # Ensure each file exists (or error out) and add it to the list. + set(first_item True) + foreach(PATCH_FILE ${ARGN}) + # Make sure the patch file exists, if we can't find it, try again in the current directory. 
+ if(NOT EXISTS "${PATCH_FILE}") + if(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/${PATCH_FILE}") + message(FATAL_ERROR "Couldn't find patch file: '${PATCH_FILE}'") + endif() + set(PATCH_FILE "${CMAKE_CURRENT_LIST_DIR}/${PATCH_FILE}") + endif() + + # Convert to absolute path for use with patch file command. + get_filename_component(PATCH_FILE "${PATCH_FILE}" ABSOLUTE) + + # The first patch entry must be preceded by "PATCH_COMMAND" while the following items are + # preceded by "&&". + if(first_item) + set(first_item False) + list(APPEND temp_list "PATCH_COMMAND") + else() + list(APPEND temp_list "&&") + endif() + # Add the patch command to the list + list(APPEND temp_list "${PATCH_EXECUTABLE}" "-p1" "<" "${PATCH_FILE}") + endforeach() + + # Move temp out into parent scope. + set(CPM_ARGS_UNPARSED_ARGUMENTS + ${temp_list} + PARENT_SCOPE + ) + +endfunction() + +# method to overwrite internal FetchContent properties, to allow using CPM.cmake to overload +# FetchContent calls. As these are internal cmake properties, this method should be used carefully +# and may need modification in future CMake versions. 
Source: +# https://github.com/Kitware/CMake/blob/dc3d0b5a0a7d26d43d6cfeb511e224533b5d188f/Modules/FetchContent.cmake#L1152 +function(cpm_override_fetchcontent contentName) + cmake_parse_arguments(PARSE_ARGV 1 arg "" "SOURCE_DIR;BINARY_DIR" "") + if(NOT "${arg_UNPARSED_ARGUMENTS}" STREQUAL "") + message(FATAL_ERROR "${CPM_INDENT} Unsupported arguments: ${arg_UNPARSED_ARGUMENTS}") + endif() + + string(TOLOWER ${contentName} contentNameLower) + set(prefix "_FetchContent_${contentNameLower}") + + set(propertyName "${prefix}_sourceDir") + define_property( + GLOBAL + PROPERTY ${propertyName} + BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" + FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" + ) + set_property(GLOBAL PROPERTY ${propertyName} "${arg_SOURCE_DIR}") + + set(propertyName "${prefix}_binaryDir") + define_property( + GLOBAL + PROPERTY ${propertyName} + BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" + FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" + ) + set_property(GLOBAL PROPERTY ${propertyName} "${arg_BINARY_DIR}") + + set(propertyName "${prefix}_populated") + define_property( + GLOBAL + PROPERTY ${propertyName} + BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" + FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" + ) + set_property(GLOBAL PROPERTY ${propertyName} TRUE) +endfunction() + +# Download and add a package from source +function(CPMAddPackage) + cpm_set_policies() + + set(oneValueArgs + NAME + FORCE + VERSION + GIT_TAG + DOWNLOAD_ONLY + GITHUB_REPOSITORY + GITLAB_REPOSITORY + BITBUCKET_REPOSITORY + GIT_REPOSITORY + SOURCE_DIR + FIND_PACKAGE_ARGUMENTS + NO_CACHE + SYSTEM + GIT_SHALLOW + EXCLUDE_FROM_ALL + SOURCE_SUBDIR + CUSTOM_CACHE_KEY + ) + + set(multiValueArgs URL OPTIONS DOWNLOAD_COMMAND PATCHES) + + list(LENGTH ARGN argnLength) + + # Parse single shorthand argument + if(argnLength EQUAL 1) + 
cpm_parse_add_package_single_arg("${ARGN}" ARGN) + + # The shorthand syntax implies EXCLUDE_FROM_ALL and SYSTEM + set(ARGN "${ARGN};EXCLUDE_FROM_ALL;YES;SYSTEM;YES;") + + # Parse URI shorthand argument + elseif(argnLength GREATER 1 AND "${ARGV0}" STREQUAL "URI") + list(REMOVE_AT ARGN 0 1) # remove "URI gh:<...>@version#tag" + cpm_parse_add_package_single_arg("${ARGV1}" ARGV0) + + set(ARGN "${ARGV0};EXCLUDE_FROM_ALL;YES;SYSTEM;YES;${ARGN}") + endif() + + cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}") + + # Set default values for arguments + if(NOT DEFINED CPM_ARGS_VERSION) + if(DEFINED CPM_ARGS_GIT_TAG) + cpm_get_version_from_git_tag("${CPM_ARGS_GIT_TAG}" CPM_ARGS_VERSION) + endif() + endif() + + if(CPM_ARGS_DOWNLOAD_ONLY) + set(DOWNLOAD_ONLY ${CPM_ARGS_DOWNLOAD_ONLY}) + else() + set(DOWNLOAD_ONLY NO) + endif() + + if(DEFINED CPM_ARGS_GITHUB_REPOSITORY) + set(CPM_ARGS_GIT_REPOSITORY "https://github.com/${CPM_ARGS_GITHUB_REPOSITORY}.git") + elseif(DEFINED CPM_ARGS_GITLAB_REPOSITORY) + set(CPM_ARGS_GIT_REPOSITORY "https://gitlab.com/${CPM_ARGS_GITLAB_REPOSITORY}.git") + elseif(DEFINED CPM_ARGS_BITBUCKET_REPOSITORY) + set(CPM_ARGS_GIT_REPOSITORY "https://bitbucket.org/${CPM_ARGS_BITBUCKET_REPOSITORY}.git") + endif() + + if(DEFINED CPM_ARGS_GIT_REPOSITORY) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_REPOSITORY ${CPM_ARGS_GIT_REPOSITORY}) + if(NOT DEFINED CPM_ARGS_GIT_TAG) + set(CPM_ARGS_GIT_TAG v${CPM_ARGS_VERSION}) + endif() + + # If a name wasn't provided, try to infer it from the git repo + if(NOT DEFINED CPM_ARGS_NAME) + cpm_package_name_from_git_uri(${CPM_ARGS_GIT_REPOSITORY} CPM_ARGS_NAME) + endif() + endif() + + set(CPM_SKIP_FETCH FALSE) + + if(DEFINED CPM_ARGS_GIT_TAG) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_TAG ${CPM_ARGS_GIT_TAG}) + # If GIT_SHALLOW is explicitly specified, honor the value. 
+ if(DEFINED CPM_ARGS_GIT_SHALLOW) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_SHALLOW ${CPM_ARGS_GIT_SHALLOW}) + endif() + endif() + + if(DEFINED CPM_ARGS_URL) + # If a name or version aren't provided, try to infer them from the URL + list(GET CPM_ARGS_URL 0 firstUrl) + cpm_package_name_and_ver_from_url(${firstUrl} nameFromUrl verFromUrl) + # If we fail to obtain name and version from the first URL, we could try other URLs if any. + # However multiple URLs are expected to be quite rare, so for now we won't bother. + + # If the caller provided their own name and version, they trump the inferred ones. + if(NOT DEFINED CPM_ARGS_NAME) + set(CPM_ARGS_NAME ${nameFromUrl}) + endif() + if(NOT DEFINED CPM_ARGS_VERSION) + set(CPM_ARGS_VERSION ${verFromUrl}) + endif() + + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS URL "${CPM_ARGS_URL}") + endif() + + # Check for required arguments + + if(NOT DEFINED CPM_ARGS_NAME) + message( + FATAL_ERROR + "${CPM_INDENT} 'NAME' was not provided and couldn't be automatically inferred for package added with arguments: '${ARGN}'" + ) + endif() + + # Check if package has been added before + cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}") + if(CPM_PACKAGE_ALREADY_ADDED) + cpm_export_variables(${CPM_ARGS_NAME}) + return() + endif() + + # Check for manual overrides + if(NOT CPM_ARGS_FORCE AND NOT "${CPM_${CPM_ARGS_NAME}_SOURCE}" STREQUAL "") + set(PACKAGE_SOURCE ${CPM_${CPM_ARGS_NAME}_SOURCE}) + set(CPM_${CPM_ARGS_NAME}_SOURCE "") + CPMAddPackage( + NAME "${CPM_ARGS_NAME}" + SOURCE_DIR "${PACKAGE_SOURCE}" + EXCLUDE_FROM_ALL "${CPM_ARGS_EXCLUDE_FROM_ALL}" + SYSTEM "${CPM_ARGS_SYSTEM}" + PATCHES "${CPM_ARGS_PATCHES}" + OPTIONS "${CPM_ARGS_OPTIONS}" + SOURCE_SUBDIR "${CPM_ARGS_SOURCE_SUBDIR}" + DOWNLOAD_ONLY "${DOWNLOAD_ONLY}" + FORCE True + ) + cpm_export_variables(${CPM_ARGS_NAME}) + return() + endif() + + # Check for available declaration + if(NOT CPM_ARGS_FORCE AND NOT "${CPM_DECLARATION_${CPM_ARGS_NAME}}" STREQUAL "") + 
set(declaration ${CPM_DECLARATION_${CPM_ARGS_NAME}}) + set(CPM_DECLARATION_${CPM_ARGS_NAME} "") + CPMAddPackage(${declaration}) + cpm_export_variables(${CPM_ARGS_NAME}) + # checking again to ensure version and option compatibility + cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}") + return() + endif() + + if(NOT CPM_ARGS_FORCE) + if(CPM_USE_LOCAL_PACKAGES OR CPM_LOCAL_PACKAGES_ONLY) + cpm_find_package(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}" ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS}) + + if(CPM_PACKAGE_FOUND) + cpm_export_variables(${CPM_ARGS_NAME}) + return() + endif() + + if(CPM_LOCAL_PACKAGES_ONLY) + message( + SEND_ERROR + "${CPM_INDENT} ${CPM_ARGS_NAME} not found via find_package(${CPM_ARGS_NAME} ${CPM_ARGS_VERSION})" + ) + endif() + endif() + endif() + + CPMRegisterPackage("${CPM_ARGS_NAME}" "${CPM_ARGS_VERSION}") + + if(DEFINED CPM_ARGS_GIT_TAG) + set(PACKAGE_INFO "${CPM_ARGS_GIT_TAG}") + elseif(DEFINED CPM_ARGS_SOURCE_DIR) + set(PACKAGE_INFO "${CPM_ARGS_SOURCE_DIR}") + else() + set(PACKAGE_INFO "${CPM_ARGS_VERSION}") + endif() + + if(DEFINED FETCHCONTENT_BASE_DIR) + # respect user's FETCHCONTENT_BASE_DIR if set + set(CPM_FETCHCONTENT_BASE_DIR ${FETCHCONTENT_BASE_DIR}) + else() + set(CPM_FETCHCONTENT_BASE_DIR ${CMAKE_BINARY_DIR}/_deps) + endif() + + cpm_add_patches(${CPM_ARGS_PATCHES}) + + if(DEFINED CPM_ARGS_DOWNLOAD_COMMAND) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS DOWNLOAD_COMMAND ${CPM_ARGS_DOWNLOAD_COMMAND}) + elseif(DEFINED CPM_ARGS_SOURCE_DIR) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${CPM_ARGS_SOURCE_DIR}) + if(NOT IS_ABSOLUTE ${CPM_ARGS_SOURCE_DIR}) + # Expand `CPM_ARGS_SOURCE_DIR` relative path. This is important because EXISTS doesn't work + # for relative paths. 
+ get_filename_component( + source_directory ${CPM_ARGS_SOURCE_DIR} REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR} + ) + else() + set(source_directory ${CPM_ARGS_SOURCE_DIR}) + endif() + if(NOT EXISTS ${source_directory}) + string(TOLOWER ${CPM_ARGS_NAME} lower_case_name) + # remove timestamps so CMake will re-download the dependency + file(REMOVE_RECURSE "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild") + endif() + elseif(CPM_SOURCE_CACHE AND NOT CPM_ARGS_NO_CACHE) + string(TOLOWER ${CPM_ARGS_NAME} lower_case_name) + set(origin_parameters ${CPM_ARGS_UNPARSED_ARGUMENTS}) + list(SORT origin_parameters) + if(CPM_ARGS_CUSTOM_CACHE_KEY) + # Application set a custom unique directory name + set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${CPM_ARGS_CUSTOM_CACHE_KEY}) + elseif(CPM_USE_NAMED_CACHE_DIRECTORIES) + string(SHA1 origin_hash "${origin_parameters};NEW_CACHE_STRUCTURE_TAG") + cpm_get_shortest_hash( + "${CPM_SOURCE_CACHE}/${lower_case_name}" # source cache directory + "${origin_hash}" # Input hash + origin_hash # Computed hash + ) + set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash}/${CPM_ARGS_NAME}) + else() + string(SHA1 origin_hash "${origin_parameters}") + cpm_get_shortest_hash( + "${CPM_SOURCE_CACHE}/${lower_case_name}" # source cache directory + "${origin_hash}" # Input hash + origin_hash # Computed hash + ) + set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash}) + endif() + # Expand `download_directory` relative path. This is important because EXISTS doesn't work for + # relative paths. 
+ get_filename_component(download_directory ${download_directory} ABSOLUTE) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${download_directory}) + + if(CPM_SOURCE_CACHE) + file(LOCK ${download_directory}/../cmake.lock) + endif() + + if(EXISTS ${download_directory}) + if(CPM_SOURCE_CACHE) + file(LOCK ${download_directory}/../cmake.lock RELEASE) + endif() + + cpm_store_fetch_properties( + ${CPM_ARGS_NAME} "${download_directory}" + "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-build" + ) + cpm_get_fetch_properties("${CPM_ARGS_NAME}") + + if(DEFINED CPM_ARGS_GIT_TAG AND NOT (PATCH_COMMAND IN_LIST CPM_ARGS_UNPARSED_ARGUMENTS)) + # warn if cache has been changed since checkout + cpm_check_git_working_dir_is_clean(${download_directory} ${CPM_ARGS_GIT_TAG} IS_CLEAN) + if(NOT ${IS_CLEAN}) + message( + WARNING "${CPM_INDENT} Cache for ${CPM_ARGS_NAME} (${download_directory}) is dirty" + ) + endif() + endif() + + cpm_add_subdirectory( + "${CPM_ARGS_NAME}" + "${DOWNLOAD_ONLY}" + "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}" + "${${CPM_ARGS_NAME}_BINARY_DIR}" + "${CPM_ARGS_EXCLUDE_FROM_ALL}" + "${CPM_ARGS_SYSTEM}" + "${CPM_ARGS_OPTIONS}" + ) + set(PACKAGE_INFO "${PACKAGE_INFO} at ${download_directory}") + + # As the source dir is already cached/populated, we override the call to FetchContent. + set(CPM_SKIP_FETCH TRUE) + cpm_override_fetchcontent( + "${lower_case_name}" SOURCE_DIR "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}" + BINARY_DIR "${${CPM_ARGS_NAME}_BINARY_DIR}" + ) + + else() + # Enable shallow clone when GIT_TAG is not a commit hash. Our guess may not be accurate, but + # it should guarantee no commit hash get mis-detected. 
+ if(NOT DEFINED CPM_ARGS_GIT_SHALLOW) + cpm_is_git_tag_commit_hash("${CPM_ARGS_GIT_TAG}" IS_HASH) + if(NOT ${IS_HASH}) + list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_SHALLOW TRUE) + endif() + endif() + + # remove timestamps so CMake will re-download the dependency + file(REMOVE_RECURSE ${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild) + set(PACKAGE_INFO "${PACKAGE_INFO} to ${download_directory}") + endif() + endif() + + if(NOT "${DOWNLOAD_ONLY}") + cpm_create_module_file(${CPM_ARGS_NAME} "CPMAddPackage(\"${ARGN}\")") + endif() + + if(CPM_PACKAGE_LOCK_ENABLED) + if((CPM_ARGS_VERSION AND NOT CPM_ARGS_SOURCE_DIR) OR CPM_INCLUDE_ALL_IN_PACKAGE_LOCK) + cpm_add_to_package_lock(${CPM_ARGS_NAME} "${ARGN}") + elseif(CPM_ARGS_SOURCE_DIR) + cpm_add_comment_to_package_lock(${CPM_ARGS_NAME} "local directory") + else() + cpm_add_comment_to_package_lock(${CPM_ARGS_NAME} "${ARGN}") + endif() + endif() + + cpm_message( + STATUS "${CPM_INDENT} Adding package ${CPM_ARGS_NAME}@${CPM_ARGS_VERSION} (${PACKAGE_INFO})" + ) + + if(NOT CPM_SKIP_FETCH) + # CMake 3.28 added EXCLUDE, SYSTEM (3.25), and SOURCE_SUBDIR (3.18) to FetchContent_Declare. + # Calling FetchContent_MakeAvailable will then internally forward these options to + # add_subdirectory. Up until these changes, we had to call FetchContent_Populate and + # add_subdirectory separately, which is no longer necessary and has been deprecated as of 3.30. + # A Bug in CMake prevents us to use the non-deprecated functions until 3.30.3. 
+ set(fetchContentDeclareExtraArgs "") + if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.30.3") + if(${CPM_ARGS_EXCLUDE_FROM_ALL}) + list(APPEND fetchContentDeclareExtraArgs EXCLUDE_FROM_ALL) + endif() + if(${CPM_ARGS_SYSTEM}) + list(APPEND fetchContentDeclareExtraArgs SYSTEM) + endif() + if(DEFINED CPM_ARGS_SOURCE_SUBDIR) + list(APPEND fetchContentDeclareExtraArgs SOURCE_SUBDIR ${CPM_ARGS_SOURCE_SUBDIR}) + endif() + # For CMake version <3.28 OPTIONS are parsed in cpm_add_subdirectory + if(CPM_ARGS_OPTIONS AND NOT DOWNLOAD_ONLY) + foreach(OPTION ${CPM_ARGS_OPTIONS}) + cpm_parse_option("${OPTION}") + set(${OPTION_KEY} "${OPTION_VALUE}") + endforeach() + endif() + endif() + cpm_declare_fetch( + "${CPM_ARGS_NAME}" ${fetchContentDeclareExtraArgs} "${CPM_ARGS_UNPARSED_ARGUMENTS}" + ) + + cpm_fetch_package("${CPM_ARGS_NAME}" ${DOWNLOAD_ONLY} populated ${CPM_ARGS_UNPARSED_ARGUMENTS}) + if(CPM_SOURCE_CACHE AND download_directory) + file(LOCK ${download_directory}/../cmake.lock RELEASE) + endif() + if(${populated} AND ${CMAKE_VERSION} VERSION_LESS "3.30.3") + cpm_add_subdirectory( + "${CPM_ARGS_NAME}" + "${DOWNLOAD_ONLY}" + "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}" + "${${CPM_ARGS_NAME}_BINARY_DIR}" + "${CPM_ARGS_EXCLUDE_FROM_ALL}" + "${CPM_ARGS_SYSTEM}" + "${CPM_ARGS_OPTIONS}" + ) + endif() + cpm_get_fetch_properties("${CPM_ARGS_NAME}") + endif() + + set(${CPM_ARGS_NAME}_ADDED YES) + cpm_export_variables("${CPM_ARGS_NAME}") +endfunction() + +# Fetch a previously declared package +macro(CPMGetPackage Name) + if(DEFINED "CPM_DECLARATION_${Name}") + CPMAddPackage(NAME ${Name}) + else() + message(SEND_ERROR "${CPM_INDENT} Cannot retrieve package ${Name}: no declaration available") + endif() +endmacro() + +# export variables available to the caller to the parent scope expects ${CPM_ARGS_NAME} to be set +macro(cpm_export_variables name) + set(${name}_SOURCE_DIR + "${${name}_SOURCE_DIR}" + PARENT_SCOPE + ) + set(${name}_BINARY_DIR + "${${name}_BINARY_DIR}" + 
PARENT_SCOPE + ) + set(${name}_ADDED + "${${name}_ADDED}" + PARENT_SCOPE + ) + set(CPM_LAST_PACKAGE_NAME + "${name}" + PARENT_SCOPE + ) +endmacro() + +# declares a package, so that any call to CPMAddPackage for the package name will use these +# arguments instead. Previous declarations will not be overridden. +macro(CPMDeclarePackage Name) + if(NOT DEFINED "CPM_DECLARATION_${Name}") + set("CPM_DECLARATION_${Name}" "${ARGN}") + endif() +endmacro() + +function(cpm_add_to_package_lock Name) + if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) + cpm_prettify_package_arguments(PRETTY_ARGN false ${ARGN}) + file(APPEND ${CPM_PACKAGE_LOCK_FILE} "# ${Name}\nCPMDeclarePackage(${Name}\n${PRETTY_ARGN})\n") + endif() +endfunction() + +function(cpm_add_comment_to_package_lock Name) + if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) + cpm_prettify_package_arguments(PRETTY_ARGN true ${ARGN}) + file(APPEND ${CPM_PACKAGE_LOCK_FILE} + "# ${Name} (unversioned)\n# CPMDeclarePackage(${Name}\n${PRETTY_ARGN}#)\n" + ) + endif() +endfunction() + +# includes the package lock file if it exists and creates a target `cpm-update-package-lock` to +# update it +macro(CPMUsePackageLock file) + if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) + get_filename_component(CPM_ABSOLUTE_PACKAGE_LOCK_PATH ${file} ABSOLUTE) + if(EXISTS ${CPM_ABSOLUTE_PACKAGE_LOCK_PATH}) + include(${CPM_ABSOLUTE_PACKAGE_LOCK_PATH}) + endif() + if(NOT TARGET cpm-update-package-lock) + add_custom_target( + cpm-update-package-lock COMMAND ${CMAKE_COMMAND} -E copy ${CPM_PACKAGE_LOCK_FILE} + ${CPM_ABSOLUTE_PACKAGE_LOCK_PATH} + ) + endif() + set(CPM_PACKAGE_LOCK_ENABLED true) + endif() +endmacro() + +# registers a package that has been added to CPM +function(CPMRegisterPackage PACKAGE VERSION) + list(APPEND CPM_PACKAGES ${PACKAGE}) + set(CPM_PACKAGES + ${CPM_PACKAGES} + CACHE INTERNAL "" + ) + set("CPM_PACKAGE_${PACKAGE}_VERSION" + ${VERSION} + CACHE INTERNAL "" + ) +endfunction() + +# retrieve the current version of the package to ${OUTPUT} 
+function(CPMGetPackageVersion PACKAGE OUTPUT) + set(${OUTPUT} + "${CPM_PACKAGE_${PACKAGE}_VERSION}" + PARENT_SCOPE + ) +endfunction() + +# declares a package in FetchContent_Declare +function(cpm_declare_fetch PACKAGE) + if(${CPM_DRY_RUN}) + cpm_message(STATUS "${CPM_INDENT} Package not declared (dry run)") + return() + endif() + + FetchContent_Declare(${PACKAGE} ${ARGN}) +endfunction() + +# returns properties for a package previously defined by cpm_declare_fetch +function(cpm_get_fetch_properties PACKAGE) + if(${CPM_DRY_RUN}) + return() + endif() + + set(${PACKAGE}_SOURCE_DIR + "${CPM_PACKAGE_${PACKAGE}_SOURCE_DIR}" + PARENT_SCOPE + ) + set(${PACKAGE}_BINARY_DIR + "${CPM_PACKAGE_${PACKAGE}_BINARY_DIR}" + PARENT_SCOPE + ) +endfunction() + +function(cpm_store_fetch_properties PACKAGE source_dir binary_dir) + if(${CPM_DRY_RUN}) + return() + endif() + + set(CPM_PACKAGE_${PACKAGE}_SOURCE_DIR + "${source_dir}" + CACHE INTERNAL "" + ) + set(CPM_PACKAGE_${PACKAGE}_BINARY_DIR + "${binary_dir}" + CACHE INTERNAL "" + ) +endfunction() + +# adds a package as a subdirectory if viable, according to provided options +function( + cpm_add_subdirectory + PACKAGE + DOWNLOAD_ONLY + SOURCE_DIR + BINARY_DIR + EXCLUDE + SYSTEM + OPTIONS +) + + if(NOT DOWNLOAD_ONLY AND EXISTS ${SOURCE_DIR}/CMakeLists.txt) + set(addSubdirectoryExtraArgs "") + if(EXCLUDE) + list(APPEND addSubdirectoryExtraArgs EXCLUDE_FROM_ALL) + endif() + if("${SYSTEM}" AND "${CMAKE_VERSION}" VERSION_GREATER_EQUAL "3.25") + # https://cmake.org/cmake/help/latest/prop_dir/SYSTEM.html#prop_dir:SYSTEM + list(APPEND addSubdirectoryExtraArgs SYSTEM) + endif() + if(OPTIONS) + foreach(OPTION ${OPTIONS}) + cpm_parse_option("${OPTION}") + set(${OPTION_KEY} "${OPTION_VALUE}") + endforeach() + endif() + set(CPM_OLD_INDENT "${CPM_INDENT}") + set(CPM_INDENT "${CPM_INDENT} ${PACKAGE}:") + add_subdirectory(${SOURCE_DIR} ${BINARY_DIR} ${addSubdirectoryExtraArgs}) + set(CPM_INDENT "${CPM_OLD_INDENT}") + endif() +endfunction() + +# 
downloads a previously declared package via FetchContent and exports the variables +# `${PACKAGE}_SOURCE_DIR` and `${PACKAGE}_BINARY_DIR` to the parent scope +function(cpm_fetch_package PACKAGE DOWNLOAD_ONLY populated) + set(${populated} + FALSE + PARENT_SCOPE + ) + if(${CPM_DRY_RUN}) + cpm_message(STATUS "${CPM_INDENT} Package ${PACKAGE} not fetched (dry run)") + return() + endif() + + FetchContent_GetProperties(${PACKAGE}) + + string(TOLOWER "${PACKAGE}" lower_case_name) + + if(NOT ${lower_case_name}_POPULATED) + if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.30.3") + if(DOWNLOAD_ONLY) + # MakeAvailable will call add_subdirectory internally which is not what we want when + # DOWNLOAD_ONLY is set. Populate will only download the dependency without adding it to the + # build + FetchContent_Populate( + ${PACKAGE} + SOURCE_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-src" + BINARY_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-build" + SUBBUILD_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild" + ${ARGN} + ) + else() + FetchContent_MakeAvailable(${PACKAGE}) + endif() + else() + FetchContent_Populate(${PACKAGE}) + endif() + set(${populated} + TRUE + PARENT_SCOPE + ) + endif() + + cpm_store_fetch_properties( + ${CPM_ARGS_NAME} ${${lower_case_name}_SOURCE_DIR} ${${lower_case_name}_BINARY_DIR} + ) + + set(${PACKAGE}_SOURCE_DIR + ${${lower_case_name}_SOURCE_DIR} + PARENT_SCOPE + ) + set(${PACKAGE}_BINARY_DIR + ${${lower_case_name}_BINARY_DIR} + PARENT_SCOPE + ) +endfunction() + +# splits a package option +function(cpm_parse_option OPTION) + string(REGEX MATCH "^[^ ]+" OPTION_KEY "${OPTION}") + string(LENGTH "${OPTION}" OPTION_LENGTH) + string(LENGTH "${OPTION_KEY}" OPTION_KEY_LENGTH) + if(OPTION_KEY_LENGTH STREQUAL OPTION_LENGTH) + # no value for key provided, assume user wants to set option to "ON" + set(OPTION_VALUE "ON") + else() + math(EXPR OPTION_KEY_LENGTH "${OPTION_KEY_LENGTH}+1") + string(SUBSTRING "${OPTION}" "${OPTION_KEY_LENGTH}" "-1" 
OPTION_VALUE) + endif() + set(OPTION_KEY + "${OPTION_KEY}" + PARENT_SCOPE + ) + set(OPTION_VALUE + "${OPTION_VALUE}" + PARENT_SCOPE + ) +endfunction() + +# guesses the package version from a git tag +function(cpm_get_version_from_git_tag GIT_TAG RESULT) + string(LENGTH ${GIT_TAG} length) + if(length EQUAL 40) + # GIT_TAG is probably a git hash + set(${RESULT} + 0 + PARENT_SCOPE + ) + else() + string(REGEX MATCH "v?([0123456789.]*).*" _ ${GIT_TAG}) + set(${RESULT} + ${CMAKE_MATCH_1} + PARENT_SCOPE + ) + endif() +endfunction() + +# guesses if the git tag is a commit hash or an actual tag or a branch name. +function(cpm_is_git_tag_commit_hash GIT_TAG RESULT) + string(LENGTH "${GIT_TAG}" length) + # full hash has 40 characters, and short hash has at least 7 characters. + if(length LESS 7 OR length GREATER 40) + set(${RESULT} + 0 + PARENT_SCOPE + ) + else() + if(${GIT_TAG} MATCHES "^[a-fA-F0-9]+$") + set(${RESULT} + 1 + PARENT_SCOPE + ) + else() + set(${RESULT} + 0 + PARENT_SCOPE + ) + endif() + endif() +endfunction() + +function(cpm_prettify_package_arguments OUT_VAR IS_IN_COMMENT) + set(oneValueArgs + NAME + FORCE + VERSION + GIT_TAG + DOWNLOAD_ONLY + GITHUB_REPOSITORY + GITLAB_REPOSITORY + BITBUCKET_REPOSITORY + GIT_REPOSITORY + SOURCE_DIR + FIND_PACKAGE_ARGUMENTS + NO_CACHE + SYSTEM + GIT_SHALLOW + EXCLUDE_FROM_ALL + SOURCE_SUBDIR + ) + set(multiValueArgs URL OPTIONS DOWNLOAD_COMMAND) + cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + foreach(oneArgName ${oneValueArgs}) + if(DEFINED CPM_ARGS_${oneArgName}) + if(${IS_IN_COMMENT}) + string(APPEND PRETTY_OUT_VAR "#") + endif() + if(${oneArgName} STREQUAL "SOURCE_DIR") + string(REPLACE ${CMAKE_SOURCE_DIR} "\${CMAKE_SOURCE_DIR}" CPM_ARGS_${oneArgName} + ${CPM_ARGS_${oneArgName}} + ) + endif() + string(APPEND PRETTY_OUT_VAR " ${oneArgName} ${CPM_ARGS_${oneArgName}}\n") + endif() + endforeach() + foreach(multiArgName ${multiValueArgs}) + if(DEFINED CPM_ARGS_${multiArgName}) + 
if(${IS_IN_COMMENT}) + string(APPEND PRETTY_OUT_VAR "#") + endif() + string(APPEND PRETTY_OUT_VAR " ${multiArgName}\n") + foreach(singleOption ${CPM_ARGS_${multiArgName}}) + if(${IS_IN_COMMENT}) + string(APPEND PRETTY_OUT_VAR "#") + endif() + string(APPEND PRETTY_OUT_VAR " \"${singleOption}\"\n") + endforeach() + endif() + endforeach() + + if(NOT "${CPM_ARGS_UNPARSED_ARGUMENTS}" STREQUAL "") + if(${IS_IN_COMMENT}) + string(APPEND PRETTY_OUT_VAR "#") + endif() + string(APPEND PRETTY_OUT_VAR " ") + foreach(CPM_ARGS_UNPARSED_ARGUMENT ${CPM_ARGS_UNPARSED_ARGUMENTS}) + string(APPEND PRETTY_OUT_VAR " ${CPM_ARGS_UNPARSED_ARGUMENT}") + endforeach() + string(APPEND PRETTY_OUT_VAR "\n") + endif() + + set(${OUT_VAR} + ${PRETTY_OUT_VAR} + PARENT_SCOPE + ) + +endfunction() diff --git a/docs/Makefile b/docs/Makefile index 6f374b16..b2a6d356 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -12,6 +12,7 @@ DOXYGEN := $(shell command -v doxygen 2> /dev/null) PYTHON ?= python3 GENDIR = $(SOURCEDIR)/_generated +APIDIR = $(SOURCEDIR)/cpp_api/api # Put it first so that "make" without argument is like "make help". 
help: @@ -24,6 +25,7 @@ clean: rm -rf $(BUILDDIR) rm -rf doxygen rm -rf $(GENDIR) + rm -rf $(APIDIR) # Generate Doxygen documentation (only if doxygen is installed) doxygen: diff --git a/docs/scripts/generate_api_index.py b/docs/scripts/generate_api_index.py index 61e07a30..08404bf0 100644 --- a/docs/scripts/generate_api_index.py +++ b/docs/scripts/generate_api_index.py @@ -22,13 +22,13 @@ from __future__ import annotations import argparse -import re +import os +import subprocess import xml.etree.ElementTree as ET from collections import defaultdict from dataclasses import dataclass, field from pathlib import Path - # Root namespace prefix - everything under this is considered public API ROOT_NS = "dftracer::utils" @@ -45,7 +45,7 @@ GUIDE_PAGES: dict[str, str] = { "coro": "coro", "io": "io", - "sqlite": "sqlite", + "rocksdb": "rocksdb", "task_graph": "task_graph", "utilities.common.arrow": "arrow", "utilities.indexer": "indexer", @@ -58,7 +58,7 @@ TITLE_OVERRIDES: dict[str, str] = { "coro": "Coroutine Primitives", "io": "Async I/O", - "sqlite": "SQLite", + "rocksdb": "RocksDB", "task_graph": "Task Graph", "server": "HTTP Server", "call_tree": "Call Tree", @@ -86,7 +86,6 @@ } - def is_inner_type(name: str, all_names: set[str]) -> bool: """Check if a name is an inner/nested type of another class. @@ -108,6 +107,10 @@ class APIItem: file: str = "" brief: str = "" is_inner: bool = False + line: int | None = None + bodyfile: str = "" + bodystart: int | None = None + bodyend: int | None = None @dataclass @@ -127,8 +130,7 @@ def parse_doxygen_xml(xml_dir: Path) -> list[APIItem]: index_path = xml_dir / "index.xml" if not index_path.exists(): raise FileNotFoundError( - f"{index_path} not found. Run doxygen first:\n" - f" cd docs && doxygen Doxyfile" + f"{index_path} not found. 
Run doxygen first:\n cd docs && doxygen Doxyfile" ) tree = ET.parse(index_path) @@ -159,6 +161,10 @@ def parse_doxygen_xml(xml_dir: Path) -> list[APIItem]: file_path = "" brief = "" + line = None + bodyfile = "" + bodystart = None + bodyend = None detail_xml = xml_dir / f"{refid}.xml" if detail_xml.exists(): try: @@ -167,6 +173,21 @@ def parse_doxygen_xml(xml_dir: Path) -> list[APIItem]: loc = droot.find(".//location") if loc is not None: file_path = loc.get("file", "") + bodyfile = loc.get("bodyfile", "") + line_attr = loc.get("line") + bodystart_attr = loc.get("bodystart") + bodyend_attr = loc.get("bodyend") + line = int(line_attr) if line_attr and line_attr.isdigit() else None + bodystart = ( + int(bodystart_attr) + if bodystart_attr and bodystart_attr.isdigit() + else None + ) + bodyend = ( + int(bodyend_attr) + if bodyend_attr and bodyend_attr.isdigit() + else None + ) bd = droot.find(".//briefdescription/para") if bd is not None and bd.text: brief = bd.text.strip() @@ -186,6 +207,10 @@ def parse_doxygen_xml(xml_dir: Path) -> list[APIItem]: refid=refid, file=file_path, brief=brief, + line=line, + bodyfile=bodyfile, + bodystart=bodystart, + bodyend=bodyend, ) ) @@ -264,9 +289,7 @@ def discover_modules(items: list[APIItem]) -> list[Module]: # Build Module objects modules: list[Module] = [] for ns_suffix in sorted(final.keys()): - full_ns = ( - f"{ROOT_NS}::{ns_suffix.replace('.', '::')}" if ns_suffix else ROOT_NS - ) + full_ns = f"{ROOT_NS}::{ns_suffix.replace('.', '::')}" if ns_suffix else ROOT_NS title = TITLE_OVERRIDES.get(ns_suffix, _auto_title(ns_suffix)) filename = _ns_to_filename(ns_suffix) guide_page = GUIDE_PAGES.get(ns_suffix) @@ -309,7 +332,101 @@ def _ns_to_filename(ns_suffix: str) -> str: return ns_suffix.replace(".", "/") -def generate_module_rst(mod: Module) -> str: +def detect_repo_url(repo_root: Path) -> str: + """Detect the GitHub repository URL for source links.""" + repo = os.environ.get("READTHEDOCS_GIT_REPOSITORY") + if repo: + repo = 
repo.removesuffix(".git") + if repo.startswith("git@github.com:"): + return repo.replace("git@github.com:", "https://github.com/", 1) + if repo.startswith("https://github.com/"): + return repo + if repo.startswith("github.com/"): + return f"https://{repo}" + + repo = os.environ.get("GITHUB_REPOSITORY") + if repo: + return f"https://github.com/{repo}" + + try: + remote = ( + subprocess.check_output( + ["git", "remote", "get-url", "origin"], + cwd=repo_root, + text=True, + ) + .strip() + .removesuffix(".git") + ) + if remote.startswith("git@github.com:"): + return remote.replace("git@github.com:", "https://github.com/", 1) + if remote.startswith("https://github.com/"): + return remote + except Exception: + pass + + return "https://github.com/LLNL/dftracer-utils" + + +def detect_source_ref(repo_root: Path) -> str: + """Detect the git ref used for source links.""" + for env_name in ("READTHEDOCS_GIT_COMMIT_HASH", "GITHUB_SHA"): + value = os.environ.get(env_name) + if value: + return value + + try: + return ( + subprocess.check_output( + ["git", "rev-parse", "HEAD"], + cwd=repo_root, + text=True, + ) + .strip() + ) + except Exception: + return "develop" + + +def resolve_repo_path(repo_root: Path, item: APIItem) -> str | None: + """Resolve a Doxygen location path to a repo-relative source file.""" + candidates = [] + if item.bodyfile: + candidates.append(item.bodyfile) + if item.file: + candidates.append(item.file) + + for candidate in candidates: + rel = Path(candidate) + for base in (repo_root / "include", repo_root / "src"): + full = base / rel + if full.exists(): + return full.relative_to(repo_root).as_posix() + return None + + +def source_link(repo_root: Path, repo_url: str, source_ref: str, item: APIItem) -> str | None: + """Build a GitHub source link for an API item.""" + rel = resolve_repo_path(repo_root, item) + if rel is None: + return None + + start = item.bodystart or item.line + end = item.bodyend or start + url = f"{repo_url}/blob/{source_ref}/{rel}" + if 
start is not None: + url += f"#L{start}" + if end is not None and end != start: + url += f"-L{end}" + return url + + +def generate_module_rst( + mod: Module, + repo_root: Path, + repo_url: str, + source_ref: str, +) -> str: """Generate RST for a single module page.""" mod.items.sort(key=lambda x: (x.is_inner, x.name)) @@ -320,15 +437,19 @@ def generate_module_rst(mod: Module) -> str: lines.append(f"Namespace: ``{mod.full_ns}``") lines.append("") if mod.guide_page: - lines.append( - f"For usage guide and examples, see :doc:`/cpp_api/{mod.guide_page}`." - ) + lines.append(f"For usage guide and examples, see :doc:`/cpp_api/{mod.guide_page}`.") lines.append("") top_level = [i for i in mod.items if not i.is_inner] for item in top_level: directive = "doxygenclass" if item.kind == "class" else "doxygenstruct" + link = source_link(repo_root, repo_url, source_ref, item) + if link: + lines.append(f".. rst-class:: api-source-link") + lines.append("") + lines.append(f" `source <{link}>`_") + lines.append("") lines.append(f".. 
{directive}:: {item.name}") lines.append(" :project: dftracer-utils") lines.append(" :members:") @@ -360,7 +481,7 @@ def _build_toctree_hierarchy( # Register all ancestor directories segments = parent_dir.split("/") for i in range(len(segments)): - ancestor = "/".join(segments[: i]) + ancestor = "/".join(segments[:i]) child = "/".join(segments[: i + 1]) dirs[ancestor].add(child) @@ -427,13 +548,13 @@ def _generate_dir_index( # Subdirectories (link to their index) child_dirs = sorted(dirs.get(dir_path, set())) for child in child_dirs: - rel = child[len(dir_path):].lstrip("/") if dir_path else child + rel = child[len(dir_path) :].lstrip("/") if dir_path else child entries.append(f"{rel}/index") # Leaf modules in this directory leaves = sorted(dir_leaves.get(dir_path, []), key=lambda m: m.filename) for mod in leaves: - rel = mod.filename[len(dir_path):].lstrip("/") if dir_path else mod.filename + rel = mod.filename[len(dir_path) :].lstrip("/") if dir_path else mod.filename entries.append(rel) if entries: @@ -481,6 +602,9 @@ def _generate_dir_index( def generate(xml_dir: Path, output_dir: Path) -> None: """Main generation entry point. 
Called by conf.py or CLI.""" + repo_root = output_dir.resolve().parents[3] + repo_url = detect_repo_url(repo_root) + source_ref = detect_source_ref(repo_root) items = parse_doxygen_xml(xml_dir) print(f" Found {len(items)} public API items") @@ -488,12 +612,19 @@ def generate(xml_dir: Path, output_dir: Path) -> None: # Generate per-module pages output_dir.mkdir(parents=True, exist_ok=True) + expected_paths = {output_dir / f"{mod.filename}.rst" for mod in modules} for mod in modules: - rst = generate_module_rst(mod) + rst = generate_module_rst(mod, repo_root, repo_url, source_ref) out_path = output_dir / f"{mod.filename}.rst" out_path.parent.mkdir(parents=True, exist_ok=True) out_path.write_text(rst) + for stale in output_dir.rglob("*.rst"): + if stale.name == "index.rst": + continue + if stale not in expected_paths: + stale.unlink() + # Generate index pages at each directory level generate_index_rst(modules, output_dir) diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css new file mode 100644 index 00000000..236a5b96 --- /dev/null +++ b/docs/source/_static/custom.css @@ -0,0 +1,19 @@ +.api-source-link { + float: right; + margin: 0 0 0.35rem 1rem; + font-size: 0.9rem; + line-height: 1.2; +} + +.api-source-link a::before { + content: "["; +} + +.api-source-link a::after { + content: "]"; +} + +.api-source-link + dl.cpp, +.api-source-link + dl.c { + margin-top: 0; +} diff --git a/docs/source/api/indexer.rst b/docs/source/api/indexer.rst index 0a83c7ba..6be94a3e 100644 --- a/docs/source/api/indexer.rst +++ b/docs/source/api/indexer.rst @@ -1,12 +1,13 @@ Indexer Module ============== -The indexer module provides functionality for indexing and searching gzip trace files. +The indexer module provides functionality for indexing and searching gzip trace +files using a root-local ``.dftindex`` store. Indexer Class ------------- -.. 
autoclass:: dftracer.utils.Indexer(gz_path: str, idx_path: str | None = None, checkpoint_size: int = 1048576, force_rebuild: bool = False, build_bloom: bool = False, build_manifest: bool = False, index_threshold: int = 8388608, runtime: Runtime | None = None) +.. autoclass:: dftracer.utils.Indexer(gz_path: str, index_path: str | None = None, checkpoint_size: int = 1048576, force_rebuild: bool = False, build_bloom: bool = False, build_manifest: bool = False, index_threshold: int = 8388608, runtime: Runtime | None = None) :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/trace_reader.rst b/docs/source/api/trace_reader.rst index 4c9ac7dd..ee9fc936 100644 --- a/docs/source/api/trace_reader.rst +++ b/docs/source/api/trace_reader.rst @@ -2,7 +2,8 @@ TraceReader Module ================== The ``TraceReader`` is the recommended way to read trace files. It auto-selects -sequential or indexed reading based on whether an ``.idx`` sidecar exists. +sequential or indexed reading based on whether a root-local ``.dftindex`` +RocksDB store exists. TraceReader Class ----------------- @@ -74,7 +75,7 @@ File Metadata ------------- ``get_max_bytes()`` and ``get_num_lines()`` return file metadata without -reading the full file (when an index exists): +reading the full file (when a ``.dftindex`` RocksDB index store exists): .. 
code-block:: python diff --git a/docs/source/conf.py b/docs/source/conf.py index ff04fcc3..e46e120a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -4,13 +4,15 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html import os +import ast +import inspect +import importlib import subprocess import sys +import types from pathlib import Path - -# Don't add project root to path - we want to use the installed package from site-packages -# If we add the project root, Python will try to import from source which doesn't have the compiled .so -# sys.path.insert(0, str(Path(__file__).parent.parent.parent)) +from types import TracebackType +from typing import Iterator # Auto-generate Mermaid class diagrams from Doxygen XML before building _docs_dir = Path(__file__).parent.parent # docs/ @@ -48,20 +50,833 @@ check=False, ) -# Mock imports for packages that may not be available during doc build +ON_READTHEDOCS = os.environ.get("READTHEDOCS", "").lower() == "true" +PYTHON_SOURCE_DIR = _docs_dir.parent / "python" autodoc_mock_imports = [] -# Try to import the package + +def _install_rtd_extension_stub() -> None: + """Install a lightweight stub for the native extension on RTD.""" + + ext_name = "dftracer.utils.dftracer_utils_ext" + if ext_name in sys.modules: + return + + ext = types.ModuleType(ext_name) + JSONPrimitive = str | int | float | bool | None + + class _BaseNative: + """RTD stub for native extension classes.""" + pass + + class TaskHandle(_BaseNative): + """Handle to a submitted task. + + Returned by asynchronous utility calls and by + :class:`dftracer.utils.Runtime`. The handle can be waited on, + queried for completion, or used to fetch the task result. 
+ """ + + name = "" + task_id = 0 + + def get(self) -> object | None: + """Block until the task completes and return its result.""" + return None + + def wait(self) -> None: + """Block until the task completes.""" + return None + + def done(self) -> bool: + """Return ``True`` when the task has completed.""" + return True + + class Runtime(_BaseNative): + """Lightweight task runtime for native and Python work. + + The runtime owns the executor threads used by coroutine-backed + readers, indexers, and utilities. The higher-level Python + wrapper in :mod:`dftracer.utils.runtime` builds on this native + object to support Python callables and richer task tracking. + """ + + threads = 0 + + def __init__(self, threads: int = 0) -> None: + """Create a runtime with an optional worker-thread count.""" + super().__init__() + self.threads = threads + + def shutdown(self) -> None: + """Stop the runtime and release worker resources.""" + return None + + def wait_all(self) -> None: + """Block until all submitted native work completes.""" + return None + + def get_progress(self) -> dict[str, object]: + """Return runtime progress metadata.""" + return {} + + def is_responsive(self) -> bool: + """Return whether the watchdog still considers the runtime healthy.""" + return True + + def set_timeout(self, global_ms: int = 0) -> None: + """Set a global watchdog timeout in milliseconds.""" + return None + + def set_default_task_timeout(self, ms: int = 0) -> None: + """Set the default per-task timeout in milliseconds.""" + return None + + def __enter__(self) -> "Runtime": + """Enter the runtime context manager.""" + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + """Exit the runtime context manager.""" + return None + + class IndexerCheckpoint(_BaseNative): + """Information about a single checkpoint in a ``.dftindex`` store. 
+ + Checkpoints map compressed and uncompressed offsets and carry + per-chunk metadata used for seeking and chunk-level pruning. + """ + + checkpoint_idx = 0 + uc_offset = 0 + uc_size = 0 + c_offset = 0 + c_size = 0 + bits = 0 + num_lines = 0 + + class Indexer(_BaseNative): + """Build and query a root-local ``.dftindex`` RocksDB store. + + The indexer extracts checkpoints and optional bloom/manifest + data for a compressed DFTracer trace. Readers and higher-level + utilities use this store for chunk pruning and random access. + """ + + def __init__( + self, + gz_path: str, + index_path: str | None = None, + checkpoint_size: int = 1048576, + force_rebuild: bool = False, + build_bloom: bool = False, + build_manifest: bool = False, + index_threshold: int = 8388608, + runtime: Runtime | None = None, + ) -> None: + """Create an indexer for a compressed DFTracer trace.""" + self.gz_path = gz_path + self.index_path = index_path or "" + self.checkpoint_size = checkpoint_size + self.has_bloom = build_bloom + self.has_manifest = build_manifest + + def build(self) -> None: + """Build the index store for the configured trace file.""" + return None + + def need_rebuild(self) -> bool: + """Return whether the index is missing or stale.""" + return False + + def exists(self) -> bool: + """Return whether the index store already exists.""" + return False + + def get_max_bytes(self) -> int: + """Return the maximum decompressed byte position in the trace.""" + return 0 + + def get_num_lines(self) -> int: + """Return the number of lines recorded in the index.""" + return 0 + + def get_checkpoints(self) -> list["IndexerCheckpoint"]: + """Return all checkpoints stored for the trace.""" + return [] + + def find_checkpoint(self, target_offset: int) -> "IndexerCheckpoint | None": + """Return the checkpoint closest to a decompressed offset.""" + return None + + def close(self) -> None: + """Release this Python wrapper's native indexer handle.""" + return None + + def __enter__(self) -> 
"Indexer": + """Enter the indexer context manager.""" + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + """Exit the indexer context manager.""" + return None + + class JSON(_BaseNative): + """Lazy JSON wrapper backed by yyjson. + + Nested objects are exposed as additional :class:`JSON` wrappers + so callers can inspect large trace records without eagerly + converting the entire payload to Python dictionaries. + """ + + def __init__(self, json_str: str) -> None: + """Create a lazy JSON wrapper from a JSON string.""" + self._json_str = json_str + + def get( + self, + key: str, + default: "JSON | JSONPrimitive" = None, + ) -> "JSON | JSONPrimitive": + """Look up a key and return ``default`` when it is absent.""" + return default + + def keys(self) -> list[str]: + """Return the keys in the current JSON object.""" + return [] + + def values(self) -> list["JSON | JSONPrimitive"]: + """Return the values in the current JSON object.""" + return [] + + def items(self) -> list[tuple[str, "JSON | JSONPrimitive"]]: + """Return key-value pairs in the current JSON object.""" + return [] + + def unwrap(self) -> dict[str, object] | list[object] | JSONPrimitive: + """Convert the lazy wrapper into native Python data.""" + return {} + + def copy(self) -> "JSON": + """Return a shallow copy of the current lazy JSON wrapper.""" + return self + + def __contains__(self, key: str) -> bool: + """Return ``True`` when a key exists in the object.""" + return False + + def __getitem__(self, key: str) -> "JSON | JSONPrimitive": + """Return a field value or nested :class:`JSON` wrapper.""" + raise KeyError(key) + + def __len__(self) -> int: + """Return the number of items in the current JSON object.""" + return 0 + + def __bool__(self) -> bool: + """Return whether the current JSON value is non-empty.""" + return False + + def __str__(self) -> str: + """Return a JSON-like string 
representation.""" + return "{}" + + def __repr__(self) -> str: + """Return a developer-facing representation.""" + return "JSON('{}')" + + class TraceReader(_BaseNative): + """Read DFTracer traces with optional index-assisted pruning. + + ``TraceReader`` chooses between sequential and indexed access + based on the file format and the presence of a shared + root-local ``.dftindex`` store. It exposes line, raw-byte, + JSON, and Arrow-based views over the same trace data. + """ + + def __init__( + self, + file_path: str, + index_dir: str = "", + checkpoint_size: int = 33554432, + auto_build_index: bool = False, + index_threshold: int = 8388608, + runtime: Runtime | None = None, + ) -> None: + """Create a trace reader for plain or compressed DFTracer files.""" + self.file_path = file_path + self.index_dir = index_dir + self.checkpoint_size = checkpoint_size + self.auto_build_index = auto_build_index + self.index_threshold = index_threshold + + def read_lines( + self, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: str | None = None, + ) -> list[str]: + """Materialize decoded lines into a Python list. + + Supports optional line/byte ranges and query-based filtering. + """ + return [] + + def iter_lines( + self, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: str | None = None, + ) -> Iterator[str]: + """Stream decoded lines from the trace. + + The returned iterator yields one UTF-8 decoded line at a time. + """ + return iter(()) + + def iter_raw( + self, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + line_aligned: bool = True, + multi_line: bool = True, + buffer_size: int = 4194304, + query: str | None = None, + ) -> Iterator[bytes]: + """Stream raw byte chunks from the trace. 
+ + Byte-range reads can be aligned to line boundaries and can + optionally return multi-line chunks. + """ + return iter(()) + + def read_raw( + self, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + line_aligned: bool = True, + multi_line: bool = True, + buffer_size: int = 4194304, + query: str | None = None, + ) -> list[bytes]: + """Materialize raw byte chunks into a Python list.""" + return [] + + def iter_lines_json( + self, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: str | None = None, + ) -> Iterator["JSON"]: + """Stream lazy :class:`JSON` objects for trace events.""" + return iter(()) + + def read_lines_json( + self, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: str | None = None, + ) -> list["JSON"]: + """Materialize trace events as lazy :class:`JSON` objects.""" + return [] + + def iter_arrow( + self, + batch_size: int = 10000, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: str | None = None, + ) -> Iterator["ArrowBatch"]: + """Stream Arrow batches parsed from trace events.""" + return iter(()) + + def read_arrow( + self, + batch_size: int = 10000, + start_line: int = 0, + end_line: int = 0, + start_byte: int = 0, + end_byte: int = 0, + buffer_size: int = 4194304, + query: str | None = None, + ) -> "ArrowTable | None": + """Materialize Arrow batches as a single table-like result.""" + return None + + def get_max_bytes(self) -> int: + """Return indexed decompressed size when available.""" + return 0 + + def get_num_lines(self) -> int: + """Return indexed line count when available.""" + return 0 + + def __enter__(self) -> "TraceReader": + """Enter the trace-reader context manager.""" + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: 
BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + """Exit the trace-reader context manager.""" + return None + + class AggregatorUtility(_BaseNative): + """Aggregate trace events into Arrow-ready time buckets.""" + + def __init__(self, runtime: Runtime | None = None) -> None: + """Create an aggregation utility bound to an optional runtime.""" + self.runtime = runtime + + def process( + self, + source_dir: str, + output_path: str = "", + time_interval_ms: float = 5000.0, + query: str = "", + index_dir: str = "", + force_rebuild: bool = False, + custom_metric_fields: list[str] | None = None, + compute_percentiles: bool = False, + ) -> "ArrowTable | None": + """Aggregate trace events into a materialized Arrow-style result.""" + return None + + def iter_arrow( + self, + source_dir: str, + output_path: str = "", + time_interval_ms: float = 5000.0, + query: str = "", + index_dir: str = "", + force_rebuild: bool = False, + custom_metric_fields: list[str] | None = None, + compute_percentiles: bool = False, + ) -> Iterator["ArrowBatch"]: + """Stream Arrow batches for aggregated trace metrics.""" + return iter(()) + + class ComparatorUtility(_BaseNative): + """Compare baseline and variant traces.""" + + def __init__(self, runtime: Runtime | None = None) -> None: + """Create a comparator utility bound to an optional runtime.""" + self.runtime = runtime + + def compare( + self, + baseline: str, + variant: str, + query: str = "", + time_interval_ms: float = 5000.0, + threshold: float = 0.0, + index_dir: str = "", + force_rebuild: bool = False, + ) -> "ArrowTable | None": + """Return comparison results as Arrow-compatible output.""" + return None + + def compare_json( + self, + baseline: str, + variant: str, + query: str = "", + time_interval_ms: float = 5000.0, + threshold: float = 0.0, + index_dir: str = "", + force_rebuild: bool = False, + ) -> str: + """Return comparison results as JSON.""" + return "{}" + + def compare_table( + self, + baseline: str, + 
variant: str, + query: str = "", + time_interval_ms: float = 5000.0, + threshold: float = 0.0, + index_dir: str = "", + force_rebuild: bool = False, + ) -> str: + """Return comparison results as a formatted text table.""" + return "" + + class StatisticsQueryUtility(_BaseNative): + """Query summary or top-N statistics from a trace.""" + + def __init__(self, runtime: Runtime | None = None) -> None: + """Create a statistics-query utility bound to an optional runtime.""" + self.runtime = runtime + + def process( + self, + file_path: str, + query_type: str = "summary", + top_n: int = 10, + index_dir: str = "", + auto_build_index: bool = False, + index_threshold: int = 8388608, + ) -> dict[str, object]: + """Return scalar statistics derived from the trace.""" + return {} + + class StatisticsAggregatorUtility(_BaseNative): + """Aggregate core statistics from a trace into a Python dictionary.""" + + def __init__(self, runtime: Runtime | None = None) -> None: + """Create a statistics-aggregator utility bound to an optional runtime.""" + self.runtime = runtime + + def process( + self, + file_path: str, + index_dir: str = "", + auto_build_index: bool = False, + index_threshold: int = 8388608, + ) -> dict[str, object]: + """Return aggregate trace statistics.""" + return {} + + class MetadataCollectorUtility(_BaseNative): + """Collect file metadata and index-aware trace metadata.""" + + def __init__(self, runtime: Runtime | None = None) -> None: + """Create a metadata collector bound to an optional runtime.""" + self.runtime = runtime + + def process( + self, + file_path: str, + index_dir: str = "", + checkpoint_size: int = 33554432, + force_rebuild: bool = False, + index_threshold: int = 8388608, + ) -> dict[str, object]: + """Return metadata for a DFTracer trace file.""" + return {} + + class ReorganizationPlannerUtility(_BaseNative): + """Build a semantic reorganization plan for trace files.""" + + def __init__(self, runtime: Runtime | None = None) -> None: + """Create a 
reorganization planner bound to an optional runtime.""" + self.runtime = runtime + + def process( + self, + source_files: list[str], + groups: list[dict[str, object]], + index_dir: str = "", + checkpoint_size: int = 33554432, + force_rebuild: bool = False, + index_threshold: int = 8388608, + ) -> dict[str, object]: + """Return a reorganization plan for the requested groups.""" + return {} + + class ReconstructionPlannerUtility(_BaseNative): + """Build a reconstruction plan from reorganized traces.""" + + def __init__(self, runtime: Runtime | None = None) -> None: + """Create a reconstruction planner bound to an optional runtime.""" + self.runtime = runtime + + def process( + self, + reorganized_files: list[str], + provenance_dir: str = "", + ) -> dict[str, object]: + """Return a reconstruction plan for reorganized trace files.""" + return {} + + ext.Indexer = Indexer + ext.IndexerCheckpoint = IndexerCheckpoint + ext.JSON = JSON + ext.Runtime = Runtime + ext.TaskHandle = TaskHandle + ext.TraceReader = TraceReader + ext.AggregatorUtility = AggregatorUtility + ext.ComparatorUtility = ComparatorUtility + ext.MetadataCollectorUtility = MetadataCollectorUtility + ext.ReconstructionPlannerUtility = ReconstructionPlannerUtility + ext.ReorganizationPlannerUtility = ReorganizationPlannerUtility + ext.StatisticsAggregatorUtility = StatisticsAggregatorUtility + ext.StatisticsQueryUtility = StatisticsQueryUtility + + def get_default_runtime() -> Runtime: + """Return the process-wide default runtime.""" + return Runtime() + + def set_default_runtime(runtime: Runtime | None = None) -> None: + """Replace or clear the process-wide default runtime.""" + return None + + ext.get_default_runtime = get_default_runtime + ext.set_default_runtime = set_default_runtime + for name in [ + "AggregatorUtility", + "ComparatorUtility", + "Indexer", + "IndexerCheckpoint", + "JSON", + "MetadataCollectorUtility", + "ReconstructionPlannerUtility", + "ReorganizationPlannerUtility", + "Runtime", + 
"StatisticsAggregatorUtility", + "StatisticsQueryUtility", + "TaskHandle", + "TraceReader", + ]: + getattr(ext, name).__module__ = ext_name + ext.__all__ = [ + "AggregatorUtility", + "ComparatorUtility", + "Indexer", + "IndexerCheckpoint", + "JSON", + "MetadataCollectorUtility", + "ReconstructionPlannerUtility", + "ReorganizationPlannerUtility", + "Runtime", + "StatisticsAggregatorUtility", + "StatisticsQueryUtility", + "TaskHandle", + "TraceReader", + "get_default_runtime", + "set_default_runtime", + ] + sys.modules[ext_name] = ext + + +def _repo_url() -> str: + """Return the GitHub repository URL used for source links.""" + repo = os.environ.get("READTHEDOCS_GIT_REPOSITORY") + if repo: + repo = repo.removesuffix(".git") + if repo.startswith("git@github.com:"): + repo = repo.replace("git@github.com:", "https://github.com/", 1) + elif repo.startswith("https://github.com/"): + return repo + if repo.startswith("github.com/"): + return f"https://{repo}" + + repo = os.environ.get("GITHUB_REPOSITORY") + if repo: + return f"https://github.com/{repo}" + + try: + remote = ( + subprocess.check_output( + ["git", "remote", "get-url", "origin"], + cwd=_docs_dir.parent, + text=True, + ) + .strip() + .removesuffix(".git") + ) + if remote.startswith("git@github.com:"): + return remote.replace("git@github.com:", "https://github.com/", 1) + if remote.startswith("https://github.com/"): + return remote + except Exception: + pass + + return "https://github.com/LLNL/dftracer-utils" + + +def _source_ref() -> str: + """Return the git ref used for source links.""" + for env_name in ("READTHEDOCS_GIT_COMMIT_HASH", "GITHUB_SHA"): + value = os.environ.get(env_name) + if value: + return value + try: + return ( + subprocess.check_output( + ["git", "rev-parse", "HEAD"], + cwd=_docs_dir.parent, + text=True, + ) + .strip() + ) + except Exception: + return "develop" + +REPO_URL = _repo_url() +SOURCE_REF = _source_ref() + + +def _pyi_target_for_extension(fullname: str) -> tuple[Path, list[str]] | 
None: + """Map extension-exported objects to their public type-stub file.""" + top = fullname.split(".", 1)[0] + utility_map = { + "AggregatorUtility": "python/dftracer/utils/utilities/_aggregator.pyi", + "ComparatorUtility": "python/dftracer/utils/utilities/_comparator.pyi", + "MetadataCollectorUtility": ( + "python/dftracer/utils/utilities/_metadata_collector.pyi" + ), + "StatisticsQueryUtility": ( + "python/dftracer/utils/utilities/_statistics_query.pyi" + ), + "StatisticsAggregatorUtility": ( + "python/dftracer/utils/utilities/_statistics_aggregator.pyi" + ), + "ReorganizationPlannerUtility": ( + "python/dftracer/utils/utilities/_reorganization_planner.pyi" + ), + "ReconstructionPlannerUtility": ( + "python/dftracer/utils/utilities/_reconstruction_planner.pyi" + ), + } + rel_path = utility_map.get(top, "python/dftracer/utils/dftracer_utils_ext.pyi") + return (_docs_dir.parent / rel_path, fullname.split(".")) + + +def _find_symbol_lines(path: Path, parts: list[str]) -> tuple[int, int] | None: + """Find source lines for a class/function/method in a Python source or stub file.""" + try: + tree = ast.parse(path.read_text()) + except Exception: + return None + + node = tree + current_body = tree.body + for part in parts: + match = None + for child in current_body: + if isinstance(child, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)): + if child.name == part: + match = child + break + if match is None: + return None + node = match + current_body = getattr(match, "body", []) + + start = getattr(node, "lineno", None) + end = getattr(node, "end_lineno", start) + if start is None: + return None + return (start, end or start) + + +def _github_url(path: Path, lines: tuple[int, int] | None) -> str | None: + """Build a GitHub blob URL for a repo-relative path and optional lines.""" + try: + rel = path.resolve().relative_to(_docs_dir.parent.resolve()).as_posix() + except Exception: + return None + url = f"{REPO_URL}/blob/{SOURCE_REF}/{rel}" + if lines is not None: + 
start, end = lines + url += f"#L{start}" + if end != start: + url += f"-L{end}" + return url + + +def linkcode_resolve(domain: str, info: dict[str, str]) -> str | None: + """Resolve Python objects to GitHub source links.""" + if domain != "py": + return None + + module_name = info.get("module") + fullname = info.get("fullname") + if not module_name or not fullname: + return None + + try: + module = importlib.import_module(module_name) + except Exception: + return None + + obj = module + for part in fullname.split("."): + obj = getattr(obj, part, None) + if obj is None: + return None + + obj_module = getattr(obj, "__module__", module_name) + if obj_module == "dftracer.utils.dftracer_utils_ext": + target = _pyi_target_for_extension(fullname) + if target is None: + return None + path, parts = target + lines = _find_symbol_lines(path, parts) + return _github_url(path, lines) + + try: + source_file = Path(inspect.getsourcefile(obj) or inspect.getfile(obj)) + _, start = inspect.getsourcelines(obj) + end = start + max(len(inspect.getsource(obj).splitlines()) - 1, 0) + return _github_url(source_file, (start, end)) + except Exception: + return None + + +if ON_READTHEDOCS: + sys.path.insert(0, str(PYTHON_SOURCE_DIR)) + _install_rtd_extension_stub() + autodoc_mock_imports = [ + "pyarrow", + "dask", + "dask.distributed", + ] + try: import dftracer.utils print("✓ dftracer.utils package found and imported successfully.") except (ImportError, ModuleNotFoundError) as e: - print(f"Warning: dftracer.utils package not found: {e}") - print("API documentation will have limited information.") - print("To generate full API docs, install the package: pip install -e .") - # Don't mock - let it fail to show what's missing - # autodoc_mock_imports = ['dftracer', 'dftracer.utils'] + if not ON_READTHEDOCS and PYTHON_SOURCE_DIR.exists(): + print(f"Warning: installed dftracer.utils package not found: {e}") + print("Falling back to source package with RTD extension stubs.") + sys.path.insert(0, 
str(PYTHON_SOURCE_DIR)) + _install_rtd_extension_stub() + autodoc_mock_imports = [ + "pyarrow", + "dask", + "dask.distributed", + ] + import dftracer.utils + else: + print(f"Warning: dftracer.utils package not found: {e}") + print("API documentation will have limited information.") + print("To generate full API docs, install the package: pip install -e .") # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information @@ -160,6 +974,7 @@ html_theme = "furo" html_static_path = ["_static"] +html_css_files = ["custom.css"] # Search configuration html_search_language = "en" diff --git a/docs/source/cpp_api/coro.rst b/docs/source/cpp_api/coro.rst index 462f9a34..8156f59a 100644 --- a/docs/source/cpp_api/coro.rst +++ b/docs/source/cpp_api/coro.rst @@ -11,7 +11,28 @@ C++20 coroutine primitives for asynchronous task execution. All classes are in t For usage examples and task scheduling, see :doc:`/pipeline` and :doc:`pipeline/tasks`. -.. mermaid:: ../_generated/coro.mmd +.. 
mermaid:: + + graph TD + Coro["Coro\nfire-and-forget primitive"] + CoroTask["CoroTask<T>\nawaitable task result"] + SpawnFuture["SpawnFuture<T>\nresult of CoroScope::spawn()"] + JoinHandle["JoinHandle\nstructured join barrier"] + Channel["Channel<T>\nasync producer/consumer queue"] + Producer["ChannelProducer / ProducerGuard\nproducer lifetime management"] + Generator["Generator<T>\nsynchronous lazy sequence"] + AsyncGenerator["AsyncGenerator<T>\nasynchronous lazy sequence"] + Yield["yield() / maybe_yield()\ncooperative scheduling"] + + CoroTask --> Coro + SpawnFuture --> CoroTask + JoinHandle --> Coro + Channel --> Producer + CoroTask --> Channel + Coro --> Yield + CoroTask --> Yield + AsyncGenerator --> CoroTask + Generator --> Coro Coro ---- @@ -457,4 +478,3 @@ Usage example: } else { process(result.result); } - diff --git a/docs/source/cpp_api/index.rst b/docs/source/cpp_api/index.rst index 88f4190c..eeb614cb 100644 --- a/docs/source/cpp_api/index.rst +++ b/docs/source/cpp_api/index.rst @@ -12,6 +12,7 @@ This section contains the C++ API documentation for dftracer utilities. :caption: C++ Components: core_infrastructure + rocksdb reader indexer pipeline @@ -20,7 +21,6 @@ This section contains the C++ API documentation for dftracer utilities. 
utilities arrow io - sqlite scheduler dft_aggregators dft_indexing @@ -39,7 +39,6 @@ The dftracer utilities C++ library is organized into several namespaces: - ``dftracer::utils::task_graph`` - DAG-based task graph builder - ``dftracer::utils::utilities`` - Composable processing utilities - ``dftracer::utils::io`` - Async I/O backends (io_uring, kqueue, thread pool) -- ``dftracer::utils::sqlite`` - Async SQLite database operations - ``dftracer::utils::utilities::composites::dft::aggregators`` - Event aggregation pipeline - ``dftracer::utils::utilities::composites::dft::indexing`` - Bloom filter indexing system - ``dftracer::utils::utilities::common::arrow`` - Arrow data interchange (RecordBatchBuilder, IpcWriter) @@ -67,11 +66,6 @@ The dftracer utilities C++ library is organized into several namespaces: IoAwaitable["IoAwaitable"] end - subgraph SQLite["dftracer::utils::sqlite"] - SqliteDB["SqliteDatabase"] - SqliteAwait["SqliteAwaitable"] - end - subgraph Utilities["dftracer::utils::utilities"] Reader["Reader"] Indexer["Indexer"] @@ -100,7 +94,6 @@ The dftracer utilities C++ library is organized into several namespaces: Pipeline --> Executor Pipeline --> Scheduler Executor --> IoBackend - Executor --> SqliteDB Executor --> CoroTask Watchdog --> Executor TimerService --> Executor diff --git a/docs/source/cpp_api/pipeline/executors.rst b/docs/source/cpp_api/pipeline/executors.rst index a4d5144f..12f66dd0 100644 --- a/docs/source/cpp_api/pipeline/executors.rst +++ b/docs/source/cpp_api/pipeline/executors.rst @@ -101,7 +101,7 @@ timeout thresholds. All fields have sensible defaults. std::size_t io_pool_size = 4; io::IoBackendType io_backend_type = io::IoBackendType::AUTO; unsigned io_batch_threshold = 16; - std::size_t sqlite_pool_size = 2; + std::size_t db_pool_size = 2; }; **Key fields:** @@ -116,8 +116,7 @@ timeout thresholds. All fields have sensible defaults. (``AUTO``, ``IO_URING``, ``THREAD_POOL``). 
- ``io_batch_threshold`` -- Minimum number of I/O operations to batch before submitting to the backend. -- ``sqlite_pool_size`` -- Number of threads in the dedicated SQLite connection - pool. +- ``db_pool_size`` -- Number of threads in the dedicated database work pool. **Example -- high-throughput configuration:** @@ -128,7 +127,7 @@ timeout thresholds. All fields have sensible defaults. .io_pool_size = 8, .io_backend_type = io::IoBackendType::IO_URING, .io_batch_threshold = 32, - .sqlite_pool_size = 4 + .db_pool_size = 4 }; Progress Tracking diff --git a/docs/source/cpp_api/rocksdb.rst b/docs/source/cpp_api/rocksdb.rst new file mode 100644 index 00000000..76a6f77a --- /dev/null +++ b/docs/source/cpp_api/rocksdb.rst @@ -0,0 +1,35 @@ +RocksDB +======= + +The RocksDB layer provides the shared storage backend used by the +root-local ``.dftindex`` and provenance stores introduced by the +RocksDB migration. + +It includes: + +- database wrappers and lifecycle management +- async awaitables for database work on executor-backed threads +- key encoding helpers for typed prefix/range scans +- manager utilities for sharing open database handles across readers, + indexers, and higher-level composites + +Architecture +------------ + +.. mermaid:: + + graph TD + Readers["TraceReader / utilities"] --> Manager["RocksDBManager"] + Indexers["Indexer / provenance writers"] --> Manager + Manager --> Database["RocksDatabase"] + Database --> CFs["Column families"] + Database --> Async["DbAwaitable / rocks::run"] + Database --> Codec["KeyCodec"] + CFs --> Store[".dftindex / provenance store"] + Async --> Runtime["Executor-backed threads"] + Codec --> Store + +See also: + +- :doc:`api/rocksdb` +- :doc:`indexer` diff --git a/docs/source/cpp_api/sqlite.rst b/docs/source/cpp_api/sqlite.rst deleted file mode 100644 index 17597e87..00000000 --- a/docs/source/cpp_api/sqlite.rst +++ /dev/null @@ -1,337 +0,0 @@ -Async SQLite API -================ - -.. 
seealso:: - - For complete class and member documentation, see the - :doc:`API Reference `. - - -Asynchronous SQLite database operations integrated with the dftracer executor and coroutine system. All classes and functions are in the ``dftracer::utils::sqlite`` namespace. - -Overview --------- - -The async SQLite module provides a thin coroutine-aware wrapper around SQLite3 that allows database operations to be performed asynchronously without blocking the executor. Operations are submitted to a dedicated SQLite thread pool and the coroutine is suspended until completion. - -Key Features: - -- **Async execution**: Database operations submit work to a thread pool and suspend via ``co_await`` -- **Sync fallback**: When no executor is active, operations run synchronously inline -- **VFS integration**: Custom SQLite VFS implementation uses async I/O backend for file operations -- **Minimal overhead**: Thin wrapper on top of SQLite3; no ORM abstractions -- **Thread-safe**: All database access is serialized through the thread pool - -.. mermaid:: - - sequenceDiagram - participant Task as CoroTask - participant Await as SqliteAwaitable - participant Pool as SQLite ThreadPool - participant DB as sqlite3 - - Task->>Await: co_await sqlite::run(fn) - Await->>Pool: submit work - Note over Task: suspended - Pool->>DB: execute SQL - DB-->>Pool: result - Pool-->>Await: complete - Await-->>Task: resume with result - -Database Management -------------------- - -The ``SqliteDatabase`` class wraps a SQLite connection: - -Opening a Database -~~~~~~~~~~~~~~~~~~~ - -.. 
code-block:: cpp - - #include - - // Create and open an in-memory database - sqlite::SqliteDatabase db; - db.open(":memory:"); - - // Or open a file-backed database - sqlite::SqliteDatabase db("path/to/db.sqlite"); - - // Check if open - if (db.is_open()) { - // Database is ready - } - - // Get the raw sqlite3* handle for advanced SQLite API - sqlite3 *raw_db = db.get(); - -Custom VFS -~~~~~~~~~~ - -For databases that should use the async I/O backend: - -.. code-block:: cpp - - // Register the dftracer async I/O VFS - // This is typically done once at application startup - sqlite::register_dftracer_sqlite_vfs(io_backend, executor); - - // Then open with the custom VFS - sqlite::SqliteDatabase db; - db.open_with_vfs("trace.db", "dftracer"); - - // Later, unregister when shutting down - sqlite::unregister_dftracer_sqlite_vfs(); - -VFS Implementation Details -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The ``DfTracerSqliteVfs`` is a custom SQLite Virtual File System that: - -- Replaces SQLite's default file I/O with async operations from the I/O backend -- Handles WAL mode, synchronization, and shared memory regions -- Integrates with the Executor to resume coroutines on completion - -Prepared Statements -------------------- - -The ``SqliteStmt`` class wraps a compiled SQL statement: - -Binding Parameters -~~~~~~~~~~~~~~~~~~~ - -SQLite uses placeholders (``?``, ``?1``, ``:name``) in SQL. Bind values before execution: - -.. code-block:: cpp - - #include - - sqlite::SqliteStmt stmt(db, "INSERT INTO logs (id, message, level) VALUES (?, ?, ?)"); - - stmt.bind_int(1, 42); - stmt.bind_text(2, "Connection opened"); - stmt.bind_int(3, INFO_LEVEL); - - // Execute and handle result... 
- -Binding Functions -~~~~~~~~~~~~~~~~~ - -Statement Execution (Async) ---------------------------- - -Use the ``SqliteAwaitable`` template to execute arbitrary database operations asynchronously: - -The Generic ``run()`` Function -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -For simple async database work that doesn't require a ``SqliteDatabase`` object: - -Example: Async Query -^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: cpp - - #include - #include - #include - #include - #include - - using namespace dftracer::utils; - - CoroTask count_events(sqlite::SqliteDatabase& db) { - // Define work to run on the sqlite thread pool - auto result = co_await sqlite::run([&db]() { - sqlite::SqliteStmt stmt(db, "SELECT COUNT(*) FROM events"); - // Use SQLite C API directly - sqlite3 *raw_db = db.get(); - sqlite3_stmt *raw_stmt = stmt.get(); - - int count = 0; - if (sqlite3_step(raw_stmt) == SQLITE_ROW) { - count = sqlite3_column_int(raw_stmt, 0); - } - return count; - }); - - std::cout << "Total events: " << result << std::endl; - } - -Async Submission Helpers -~~~~~~~~~~~~~~~~~~~~~~~~ - -Low-level helpers for integrating with the executor and thread pool: - -Error Handling --------------- - -The ``SqliteError`` exception class represents database errors: - -Error Types -~~~~~~~~~~~ - -Errors are categorized into types: - -- ``DATABASE_ERROR``: SQLite runtime error (e.g., constraint violation, locked database) -- ``STATEMENT_ERROR``: Prepared statement compilation or execution error -- ``OPEN_ERROR``: Database open failure -- ``VFS_ERROR``: VFS registration or I/O error -- ``UNKNOWN_ERROR``: Unexpected error condition - -Example: Error Handling -^^^^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: cpp - - #include - #include - - try { - sqlite::SqliteDatabase db("data.db"); - // ... database operations ... 
- } catch (const sqlite::SqliteError& e) { - if (e.type() == sqlite::SqliteError::OPEN_ERROR) { - std::cerr << "Cannot open database: " << e.what() << std::endl; - } else { - std::cerr << "Database error: " << e.what() << std::endl; - } - } - -Complete Async Example ----------------------- - -A complete example showing async database initialization, insertion, and querying: - -.. code-block:: cpp - - #include - #include - #include - #include - #include - #include - #include - - using namespace dftracer::utils; - - // Initialize database schema async - CoroTask init_db(sqlite::SqliteDatabase& db) { - co_await sqlite::run([&db]() { - sqlite3_exec(db.get(), - "CREATE TABLE IF NOT EXISTS logs (" - " id INTEGER PRIMARY KEY," - " timestamp INTEGER," - " message TEXT" - ");", - nullptr, nullptr, nullptr); - return true; - }); - std::cout << "Database initialized" << std::endl; - } - - // Insert a log entry async - CoroTask insert_log(sqlite::SqliteDatabase& db, int id, - long ts, const std::string& msg) { - co_await sqlite::run([&db, id, ts, &msg]() { - sqlite::SqliteStmt stmt(db, "INSERT INTO logs VALUES (?, ?, ?)"); - stmt.bind_int(1, id); - stmt.bind_int64(2, ts); - stmt.bind_text(3, msg); - - sqlite3_step(stmt.get()); - return true; - }); - std::cout << "Inserted log entry " << id << std::endl; - } - - // Query logs async - CoroTask query_logs(sqlite::SqliteDatabase& db) { - auto count = co_await sqlite::run([&db]() { - sqlite::SqliteStmt stmt(db, "SELECT COUNT(*) FROM logs"); - sqlite3_step(stmt.get()); - return sqlite3_column_int(stmt.get(), 0); - }); - std::cout << "Database has " << count << " log entries" << std::endl; - } - - // Main coroutine - CoroTask main_app() { - sqlite::SqliteDatabase db; - db.open(":memory:"); - - co_await init_db(db); - co_await insert_log(db, 1, 1000, "First event"); - co_await insert_log(db, 2, 2000, "Second event"); - co_await query_logs(db); - } - - int main() { - auto config = PipelineConfig() - .with_name("SqliteExample") - 
.with_compute_threads(1); - - auto task = make_task([](CoroScope& scope) -> CoroTask { - co_await main_app(); - }, "MainApp"); - - Pipeline pipeline(config); - pipeline.set_source({task}); - pipeline.execute(); - return 0; - } - -Raw SQLite API Access ---------------------- - -For advanced use cases, you can access the underlying SQLite C API directly: - -.. code-block:: cpp - - #include - #include - - sqlite::SqliteDatabase db("app.db"); - - // Get raw sqlite3* handle - sqlite3 *raw_db = db.get(); - - // Use any SQLite C API function - const char *sql = "SELECT * FROM users WHERE id = ?"; - sqlite3_stmt *stmt = nullptr; - sqlite3_prepare_v2(raw_db, sql, -1, &stmt, nullptr); - - // ... bind parameters and execute ... - sqlite3_finalize(stmt); - -Sync Operations Outside Executor ---------------------------------- - -``SqliteDatabase`` can be used outside the coroutine executor for synchronous -operations. When ``SqliteAwaitable`` detects no executor thread pool -(``pool_ == nullptr``), it executes the operation inline in ``await_ready()`` -without suspending the coroutine. - -For fully synchronous usage (no executor at all), use ``SqliteDatabase`` -directly with the raw SQLite C API: - -.. 
code-block:: cpp - - #include - - sqlite::SqliteDatabase db("data.db"); - - // Use sqlite3 C API directly - sqlite3 *raw = db.get(); - - sqlite3_exec(raw, "CREATE TABLE IF NOT EXISTS kv (k TEXT, v TEXT)", - nullptr, nullptr, nullptr); - - sqlite3_stmt *stmt = nullptr; - sqlite3_prepare_v2(raw, "INSERT INTO kv VALUES (?, ?)", -1, &stmt, nullptr); - sqlite3_bind_text(stmt, 1, "key", -1, SQLITE_STATIC); - sqlite3_bind_text(stmt, 2, "value", -1, SQLITE_STATIC); - sqlite3_step(stmt); - sqlite3_finalize(stmt); - - db.close(); diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 5d526975..d3637b6d 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -43,20 +43,19 @@ Before building dftracer utilities, ensure you have: - CMake 3.5 or higher - C++20 compatible compiler (GCC 11+, Clang 14+) - zlib development library -- SQLite3 development library - pkg-config On Ubuntu/Debian: .. code-block:: bash - sudo apt-get install cmake build-essential zlib1g-dev libsqlite3-dev pkg-config + sudo apt-get install cmake build-essential zlib1g-dev pkg-config On macOS: .. 
code-block:: bash - brew install cmake zlib sqlite pkg-config + brew install cmake zlib pkg-config Building from Source ~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index d51db6f9..11cbee48 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -246,7 +246,7 @@ Create and use indexes for faster access: from dftracer.utils import Indexer # Create an indexer - indexer = Indexer("trace.pfw.gz", "trace.pfw.gz.idx") + indexer = Indexer("trace.pfw.gz") # Build the index if needed if indexer.need_rebuild(): diff --git a/docs/source/utilities/indexer.rst b/docs/source/utilities/indexer.rst index 8fd06a4b..6306ecd8 100644 --- a/docs/source/utilities/indexer.rst +++ b/docs/source/utilities/indexer.rst @@ -244,6 +244,11 @@ Python API with Indexer("trace.pfw.gz", build_bloom=True) as indexer: indexer.build() # reuses checkpoints, adds bloom only + # Wrapper cleanup only; the shared .dftindex store remains available + indexer = Indexer("trace.pfw.gz") + indexer.build() + indexer.close() + # With explicit Runtime for thread pool control from dftracer.utils import Runtime @@ -270,7 +275,7 @@ Python API partial = reader.read_lines(start_line=100, end_line=200) # Properties - print(reader.has_index) # True if .idx exists + print(reader.has_index) # True if .dftindex exists print(reader.num_lines) # precise line count # Context manager diff --git a/include/dftracer/utils/core/common/constants.h b/include/dftracer/utils/core/common/constants.h index a6a8b740..72e27dcf 100644 --- a/include/dftracer/utils/core/common/constants.h +++ b/include/dftracer/utils/core/common/constants.h @@ -22,7 +22,7 @@ static constexpr std::uint64_t DEFAULT_CHECKPOINT_SIZE = static constexpr std::size_t DEFAULT_INDEX_SIZE_THRESHOLD = 1 * 1024 * 1024; // 1MB extern const char* const& SQL_SCHEMA; -inline const char* EXTENSION = ".idx"; +inline const char* EXTENSION = ".dftindex"; } // namespace indexer namespace reader { @@ -45,7 +45,7 @@ 
static constexpr std::size_t FILE_IO_BUFFER_SIZE = #define DFTRACER_UTILS_DEFAULT_BUFFER_SIZE 65536 #define DFTRACER_UTILS_SKIP_BUFFER_SIZE 131072 #define DFTRACER_UTILS_FILE_IO_BUFFER_SIZE 262144 -#define DFTRACER_UTILS_INDEX_EXTENSION ".idx" +#define DFTRACER_UTILS_INDEX_EXTENSION ".dftindex" extern const char *DFTRACER_UTILS_SQL_SCHEMA; diff --git a/include/dftracer/utils/core/common/scoped_fd.h b/include/dftracer/utils/core/common/scoped_fd.h new file mode 100644 index 00000000..49d3139c --- /dev/null +++ b/include/dftracer/utils/core/common/scoped_fd.h @@ -0,0 +1,44 @@ +#ifndef DFTRACER_UTILS_CORE_COMMON_SCOPED_FD_H +#define DFTRACER_UTILS_CORE_COMMON_SCOPED_FD_H + +#include + +namespace dftracer::utils { + +struct ScopedFd { + int value = -1; + + ScopedFd() = default; + explicit ScopedFd(int fd) : value(fd) {} + + ScopedFd(const ScopedFd&) = delete; + ScopedFd& operator=(const ScopedFd&) = delete; + + ScopedFd(ScopedFd&& other) noexcept : value(other.value) { + other.value = -1; + } + + ScopedFd& operator=(ScopedFd&& other) noexcept { + if (this != &other) { + reset(); + value = other.value; + other.value = -1; + } + return *this; + } + + ~ScopedFd() { reset(); } + + void reset() { + if (value >= 0) { + ::close(value); + value = -1; + } + } + + int get() const { return value; } +}; + +} // namespace dftracer::utils + +#endif diff --git a/include/dftracer/utils/core/env.h b/include/dftracer/utils/core/env.h new file mode 100644 index 00000000..4a8003f9 --- /dev/null +++ b/include/dftracer/utils/core/env.h @@ -0,0 +1,35 @@ +#ifndef DFTRACER_UTILS_CORE_ENV_H +#define DFTRACER_UTILS_CORE_ENV_H + +#include +#include +#include + +namespace dftracer::utils { + +class Env { + public: + template + static std::optional get(std::string_view name); + + static int rocksdb_max_open_files(); +}; + +template +std::optional Env::get(std::string_view name) { + static_assert(sizeof(T) == 0, + "Env::get() requires an explicit specialization"); + (void)name; + return 
std::nullopt; +} + +template <> +std::optional Env::get( + std::string_view name); + +template <> +std::optional Env::get(std::string_view name); + +} // namespace dftracer::utils + +#endif // DFTRACER_UTILS_CORE_ENV_H diff --git a/include/dftracer/utils/core/io/io_backend.h b/include/dftracer/utils/core/io/io_backend.h index 63894f46..232f0dab 100644 --- a/include/dftracer/utils/core/io/io_backend.h +++ b/include/dftracer/utils/core/io/io_backend.h @@ -13,6 +13,8 @@ namespace dftracer::utils::io { +using IoCompletionFn = void (*)(void *context, ssize_t result) noexcept; + /// Backend selection preference. enum class IoBackendType { AUTO, // Runtime detection: io_uring > epoll/kqueue+threadpool > threadpool @@ -46,6 +48,12 @@ class IoBackend { virtual IoAwaitable submit_pread(int fd, void *buf, std::size_t len, off_t offset) = 0; + /// Submit an async positional read with a completion callback. + /// The callback receives either a byte count or a negative errno. + virtual void submit_pread_callback(int fd, void *buf, std::size_t len, + off_t offset, IoCompletionFn completion, + void *context) = 0; + /// Submit an async positional write. Only seekable fds. 
virtual IoAwaitable submit_pwrite(int fd, const void *buf, std::size_t len, off_t offset) = 0; diff --git a/include/dftracer/utils/core/pipeline/executor.h b/include/dftracer/utils/core/pipeline/executor.h index 334c3e6c..f2daf5e2 100644 --- a/include/dftracer/utils/core/pipeline/executor.h +++ b/include/dftracer/utils/core/pipeline/executor.h @@ -42,7 +42,7 @@ struct ExecutorConfig { std::size_t io_pool_size = 4; io::IoBackendType io_backend_type = io::IoBackendType::AUTO; unsigned io_batch_threshold = 16; - std::size_t sqlite_pool_size = 2; + std::size_t db_pool_size = 2; }; /** @@ -229,14 +229,14 @@ class Executor { // I/O backend (owned by executor, created by factory) std::unique_ptr io_backend_; - // Dedicated thread pool for SQLite async operations - std::unique_ptr sqlite_pool_; + // Dedicated thread pool for blocking DB operations. + std::unique_ptr db_pool_; // Configuration (stored from ExecutorConfig) std::size_t io_pool_size_ = 4; io::IoBackendType io_backend_type_ = io::IoBackendType::AUTO; unsigned io_batch_threshold_ = 16; - std::size_t sqlite_pool_size_ = 2; + std::size_t db_pool_size_ = 2; public: /** @@ -306,9 +306,9 @@ class Executor { const io::IoBackend& io_backend() const { return *io_backend_; } /** - * Get the dedicated SQLite thread pool (nullptr if not started). + * Get the dedicated DB thread pool (nullptr if not started). 
*/ - io::IoThreadPool* sqlite_pool() noexcept; + io::IoThreadPool* db_pool() noexcept; /** * Get the executor running on the current worker thread (nullptr diff --git a/include/dftracer/utils/core/pipeline/pipeline_config.h b/include/dftracer/utils/core/pipeline/pipeline_config.h index 1e02257e..5f727620 100644 --- a/include/dftracer/utils/core/pipeline/pipeline_config.h +++ b/include/dftracer/utils/core/pipeline/pipeline_config.h @@ -73,7 +73,7 @@ struct PipelineConfig { io::IoBackendType io_backend_type = io::IoBackendType::AUTO; // Backend selection unsigned io_batch_threshold = 16; // SQE batch threshold (0 = per-op) - std::size_t sqlite_pool_size = 2; // SQLite async thread pool size + std::size_t db_pool_size = 2; // Blocking DB async thread pool size /** * Set pipeline name @@ -201,10 +201,10 @@ struct PipelineConfig { } /** - * Set SQLite async thread pool size (default 2) + * Set blocking DB async thread pool size (default 2) */ - PipelineConfig& with_sqlite_pool_size(std::size_t size) { - sqlite_pool_size = size; + PipelineConfig& with_db_pool_size(std::size_t size) { + db_pool_size = size; return *this; }
diff --git a/include/dftracer/utils/core/rocksdb/async.h b/include/dftracer/utils/core/rocksdb/async.h
new file mode 100644
index 00000000..3ff71a07
--- /dev/null
+++ b/include/dftracer/utils/core/rocksdb/async.h
@@ -0,0 +1,158 @@
+#ifndef DFTRACER_UTILS_CORE_ROCKSDB_ASYNC_H
+#define DFTRACER_UTILS_CORE_ROCKSDB_ASYNC_H
+
+#include <coroutine>
+#include <exception>
+#include <functional>
+#include <optional>
+#include <utility>
+
+namespace dftracer::utils::io {
+class IoThreadPool;
+}  // namespace dftracer::utils::io
+
+namespace dftracer::utils::rocksdb {
+
+// Helpers declared here and defined out of line, so this header needs only a
+// forward declaration of IoThreadPool and an opaque executor pointer.
+// NOTE(review): the definitions are not part of this header; the notes below
+// describe only how DbAwaitable and run() use each helper.
+
+// Supplies the pool that run() hands to DbAwaitable. When it yields nullptr
+// the awaitable executes its callable inline instead of suspending.
+io::IoThreadPool* get_db_pool();
+// Receives the job built by await_suspend(); presumably queues it on pool.
+void db_async_submit(io::IoThreadPool* pool, std::function<void()> fn);
+// Invoked at the end of that job with the executor pointer captured by run()
+// and the handle of the suspended coroutine.
+void db_async_resume_on(void* executor, std::coroutine_handle<> h);
+// Supplies the opaque executor pointer that run() stores in the awaitable.
+void* get_current_executor_opaque();
+
+// Awaitable that runs a callable returning T and yields its result to the
+// awaiting coroutine.
+//
+// - Null pool: await_ready() runs the callable inline, so the coroutine does
+//   not suspend.
+// - Otherwise: await_suspend() submits a job that runs the callable and then
+//   passes the coroutine handle to db_async_resume_on().
+//
+// An exception thrown by the callable is captured and rethrown from
+// await_resume(). The submitted job keeps a raw pointer to this object, so
+// the awaitable must stay alive until that job has finished.
+template <typename T>
+class DbAwaitable {
+    io::IoThreadPool* pool_;
+    void* executor_;
+    std::function<T()> fn_;
+    std::optional<T> result_;
+    std::exception_ptr error_;
+    std::coroutine_handle<> handle_;
+
+ public:
+    DbAwaitable(io::IoThreadPool* pool, void* executor, std::function<T()> fn)
+        : pool_(pool), executor_(executor), fn_(std::move(fn)) {}
+
+    bool await_ready() noexcept {
+        if (pool_ == nullptr) {
+            try {
+                // Move the callable out and reset the member before calling;
+                // presumably to drop its captures promptly - TODO confirm.
+                auto fn = std::move(fn_);
+                fn_ = {};
+                result_.emplace(fn());
+            } catch (...) {
+                error_ = std::current_exception();
+            }
+            return true;
+        }
+        return false;
+    }
+
+    void await_suspend(std::coroutine_handle<> h) {
+        handle_ = h;
+        auto* self = this;
+        db_async_submit(pool_, [self] {
+            try {
+                auto fn = std::move(self->fn_);
+                self->fn_ = {};
+                self->result_.emplace(fn());
+            } catch (...) {
+                self->error_ = std::current_exception();
+            }
+            db_async_resume_on(self->executor_, self->handle_);
+        });
+    }
+
+    T await_resume() {
+        if (error_ != nullptr) {
+            std::rethrow_exception(error_);
+        }
+        return std::move(*result_);
+    }
+};
+
+// Specialization for callables returning void: same flow as the primary
+// template, with no stored result.
+template <>
+class DbAwaitable<void> {
+    io::IoThreadPool* pool_;
+    void* executor_;
+    std::function<void()> fn_;
+    std::exception_ptr error_;
+    std::coroutine_handle<> handle_;
+
+ public:
+    DbAwaitable(io::IoThreadPool* pool, void* executor,
+                std::function<void()> fn)
+        : pool_(pool), executor_(executor), fn_(std::move(fn)) {}
+
+    bool await_ready() noexcept {
+        if (pool_ == nullptr) {
+            try {
+                auto fn = std::move(fn_);
+                fn_ = {};
+                fn();
+            } catch (...) {
+                error_ = std::current_exception();
+            }
+            return true;
+        }
+        return false;
+    }
+
+    void await_suspend(std::coroutine_handle<> h) {
+        handle_ = h;
+        auto* self = this;
+        db_async_submit(pool_, [self] {
+            try {
+                auto fn = std::move(self->fn_);
+                self->fn_ = {};
+                fn();
+            } catch (...) {
+                self->error_ = std::current_exception();
+            }
+            db_async_resume_on(self->executor_, self->handle_);
+        });
+    }
+
+    void await_resume() {
+        if (error_ != nullptr) {
+            std::rethrow_exception(error_);
+        }
+    }
+};
+
+// Wraps fn in a DbAwaitable for its return type. The pool and the executor
+// pointer are read when run() is called, not when the result is awaited.
+template <typename F>
+auto run(F&& fn) -> DbAwaitable<decltype(fn())> {
+    using R = decltype(fn());
+    auto* pool = get_db_pool();
+    auto* executor = get_current_executor_opaque();
+    return DbAwaitable<R>(pool, executor, std::forward<F>(fn));
+}
+
+}  // namespace dftracer::utils::rocksdb
+
+#endif  // DFTRACER_UTILS_CORE_ROCKSDB_ASYNC_H
diff --git a/include/dftracer/utils/core/rocksdb/database.h b/include/dftracer/utils/core/rocksdb/database.h new file mode 100644 index 00000000..e4d70216 --- /dev/null +++ b/include/dftracer/utils/core/rocksdb/database.h @@ -0,0 +1,82 @@ +#ifndef DFTRACER_UTILS_CORE_ROCKSDB_DATABASE_H +#define DFTRACER_UTILS_CORE_ROCKSDB_DATABASE_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace dftracer::utils::rocksdb { + +void mark_process_exiting_for_rocksdb(); + +class RocksDatabase { + public: + using Batch = ::rocksdb::WriteBatch; + enum class OpenMode { ReadWrite, ReadOnly }; + + RocksDatabase(); + explicit RocksDatabase(const std::string& db_path, + OpenMode open_mode = OpenMode::ReadWrite); + ~RocksDatabase(); + + RocksDatabase(const RocksDatabase&) = delete; + RocksDatabase& operator=(const RocksDatabase&) = delete; + + RocksDatabase(RocksDatabase&& other) noexcept; + RocksDatabase& operator=(RocksDatabase&& other) noexcept; + + bool open(const std::string& db_path, + OpenMode open_mode = OpenMode::ReadWrite); + void close(); + + bool is_open() const noexcept; + bool is_read_only() const noexcept; + const std::string& path() const noexcept; + ::rocksdb::DB* get() const noexcept; + + ::rocksdb::Status put(std::string_view key, std::string_view value, + std::string_view column_family = "default"); + ::rocksdb::Status get(std::string_view key, std::string* value, + std::string_view column_family =
"default") const; + ::rocksdb::Status del(std::string_view key, + std::string_view column_family = "default"); + + ::rocksdb::Status put(Batch& batch, std::string_view column_family, + std::string_view key, std::string_view value); + ::rocksdb::Status del(Batch& batch, std::string_view column_family, + std::string_view key); + + Batch begin_batch() const; + ::rocksdb::Status commit_batch(Batch& batch); + + std::unique_ptr<::rocksdb::Iterator> new_iterator( + std::string_view column_family = "default") const; + + static std::vector default_column_families(); + static ::rocksdb::Options default_options(); + static ::rocksdb::ColumnFamilyOptions default_column_family_options(); + + private: + ::rocksdb::ColumnFamilyHandle* column_family_handle( + std::string_view column_family) const; + + std::string db_path_; + OpenMode open_mode_ = OpenMode::ReadWrite; + std::shared_ptr<::rocksdb::FileSystem> file_system_; + std::unique_ptr<::rocksdb::Env> env_; + ::rocksdb::DB* db_ = nullptr; + std::unordered_map + column_families_; +}; + +} // namespace dftracer::utils::rocksdb + +#endif // DFTRACER_UTILS_CORE_ROCKSDB_DATABASE_H diff --git a/include/dftracer/utils/core/rocksdb/db_manager.h b/include/dftracer/utils/core/rocksdb/db_manager.h new file mode 100644 index 00000000..c45eec95 --- /dev/null +++ b/include/dftracer/utils/core/rocksdb/db_manager.h @@ -0,0 +1,40 @@ +#ifndef DFTRACER_UTILS_CORE_ROCKSDB_DB_MANAGER_H +#define DFTRACER_UTILS_CORE_ROCKSDB_DB_MANAGER_H + +#include + +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::rocksdb { + +// Process-wide registry of open RocksDB instances keyed by their normalized +// .dftindex root path. The manager owns one live instance per path so short- +// lived wrappers (IndexDatabase, ProvenanceDatabase, Python bindings, etc.) +// reuse the same DB instead of repeatedly reopening it. 
+class RocksDBManager { + public: + static RocksDBManager& instance(); + + std::shared_ptr get_or_open( + const std::string& db_path, + RocksDatabase::OpenMode open_mode = RocksDatabase::OpenMode::ReadWrite); + void reset(const std::string& db_path); + void shutdown(); + + private: + RocksDBManager() = default; + + std::mutex mutex_; + std::condition_variable cv_; + std::unordered_map> databases_; + std::unordered_set opening_; +}; + +} // namespace dftracer::utils::rocksdb + +#endif // DFTRACER_UTILS_CORE_ROCKSDB_DB_MANAGER_H diff --git a/include/dftracer/utils/core/rocksdb/filesystem.h b/include/dftracer/utils/core/rocksdb/filesystem.h new file mode 100644 index 00000000..8d70429c --- /dev/null +++ b/include/dftracer/utils/core/rocksdb/filesystem.h @@ -0,0 +1,19 @@ +#ifndef DFTRACER_UTILS_CORE_ROCKSDB_FILESYSTEM_H +#define DFTRACER_UTILS_CORE_ROCKSDB_FILESYSTEM_H + +#include + +namespace rocksdb { +class Env; +class FileSystem; +} // namespace rocksdb + +namespace dftracer::utils::rocksdb { + +std::shared_ptr<::rocksdb::FileSystem> make_dftracer_file_system(); +std::unique_ptr<::rocksdb::Env> make_dftracer_env( + const std::shared_ptr<::rocksdb::FileSystem>& file_system); + +} // namespace dftracer::utils::rocksdb + +#endif // DFTRACER_UTILS_CORE_ROCKSDB_FILESYSTEM_H diff --git a/include/dftracer/utils/core/rocksdb/key_codec.h b/include/dftracer/utils/core/rocksdb/key_codec.h new file mode 100644 index 00000000..59223617 --- /dev/null +++ b/include/dftracer/utils/core/rocksdb/key_codec.h @@ -0,0 +1,39 @@ +#ifndef DFTRACER_UTILS_CORE_ROCKSDB_KEY_CODEC_H +#define DFTRACER_UTILS_CORE_ROCKSDB_KEY_CODEC_H + +#include +#include +#include + +namespace dftracer::utils::rocksdb { + +class KeyCodec { + public: + static std::string encode_be32(std::uint32_t value); + static std::string encode_be64(std::uint64_t value); + + static std::uint32_t decode_be32(std::string_view bytes); + static std::uint64_t decode_be64(std::string_view bytes); + + static void 
append_be32(std::string& out, std::uint32_t value); + static void append_be64(std::string& out, std::uint64_t value); +}; + +class KeyBuilder { + public: + KeyBuilder& append_tag(std::string_view tag); + KeyBuilder& append_separator(); + KeyBuilder& append_string(std::string_view value); + KeyBuilder& append_be32(std::uint32_t value); + KeyBuilder& append_be64(std::uint64_t value); + + std::string build() const; + void clear(); + + private: + std::string key_; +}; + +} // namespace dftracer::utils::rocksdb + +#endif // DFTRACER_UTILS_CORE_ROCKSDB_KEY_CODEC_H diff --git a/include/dftracer/utils/core/runtime.h b/include/dftracer/utils/core/runtime.h index 77d32f2b..0e617a93 100644 --- a/include/dftracer/utils/core/runtime.h +++ b/include/dftracer/utils/core/runtime.h @@ -91,9 +91,13 @@ TypedTaskHandle Runtime::submit(coro::CoroTask task, std::string name) { std::shared_ptr> task_id) -> coro::Coro { try { T val = co_await std::move(t); + t = coro::CoroTask{std::coroutine_handle< + typename coro::CoroTask::promise_type>{}}; exec->mark_coro_completed(task_id->load(std::memory_order_acquire)); tp->set_value(std::move(val)); } catch (...) { + t = coro::CoroTask{std::coroutine_handle< + typename coro::CoroTask::promise_type>{}}; exec->mark_coro_completed(task_id->load(std::memory_order_acquire)); auto ex = std::current_exception(); tp->set_exception(ex); diff --git a/include/dftracer/utils/core/sqlite/async.h b/include/dftracer/utils/core/sqlite/async.h deleted file mode 100644 index c907f619..00000000 --- a/include/dftracer/utils/core/sqlite/async.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef DFTRACER_UTILS_CORE_SQLITE_ASYNC_H -#define DFTRACER_UTILS_CORE_SQLITE_ASYNC_H - -#include -#include -#include - -namespace dftracer::utils::io { -class IoThreadPool; -} // namespace dftracer::utils::io - -namespace dftracer::utils::sqlite { - -// Returns the sqlite IoThreadPool from Executor::current(), or nullptr. -// Defined in async.cpp to avoid Executor header dependency. 
-io::IoThreadPool *get_sqlite_pool(); - -// Non-template helper — submits work to the pool. -// Defined in async.cpp where IoThreadPool is visible. -void sqlite_async_submit(io::IoThreadPool *pool, std::function fn); - -// Resumes the coroutine on the given executor, or inline if null. -// The executor pointer must be captured at await_suspend time (on an -// executor worker thread), NOT retrieved via TLS at resume time -// (which may be on a non-executor pool thread). -// Defined in async.cpp to avoid Executor header dependency. -void sqlite_async_resume_on(void *executor, std::coroutine_handle<> h); - -// Returns an opaque pointer to Executor::current() for capture. -// Defined in async.cpp to avoid Executor header dependency. -void *get_current_executor_opaque(); - -template -class SqliteAwaitable { - io::IoThreadPool *pool_; - std::function fn_; - T result_{}; - std::coroutine_handle<> handle_; - - public: - SqliteAwaitable(io::IoThreadPool *pool, std::function fn) - : pool_(pool), fn_(std::move(fn)) {} - - bool await_ready() noexcept { - if (pool_ == nullptr) { - result_ = fn_(); - return true; - } - return false; - } - - void await_suspend(std::coroutine_handle<> h) { - handle_ = h; - auto *self = this; - // Capture the executor while still on a worker thread. - // The lambda runs on the sqlite pool thread where TLS is unset. 
- void *exec = get_current_executor_opaque(); - sqlite_async_submit(pool_, [self, exec] { - self->result_ = self->fn_(); - sqlite_async_resume_on(exec, self->handle_); - }); - } - - T await_resume() { return std::move(result_); } -}; - -template <> -class SqliteAwaitable { - io::IoThreadPool *pool_; - std::function fn_; - std::coroutine_handle<> handle_; - - public: - SqliteAwaitable(io::IoThreadPool *pool, std::function fn) - : pool_(pool), fn_(std::move(fn)) {} - - bool await_ready() noexcept { - if (pool_ == nullptr) { - fn_(); - return true; - } - return false; - } - - void await_suspend(std::coroutine_handle<> h) { - handle_ = h; - auto *self = this; - // Capture the executor while still on a worker thread. - void *exec = get_current_executor_opaque(); - sqlite_async_submit(pool_, [self, exec] { - self->fn_(); - sqlite_async_resume_on(exec, self->handle_); - }); - } - - void await_resume() {} -}; - -// Free function — offload arbitrary work to the sqlite thread pool. -// Use when you don't have a SqliteDatabase instance yet (e.g. the -// lambda creates its own database internally). -// Returns the pool-backed awaitable, or runs fn inline when no pool. 
-template -auto run(F &&fn) -> SqliteAwaitable { - using R = decltype(fn()); - auto *pool = get_sqlite_pool(); - return SqliteAwaitable(pool, std::forward(fn)); -} - -} // namespace dftracer::utils::sqlite - -#endif // DFTRACER_UTILS_CORE_SQLITE_ASYNC_H diff --git a/include/dftracer/utils/core/sqlite/database.h b/include/dftracer/utils/core/sqlite/database.h deleted file mode 100644 index 13baa34d..00000000 --- a/include/dftracer/utils/core/sqlite/database.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef DFTRACER_UTILS_CORE_SQLITE_DATABASE_H -#define DFTRACER_UTILS_CORE_SQLITE_DATABASE_H - -#include - -#include - -namespace dftracer::utils::sqlite { - -class SqliteDatabase { - public: - SqliteDatabase(); - explicit SqliteDatabase(const std::string &db_path); - ~SqliteDatabase(); - - SqliteDatabase(const SqliteDatabase &) = delete; - SqliteDatabase &operator=(const SqliteDatabase &) = delete; - - SqliteDatabase(SqliteDatabase &&other) noexcept; - SqliteDatabase &operator=(SqliteDatabase &&other) noexcept; - - bool open(const std::string &db_path); - void close(); - bool open_with_vfs(const std::string &db_path, const char *vfs_name); - - sqlite3 *get() const; - bool is_open() const; - - private: - std::string db_path_; - sqlite3 *db_; -}; - -} // namespace dftracer::utils::sqlite - -#endif // DFTRACER_UTILS_CORE_SQLITE_DATABASE_H diff --git a/include/dftracer/utils/core/sqlite/error.h b/include/dftracer/utils/core/sqlite/error.h deleted file mode 100644 index 21707565..00000000 --- a/include/dftracer/utils/core/sqlite/error.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef DFTRACER_UTILS_CORE_SQLITE_ERROR_H -#define DFTRACER_UTILS_CORE_SQLITE_ERROR_H - -#include -#include - -namespace dftracer::utils::sqlite { - -class SqliteError : public std::runtime_error { - public: - enum Type { - DATABASE_ERROR, - STATEMENT_ERROR, - OPEN_ERROR, - VFS_ERROR, - UNKNOWN_ERROR - }; - - SqliteError(Type type, const std::string &message) - : std::runtime_error(format_message(type, message)), 
type_(type) {} - - inline Type type() const { return type_; } - - private: - Type type_; - static std::string format_message(Type type, const std::string &message); -}; - -} // namespace dftracer::utils::sqlite - -#endif // DFTRACER_UTILS_CORE_SQLITE_ERROR_H diff --git a/include/dftracer/utils/core/sqlite/statement.h b/include/dftracer/utils/core/sqlite/statement.h deleted file mode 100644 index c0a70d07..00000000 --- a/include/dftracer/utils/core/sqlite/statement.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef DFTRACER_UTILS_CORE_SQLITE_STATEMENT_H -#define DFTRACER_UTILS_CORE_SQLITE_STATEMENT_H - -#include - -#include -#include -#include -#include - -namespace dftracer::utils::sqlite { - -class SqliteDatabase; - -class SqliteStmt { - public: - SqliteStmt(const SqliteDatabase &db, const char *sql); - SqliteStmt(sqlite3 *db, const char *sql); - ~SqliteStmt(); - - SqliteStmt(const SqliteStmt &) = delete; - SqliteStmt &operator=(const SqliteStmt &) = delete; - SqliteStmt(SqliteStmt &&other) noexcept : stmt_(other.stmt_) { - other.stmt_ = nullptr; - } - SqliteStmt &operator=(SqliteStmt &&other) noexcept { - if (this != &other) { - if (stmt_) sqlite3_finalize(stmt_); - stmt_ = other.stmt_; - other.stmt_ = nullptr; - } - return *this; - } - - operator sqlite3_stmt *(); - sqlite3_stmt *get(); - - void reset(); - - void bind_int(int index, int value); - void bind_int64(int index, int64_t value); - void bind_double(int index, double value); - void bind_text(int index, const std::string &text); - void bind_text(int index, std::string_view text); - void bind_text(int index, const char *text, int length = -1, - void (*destructor)(void *) = SQLITE_TRANSIENT); - void bind_blob(int index, const void *blob, int length); - void bind_blob(int index, std::span data); - void bind_blob(int index, std::span data); - void bind_blob_static(int index, const void *blob, int length); - void bind_text_static(int index, std::string_view text); - void bind_null(int index); - - void clear_bindings(); 
- int bind_parameter_count(); - - private: - sqlite3_stmt *stmt_; - - void validate_parameter_index(int index); -}; - -} // namespace dftracer::utils::sqlite - -#endif // DFTRACER_UTILS_CORE_SQLITE_STATEMENT_H diff --git a/include/dftracer/utils/core/sqlite/vfs.h b/include/dftracer/utils/core/sqlite/vfs.h deleted file mode 100644 index 39ca32b0..00000000 --- a/include/dftracer/utils/core/sqlite/vfs.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef DFTRACER_UTILS_CORE_SQLITE_VFS_H -#define DFTRACER_UTILS_CORE_SQLITE_VFS_H - -#include - -#include - -namespace dftracer::utils::io { -class IoBackend; -} // namespace dftracer::utils::io - -namespace dftracer::utils { -class Executor; -} // namespace dftracer::utils - -namespace dftracer::utils::sqlite { - -/// Maximum path length for VFS file paths. -/// Matches mxPathname in the VFS registration. -inline constexpr int VFS_MAX_PATHNAME = 512; - -struct DfTracerSqliteVfsAppData { - io::IoBackend* backend; - Executor* executor; -}; - -struct DfTracerSqliteVfsFile { - sqlite3_file base; // Must be first, SQLite casts to this - io::IoBackend* backend; - Executor* executor; - int fd; - bool read_only; - char path[VFS_MAX_PATHNAME]; - int shm_fd; - int n_shm_region; - void* shm_regions[32]; -}; - -void register_dftracer_sqlite_vfs(io::IoBackend* backend, Executor* executor); -void unregister_dftracer_sqlite_vfs(); - -} // namespace dftracer::utils::sqlite - -#endif // DFTRACER_UTILS_CORE_SQLITE_VFS_H diff --git a/include/dftracer/utils/server/trace_index.h b/include/dftracer/utils/server/trace_index.h index 3afa0e3c..d131cd00 100644 --- a/include/dftracer/utils/server/trace_index.h +++ b/include/dftracer/utils/server/trace_index.h @@ -15,12 +15,12 @@ namespace dftracer::utils::server { /// Scans a directory for trace files and caches paths to their -/// sidecar index file (.idx). Used by API handlers to resolve file +/// root-local `.dftindex` database. Used by API handlers to resolve file /// paths and check index availability. 
class TraceIndex { public: // Files below this compressed size are streamed directly without - // building a sidecar index file (.idx). At 8 MB compressed + // building a `.dftindex` database. At 8 MB compressed // (~160 MB uncompressed with typical 20x JSON compression), a file // has only a handful of 32 MB checkpoints -- the indexing overhead // exceeds the benefit of bloom-filter skip. @@ -29,7 +29,7 @@ class TraceIndex { struct FileInfo { std::string path; - std::string idx_path; + std::string index_path; bool has_bloom_data = false; bool has_checkpoint_index = false; bool is_small = false; diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.h b/include/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.h index 9e8d551e..0691c45e 100644 --- a/include/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.h @@ -29,7 +29,7 @@ using dftracer::utils::utilities::composites::dft::DFTracerEvent; struct ChunkAggregatorInput { std::string file_path; - std::string idx_path; + std::string index_path; std::size_t start_byte; std::size_t end_byte; std::size_t start_line; @@ -46,8 +46,8 @@ struct ChunkAggregatorInput { return *this; } - ChunkAggregatorInput& with_idx_path(const std::string& path) { - idx_path = path; + ChunkAggregatorInput& with_index_path(const std::string& path) { + index_path = path; return *this; } diff --git a/include/dftracer/utils/utilities/composites/dft/chunk_extractor_utility.h b/include/dftracer/utils/utilities/composites/dft/chunk_extractor_utility.h index 8d7a79a3..6b0e3fe3 100644 --- a/include/dftracer/utils/utilities/composites/dft/chunk_extractor_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/chunk_extractor_utility.h @@ -66,7 +66,7 @@ struct ChunkExtractorUtilityInput { for (const auto& dft_spec : manifest.specs) { fileio::ChunkSpec 
io_spec; io_spec.file_path = dft_spec.file_path; - io_spec.idx_path = dft_spec.idx_path; + io_spec.index_path = dft_spec.index_path; io_spec.size_mb = dft_spec.size_mb; io_spec.start_byte = dft_spec.start_byte; io_spec.end_byte = dft_spec.end_byte; diff --git a/include/dftracer/utils/utilities/composites/dft/comparator/comparison_config.h b/include/dftracer/utils/utilities/composites/dft/comparator/comparison_config.h index 1129709d..87acb4fa 100644 --- a/include/dftracer/utils/utilities/composites/dft/comparator/comparison_config.h +++ b/include/dftracer/utils/utilities/composites/dft/comparator/comparison_config.h @@ -87,7 +87,7 @@ struct ComparisonConfig { std::size_t executor_threads = 0; /// Checkpoint size for index building (0 = default). std::size_t checkpoint_size = 0; - /// Directory for index sidecar files. + /// Directory for `.dftindex` stores. std::string index_dir; /// Force rebuild of existing indexes. bool force_rebuild = false; diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h b/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h index f80d2862..f6a75b7c 100644 --- a/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h +++ b/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h @@ -15,7 +15,7 @@ namespace dftracer::utils::utilities::composites::dft::indexing { * * Uses Kirsch-Mitzenmacher optimization: k hash functions derived from * 2 base hash values (std::hash with different seeds). Supports - * serialization to/from BLOB for SQLite storage. + * serialization to/from binary blobs for RocksDB storage. 
* * Serialization format (self-describing): * [4 bytes: num_hashes (uint32_t LE)] diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter_cache.h b/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter_cache.h index 677dad34..e7c8f8cb 100644 --- a/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter_cache.h +++ b/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter_cache.h @@ -13,8 +13,8 @@ namespace dftracer::utils::utilities::composites::dft::indexing { /// Thread-safe bounded cache for deserialized bloom filters. -/// Keyed by (idx_path, dimension, checkpoint_idx) for chunk blooms, -/// or (idx_path, dimension, UINT64_MAX) for file-level blooms. +/// Keyed by (index_path, dimension, checkpoint_idx) for chunk blooms, +/// or (index_path, dimension, UINT64_MAX) for file-level blooms. /// When the cache exceeds max_entries, it is cleared entirely. class BloomFilterCache { public: @@ -25,23 +25,23 @@ class BloomFilterCache { : max_entries_(max_entries) {} /// Look up a cached bloom filter. Returns nullopt on miss. - std::optional get(const std::string& idx_path, + std::optional get(const std::string& index_path, const std::string& dimension, std::uint64_t checkpoint_idx) const { std::lock_guard lock(mutex_); - auto it = cache_.find(make_key(idx_path, dimension, checkpoint_idx)); + auto it = cache_.find(make_key(index_path, dimension, checkpoint_idx)); if (it == cache_.end()) return std::nullopt; return it->second; } /// Insert a bloom filter into the cache. Evicts all entries if full. 
- void put(const std::string& idx_path, const std::string& dimension, + void put(const std::string& index_path, const std::string& dimension, std::uint64_t checkpoint_idx, const BloomFilter& bloom) { std::lock_guard lock(mutex_); if (cache_.size() >= max_entries_) { cache_.clear(); } - cache_.emplace(make_key(idx_path, dimension, checkpoint_idx), bloom); + cache_.emplace(make_key(index_path, dimension, checkpoint_idx), bloom); } std::size_t size() const { @@ -50,12 +50,12 @@ class BloomFilterCache { } private: - static std::string make_key(const std::string& idx_path, + static std::string make_key(const std::string& index_path, const std::string& dimension, std::uint64_t checkpoint_idx) { std::string key; - key.reserve(idx_path.size() + dimension.size() + 24); - key += idx_path; + key.reserve(index_path.size() + dimension.size() + 24); + key += index_path; key += '\0'; key += dimension; key += '\0'; diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h index 0bec43b7..7807ed77 100644 --- a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h +++ b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h @@ -49,7 +49,7 @@ struct ChunkDimensionStats { decompress_value_counts(const std::uint8_t* data, std::size_t len); }; -/// Result type for querying chunk_dimension_stats from SQLite. +/// Result type for querying chunk_dimension_stats from the shared index DB. 
struct ChunkDimensionStatsResult { std::uint64_t checkpoint_idx; std::string dimension; diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h index 645d66b0..a49ebe64 100644 --- a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h @@ -121,7 +121,7 @@ struct ChunkIndexState { struct ChunkIndexerInput { std::string file_path; - std::string idx_path; + std::string index_path; std::size_t checkpoint_size = 0; std::uint64_t checkpoint_idx = 0; std::size_t start_byte = 0; @@ -137,8 +137,8 @@ struct ChunkIndexerInput { return *this; } - ChunkIndexerInput& with_idx_path(const std::string& path) { - idx_path = path; + ChunkIndexerInput& with_index_path(const std::string& path) { + index_path = path; return *this; } diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h index 9c645882..cb0f0378 100644 --- a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h @@ -16,7 +16,7 @@ using common::query::Query; /// Input for chunk pruning: index path, file path, query, optional cache. struct ChunkPrunerInput { - std::string idx_path; ///< Path to .idx sidecar file. + std::string index_path; ///< Path to the `.dftindex` store. std::string file_path; ///< Path to trace file. Query query; ///< Query to evaluate for pruning. BloomFilterCache* cache = nullptr; ///< Optional bloom filter cache. 
diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h index 3399f6da..37abe99c 100644 --- a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h +++ b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h @@ -18,7 +18,8 @@ namespace dftracer::utils::utilities::composites::dft::indexing { * * Tracks event counts by category/name/pid:tid, timestamp ranges, * and duration statistics using Welford's online algorithm for variance. - * Map fields serialize to JSON TEXT for SQLite storage via yyjson. + * Map fields serialize to JSON text via yyjson for storage in the + * shared `.dftindex` database. */ struct ChunkStatistics { std::uint64_t total_events = 0; diff --git a/include/dftracer/utils/utilities/composites/dft/internal/chunk_spec.h b/include/dftracer/utils/utilities/composites/dft/internal/chunk_spec.h index 3e28c469..653c115f 100644 --- a/include/dftracer/utils/utilities/composites/dft/internal/chunk_spec.h +++ b/include/dftracer/utils/utilities/composites/dft/internal/chunk_spec.h @@ -30,7 +30,7 @@ struct DFTracerChunkSpec : public fileio::ChunkSpec { static DFTracerChunkSpec from_chunk_spec(const fileio::ChunkSpec& spec) { DFTracerChunkSpec dft_spec; dft_spec.file_path = spec.file_path; - dft_spec.idx_path = spec.idx_path; + dft_spec.index_path = spec.index_path; dft_spec.size_mb = spec.size_mb; dft_spec.start_byte = spec.start_byte; dft_spec.end_byte = spec.end_byte; diff --git a/include/dftracer/utils/utilities/composites/dft/internal/utils.h b/include/dftracer/utils/utilities/composites/dft/internal/utils.h index 5639b0f6..d2c6db5d 100644 --- a/include/dftracer/utils/utilities/composites/dft/internal/utils.h +++ b/include/dftracer/utils/utilities/composites/dft/internal/utils.h @@ -11,18 +11,15 @@ namespace dftracer::utils::utilities::composites::dft::internal { bool is_data_transfer_op(std::string_view 
cat, std::string_view name); /** - * @brief Determine the index file path for a given data file. + * @brief Determine the root-local RocksDB index path for a given data file. * - * When a custom index directory is provided, the index is placed there - * directly. Otherwise, a unique subdirectory under /tmp is created - * using a hash of the data file's absolute path, preventing collisions - * when multiple files share the same basename. + * When a custom index directory is provided, the index root is + * `<index_dir>/.dftindex`. Otherwise, the index root is placed alongside the + * data file as `<data_file_dir>/.dftindex`. * * @param file_path Path to the data file (e.g., "data/trace.pfw.gz") - * @param index_dir Optional custom directory for the index file. - * If empty, uses /tmp/dft_/. - * @return Complete path to the index file - * (e.g., "/tmp/dft_a1b2c3d4/trace.pfw.gz.idx") + * @param index_dir Optional custom directory for the index root. + * @return Path to the owning `.dftindex` directory. */ std::string determine_index_path(const std::string& file_path, const std::string& index_dir = ""); @@ -30,12 +27,12 @@ std::string determine_index_path(const std::string& file_path, /** * @brief Determine the provenance index file path for a given data file. * - * Follows the same placement logic as determine_index_path but produces - * a `.pidx` sidecar instead of `.idx`. + * Provenance now lives in the same root-local `.dftindex` database as + * the regular index data. * * @param data_path Path to the data file * @param index_dir Optional directory. If empty, places next to data file. 
- * @return Complete path to the provenance index file + * @return Path to the owning `.dftindex` directory */ std::string determine_provenance_index_path(const std::string& data_path, const std::string& index_dir = ""); diff --git a/include/dftracer/utils/utilities/composites/dft/metadata_collector_utility.h b/include/dftracer/utils/utilities/composites/dft/metadata_collector_utility.h index 5cfc2cba..cb0d0b25 100644 --- a/include/dftracer/utils/utilities/composites/dft/metadata_collector_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/metadata_collector_utility.h @@ -17,7 +17,7 @@ namespace dftracer::utils::utilities::composites::dft { */ struct MetadataCollectorUtilityInput { std::string file_path; - std::string idx_path; // Empty for plain files + std::string index_path; // Empty for plain files, otherwise `.dftindex`. std::size_t checkpoint_size = dftracer::utils::utilities::indexer:: internal::Indexer::DEFAULT_CHECKPOINT_SIZE; bool force_rebuild = false; @@ -31,7 +31,7 @@ struct MetadataCollectorUtilityInput { Indexer::DEFAULT_CHECKPOINT_SIZE, bool force = false, bool hash = false) : file_path(std::move(fpath)), - idx_path(std::move(ipath)), + index_path(std::move(ipath)), checkpoint_size(ckpt), force_rebuild(force), compute_hash(hash) {} @@ -43,7 +43,7 @@ struct MetadataCollectorUtilityInput { } MetadataCollectorUtilityInput& with_index(std::string idx) { - idx_path = std::move(idx); + index_path = std::move(idx); return *this; } @@ -63,7 +63,7 @@ struct MetadataCollectorUtilityInput { } bool operator==(const MetadataCollectorUtilityInput& other) const { - return file_path == other.file_path && idx_path == other.idx_path && + return file_path == other.file_path && index_path == other.index_path && checkpoint_size == other.checkpoint_size && force_rebuild == other.force_rebuild && compute_hash == other.compute_hash; @@ -75,7 +75,7 @@ struct MetadataCollectorUtilityInput { */ struct MetadataCollectorUtilityOutput { std::string file_path; - 
std::string idx_path; + std::string index_path; // Root-local `.dftindex` path when available. double size_mb = 0; std::size_t start_line = 0; std::size_t end_line = 0; @@ -98,7 +98,7 @@ struct MetadataCollectorUtilityOutput { MetadataCollectorUtilityOutput() = default; bool operator==(const MetadataCollectorUtilityOutput& other) const { - return file_path == other.file_path && idx_path == other.idx_path && + return file_path == other.file_path && index_path == other.index_path && size_mb == other.size_mb && start_line == other.start_line && end_line == other.end_line && valid_events == other.valid_events && @@ -119,7 +119,8 @@ struct MetadataCollectorUtilityOutput { * files. * * Supports both plain (.pfw) and compressed (.pfw.gz) files. - * For compressed files, builds/uses an index for efficient access. + * For compressed files, builds/uses the root-local `.dftindex` store for + * efficient access. * * Tagged with Parallelizable - safe for parallel batch processing. */ diff --git a/include/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.h b/include/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.h index c02bc3c7..d2bffcd3 100644 --- a/include/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.h +++ b/include/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.h @@ -1,6 +1,7 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_PROVENANCE_TRACKER_H #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_PROVENANCE_TRACKER_H +#include #include #include @@ -27,10 +28,11 @@ class ProvenanceTracker { void record(int source_file_idx, int checkpoint_idx, int output_chunk_idx, int output_line_start, int output_line_end, int event_count); - void flush_to_db(const ExtractionPlan& plan, const std::string& group_name, - const std::string& group_query, - const std::vector& chunks, - const std::string& output_dir); + coro::CoroTask flush_to_db( + const ExtractionPlan& plan, const 
std::string& group_name, + const std::string& group_query, + const std::vector& chunks, + const std::string& output_dir); std::size_t record_count() const { return records_.size(); } const std::vector& records() const { return records_; } diff --git a/include/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h b/include/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h index 65a701e1..92ea2f20 100644 --- a/include/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h +++ b/include/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h @@ -17,7 +17,7 @@ struct PredicateGroup { struct SourceFileInfo { std::string file_path; - std::string idx_path; + std::string index_path; std::size_t num_checkpoints = 0; std::uint64_t uncompressed_size = 0; std::uint64_t checkpoint_size = 0; diff --git a/include/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.h b/include/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.h index 33e38979..be4f18e5 100644 --- a/include/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.h @@ -14,7 +14,7 @@ namespace dftracer::utils::utilities::composites::dft::statistics { struct ChunkDetailScanInput { std::string file_path; - std::string idx_path; + std::string index_path; std::size_t checkpoint_size = 0; std::size_t start_byte = 0; std::size_t end_byte = 0; diff --git a/include/dftracer/utils/utilities/composites/dft/statistics/statistics.h b/include/dftracer/utils/utilities/composites/dft/statistics/statistics.h index 27587e2a..16cbf10b 100644 --- a/include/dftracer/utils/utilities/composites/dft/statistics/statistics.h +++ b/include/dftracer/utils/utilities/composites/dft/statistics/statistics.h @@ -5,7 +5,8 @@ * @file statistics.h * @brief Convenience header for all 
DFTracer statistics components. * - * Provides zero-cost statistics aggregation from pre-indexed .idx databases: + * Provides zero-cost statistics aggregation from pre-indexed `.dftindex` + * databases: * trace statistics, aggregation, and querying. */ diff --git a/include/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h b/include/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h index 2241bb72..7141ba23 100644 --- a/include/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h @@ -11,7 +11,7 @@ namespace dftracer::utils::utilities::composites::dft::statistics { struct StatisticsAggregatorInput { std::string file_path; - std::string idx_path; + std::string index_path; std::string index_dir; }; diff --git a/include/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.h b/include/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.h index 6323a805..a06e182e 100644 --- a/include/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.h +++ b/include/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.h @@ -12,7 +12,7 @@ using indexing::ChunkStatistics; struct TraceStatistics { std::string file_path; - std::string idx_path; + std::string index_path; ChunkStatistics merged; std::uint64_t num_chunks = 0; bool success = false; diff --git a/include/dftracer/utils/utilities/composites/dft/views/view_builder_utility.h b/include/dftracer/utils/utilities/composites/dft/views/view_builder_utility.h index fdb2ccbf..cd8974d9 100644 --- a/include/dftracer/utils/utilities/composites/dft/views/view_builder_utility.h +++ b/include/dftracer/utils/utilities/composites/dft/views/view_builder_utility.h @@ -18,7 +18,7 @@ namespace dftracer::utils::utilities::composites::dft::views { struct ViewBuilderInput { ViewDefinition view; std::string 
file_path; - std::string idx_path; // index sidecar path + std::string index_path; // `.dftindex` store path std::size_t uncompressed_size = 0; std::size_t num_checkpoints = 0; indexing::BloomFilterCache* bloom_cache = nullptr; @@ -27,7 +27,7 @@ struct ViewBuilderInput { // Fluent builders ViewBuilderInput& with_view(const ViewDefinition& v); ViewBuilderInput& with_file_path(const std::string& path); - ViewBuilderInput& with_idx_path(const std::string& path); + ViewBuilderInput& with_index_path(const std::string& path); ViewBuilderInput& with_uncompressed_size(std::size_t s); ViewBuilderInput& with_num_checkpoints(std::size_t n); ViewBuilderInput& with_bloom_cache(indexing::BloomFilterCache* c); @@ -57,4 +57,4 @@ class ViewBuilderUtility : public Utility query; ViewReaderInput& with_file_path(const std::string& path); - ViewReaderInput& with_idx_path(const std::string& path); + ViewReaderInput& with_index_path(const std::string& path); ViewReaderInput& with_checkpoint_size(std::size_t sz); ViewReaderInput& with_byte_range(std::size_t start, std::size_t end); ViewReaderInput& with_checkpoint_idx(std::uint64_t idx); diff --git a/include/dftracer/utils/utilities/composites/file_merger_utility.h b/include/dftracer/utils/utilities/composites/file_merger_utility.h index 1be3876b..c20d90b2 100644 --- a/include/dftracer/utils/utilities/composites/file_merger_utility.h +++ b/include/dftracer/utils/utilities/composites/file_merger_utility.h @@ -45,8 +45,9 @@ struct FileMergeValidatorUtilityInput { return input; } - FileMergeValidatorUtilityInput& with_index(const std::string& idx_path) { - index_path = idx_path; + FileMergeValidatorUtilityInput& with_index( + const std::string& index_path_value) { + index_path = index_path_value; return *this; } diff --git a/include/dftracer/utils/utilities/composites/indexed_file_reader_utility.h b/include/dftracer/utils/utilities/composites/indexed_file_reader_utility.h index 50b837bd..79c7a6cf 100644 --- 
a/include/dftracer/utils/utilities/composites/indexed_file_reader_utility.h +++ b/include/dftracer/utils/utilities/composites/indexed_file_reader_utility.h @@ -3,8 +3,11 @@ #include #include +#include #include +#include #include +#include #include #include #include @@ -30,7 +33,7 @@ namespace dftracer::utils::utilities::composites { * @code * IndexedFileReader reader_workflow; * auto reader = reader_workflow.process( - * IndexedReadInput{"file.gz", "file.gz.idx", checkpoint_size, false} + * IndexedReadInput{"file.gz", ".dftindex", checkpoint_size, false} * ); * // Now use reader to read lines * @endcode @@ -52,40 +55,54 @@ class IndexedFileReaderUtility throw std::runtime_error("File does not exist: " + input.file_path); } + const std::string normalized_index_path = + input.index_path.empty() + ? dft::internal::determine_index_path(input.file_path, "") + : indexer::internal::normalize_index_root(input.index_path); + // Step 1: Check if index needs to be built/rebuilt - bool need_build = !fs::exists(input.idx_path) || input.force_rebuild; + bool need_build = + !fs::exists(normalized_index_path) || input.force_rebuild; if (need_build) { // Remove old index if forcing rebuild - if (input.force_rebuild && fs::exists(input.idx_path)) { - fs::remove(input.idx_path); + if (input.force_rebuild && fs::exists(normalized_index_path)) { + // Force rebuild must discard the manager-owned DB instance + // before removing the root directory so the next open is a + // true reopen, not a reuse of the previous live handle. 
+ rocksdb::RocksDBManager::instance().reset( + normalized_index_path); + fs::remove_all(normalized_index_path); } // Build new index auto indexer = dftracer::utils::utilities::indexer::internal:: - IndexerFactory::create(input.file_path, input.idx_path, + IndexerFactory::create(input.file_path, normalized_index_path, input.checkpoint_size, true); co_await indexer->build_async(); } else { // Check if existing index needs rebuild auto indexer = dftracer::utils::utilities::indexer::internal:: - IndexerFactory::create(input.file_path, input.idx_path, + IndexerFactory::create(input.file_path, normalized_index_path, input.checkpoint_size, false); if (indexer->need_rebuild()) { // Rebuild the index - fs::remove(input.idx_path); - auto new_indexer = - dftracer::utils::utilities::indexer::internal:: - IndexerFactory::create(input.file_path, input.idx_path, - input.checkpoint_size, true); + // Drop the cached DB instance before deleting the store. + rocksdb::RocksDBManager::instance().reset( + normalized_index_path); + fs::remove_all(normalized_index_path); + auto new_indexer = dftracer::utils::utilities::indexer:: + internal::IndexerFactory::create( + input.file_path, normalized_index_path, + input.checkpoint_size, true); co_await new_indexer->build_async(); } } // Step 2: Create and return Reader - co_return reader::internal::ReaderFactory::create(input.file_path, - input.idx_path); + co_return reader::internal::ReaderFactory::create( + input.file_path, normalized_index_path); } }; diff --git a/include/dftracer/utils/utilities/composites/line_batch_processor_utility.h b/include/dftracer/utils/utilities/composites/line_batch_processor_utility.h index c5932575..08ebc5da 100644 --- a/include/dftracer/utils/utilities/composites/line_batch_processor_utility.h +++ b/include/dftracer/utils/utilities/composites/line_batch_processor_utility.h @@ -40,7 +40,7 @@ using LineBatchProcessUtilityOutput = std::vector; * * LineBatchProcessor workflow(processor); * auto results =
workflow.process(LineBatchInput{"/path/to/file.gz", - * "file.gz.idx"}); + * "/path/to/.dftindex"}); * @endcode */ template @@ -74,10 +74,10 @@ class LineBatchProcessorUtility LineBatchProcessUtilityOutput results; auto gen = [&]() { - if (!input.idx_path.empty()) { + if (!input.index_path.empty()) { auto iter_config = fileio::lines::sources::IndexedFileLineIteratorConfig() - .with_file(input.file_path, input.idx_path); + .with_file(input.file_path, input.index_path); if (input.start_line > 0 && input.end_line > 0) { iter_config.with_line_range(input.start_line, input.end_line); @@ -131,10 +131,10 @@ class SimpleLineBatchProcessorUtility SimpleLineBatchProcessUtilityOutput results; auto gen = [&]() { - if (!input.idx_path.empty()) { + if (!input.index_path.empty()) { auto iter_config = fileio::lines::sources::IndexedFileLineIteratorConfig() - .with_file(input.file_path, input.idx_path); + .with_file(input.file_path, input.index_path); if (input.start_line > 0 && input.end_line > 0) { iter_config.with_line_range(input.start_line, input.end_line); diff --git a/include/dftracer/utils/utilities/composites/types.h b/include/dftracer/utils/utilities/composites/types.h index e7d3be0e..2b9e6d0c 100644 --- a/include/dftracer/utils/utilities/composites/types.h +++ b/include/dftracer/utils/utilities/composites/types.h @@ -59,7 +59,7 @@ struct DirectoryProcessInput { */ struct IndexedReadInput { std::string file_path; - std::string idx_path; + std::string index_path; // Root-local `.dftindex` path. 
std::size_t checkpoint_size = dftracer::utils::utilities::indexer:: internal::Indexer::DEFAULT_CHECKPOINT_SIZE; bool force_rebuild = false; @@ -71,7 +71,7 @@ struct IndexedReadInput { indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE, bool force = false) : file_path(std::move(fpath)), - idx_path(std::move(ipath)), + index_path(std::move(ipath)), checkpoint_size(ckpt_size), force_rebuild(force) {} @@ -82,7 +82,7 @@ struct IndexedReadInput { } IndexedReadInput& with_index(std::string idx) { - idx_path = std::move(idx); + index_path = std::move(idx); return *this; } @@ -102,7 +102,8 @@ struct IndexedReadInput { */ struct LineBatchInput { std::string file_path; - std::string idx_path; // Empty for plain text files + std::string index_path; // Empty for plain text files + // or `.dftindex` for indexed archives. std::size_t start_line = 0; // 0 = from beginning std::size_t end_line = 0; // 0 = to end std::size_t checkpoint_size = dftracer::utils::utilities::indexer:: @@ -113,7 +114,7 @@ struct LineBatchInput { LineBatchInput(std::string fpath, std::string ipath = "", std::size_t start = 0, std::size_t end = 0) : file_path(std::move(fpath)), - idx_path(std::move(ipath)), + index_path(std::move(ipath)), start_line(start), end_line(end) {} @@ -124,7 +125,7 @@ struct LineBatchInput { } LineBatchInput& with_index(std::string idx) { - idx_path = std::move(idx); + index_path = std::move(idx); return *this; } diff --git a/include/dftracer/utils/utilities/fileio/lines/line_bytes_range.h b/include/dftracer/utils/utilities/fileio/lines/line_bytes_range.h index 987bf25a..4a6f0f7a 100644 --- a/include/dftracer/utils/utilities/fileio/lines/line_bytes_range.h +++ b/include/dftracer/utils/utilities/fileio/lines/line_bytes_range.h @@ -28,7 +28,7 @@ namespace dftracer::utils::utilities::fileio::lines { * Usage: * @code * // From indexed file with byte range - * auto reader = ReaderFactory::create("file.gz", "file.gz.idx"); + * auto reader = ReaderFactory::create("file.gz", 
"/data/.dftindex"); * LineBytesRange range1 = LineBytesRange::from_indexed_file(reader, 1000, * 5000); * diff --git a/include/dftracer/utils/utilities/fileio/lines/line_types.h b/include/dftracer/utils/utilities/fileio/lines/line_types.h index 8fc937c6..ba4bf62d 100644 --- a/include/dftracer/utils/utilities/fileio/lines/line_types.h +++ b/include/dftracer/utils/utilities/fileio/lines/line_types.h @@ -42,13 +42,14 @@ struct Line { * Usage: * @code * auto input = LineReadInput::from_file("data.txt") - * .with_index("data.txt.idx") + * .with_index("/data/.dftindex") * .with_range(10, 100); * @endcode */ struct LineReadInput { std::string file_path; // Path to the archive file - std::string idx_path; // Path to the index file (empty for plain files) + std::string index_path; // Path to the `.dftindex` store + // (empty for plain files) std::size_t start_line; // Starting line (1-based, inclusive), 0 = start std::size_t end_line; // Ending line (1-based, inclusive), 0 = end @@ -57,7 +58,7 @@ struct LineReadInput { LineReadInput(std::string file_path_, std::string idx_path_, std::size_t start_line_, std::size_t end_line_) : file_path(std::move(file_path_)), - idx_path(std::move(idx_path_)), + index_path(std::move(idx_path_)), start_line(start_line_), end_line(end_line_) {} @@ -68,7 +69,7 @@ struct LineReadInput { } LineReadInput& with_index(std::string idx) { - idx_path = std::move(idx); + index_path = std::move(idx); return *this; } @@ -79,7 +80,7 @@ struct LineReadInput { } bool operator==(const LineReadInput& other) const { - return file_path == other.file_path && idx_path == other.idx_path && + return file_path == other.file_path && index_path == other.index_path && start_line == other.start_line && end_line == other.end_line; } @@ -153,7 +154,7 @@ struct hash { const { ::dftracer::utils::utilities::hash::HasherUtility hasher; hasher.update(req.file_path); - hasher.update(req.idx_path); + hasher.update(req.index_path); hasher.update(req.start_line); 
hasher.update(req.end_line); return hasher.get_hash().value; diff --git a/include/dftracer/utils/utilities/fileio/lines/sources/async_plain_file_bytes_generator.h b/include/dftracer/utils/utilities/fileio/lines/sources/async_plain_file_bytes_generator.h index 9d89d186..98eef888 100644 --- a/include/dftracer/utils/utilities/fileio/lines/sources/async_plain_file_bytes_generator.h +++ b/include/dftracer/utils/utilities/fileio/lines/sources/async_plain_file_bytes_generator.h @@ -1,6 +1,7 @@ #ifndef DFTRACER_UTILS_UTILITIES_FILEIO_LINES_SOURCES_ASYNC_PLAIN_FILE_BYTES_GENERATOR_H #define DFTRACER_UTILS_UTILITIES_FILEIO_LINES_SOURCES_ASYNC_PLAIN_FILE_BYTES_GENERATOR_H +#include #include #include #include @@ -9,7 +10,6 @@ #include namespace dftracer::utils::utilities::fileio::lines::sources { - /** * @brief Async generator that yields lines from plain text files * within a byte range, with line-boundary alignment. @@ -39,7 +39,7 @@ inline coro::AsyncGenerator async_plain_file_bytes( if (fd_result < 0) { throw std::runtime_error("Cannot open file: " + file_path); } - int fd = static_cast(fd_result); + dftracer::utils::ScopedFd fd(static_cast(fd_result)); std::vector read_buffer(buffer_size); std::string line_buffer; @@ -56,7 +56,8 @@ inline coro::AsyncGenerator async_plain_file_bytes( bool aligned = false; while (!aligned) { ssize_t bytes_read = co_await ::dftracer::utils::io::pread( - fd, read_buffer.data(), read_buffer.size(), file_offset); + fd.get(), read_buffer.data(), read_buffer.size(), + file_offset); if (bytes_read < 0) { throw std::runtime_error( @@ -66,7 +67,7 @@ inline coro::AsyncGenerator async_plain_file_bytes( if (bytes_read == 0) { // Hit EOF before finding a newline — nothing to yield - co_await ::dftracer::utils::io::close(fd); + fd.reset(); co_return; } @@ -80,7 +81,7 @@ inline coro::AsyncGenerator async_plain_file_bytes( if (static_cast(file_offset) >= end_byte) { // Passed end_byte while aligning — nothing to yield - co_await 
::dftracer::utils::io::close(fd); + fd.reset(); co_return; } } @@ -96,7 +97,7 @@ inline coro::AsyncGenerator async_plain_file_bytes( } ssize_t bytes_read = co_await ::dftracer::utils::io::pread( - fd, read_buffer.data(), read_buffer.size(), file_offset); + fd.get(), read_buffer.data(), read_buffer.size(), file_offset); if (bytes_read < 0) { throw std::runtime_error( @@ -141,7 +142,6 @@ inline coro::AsyncGenerator async_plain_file_bytes( ex = std::current_exception(); } - co_await ::dftracer::utils::io::close(fd); if (ex) { std::rethrow_exception(ex); } diff --git a/include/dftracer/utils/utilities/fileio/lines/sources/async_plain_file_line_generator.h b/include/dftracer/utils/utilities/fileio/lines/sources/async_plain_file_line_generator.h index e8a0365c..2c851eb9 100644 --- a/include/dftracer/utils/utilities/fileio/lines/sources/async_plain_file_line_generator.h +++ b/include/dftracer/utils/utilities/fileio/lines/sources/async_plain_file_line_generator.h @@ -1,6 +1,7 @@ #ifndef DFTRACER_UTILS_UTILITIES_FILEIO_LINES_SOURCES_ASYNC_PLAIN_FILE_LINE_GENERATOR_H #define DFTRACER_UTILS_UTILITIES_FILEIO_LINES_SOURCES_ASYNC_PLAIN_FILE_LINE_GENERATOR_H +#include #include #include #include @@ -9,7 +10,6 @@ #include namespace dftracer::utils::utilities::fileio::lines::sources { - /** * @brief Async generator that yields lines from plain text files. 
* @@ -32,7 +32,7 @@ inline coro::AsyncGenerator async_plain_file_lines( if (fd_result < 0) { throw std::runtime_error("Cannot open file: " + file_path); } - int fd = static_cast(fd_result); + dftracer::utils::ScopedFd fd(static_cast(fd_result)); constexpr std::size_t BUFFER_SIZE = 256 * 1024; // 256KB std::vector read_buffer(BUFFER_SIZE); @@ -48,7 +48,7 @@ inline coro::AsyncGenerator async_plain_file_lines( bool eof = false; while (!eof) { ssize_t bytes_read = co_await ::dftracer::utils::io::pread( - fd, read_buffer.data(), BUFFER_SIZE, file_offset); + fd.get(), read_buffer.data(), BUFFER_SIZE, file_offset); if (bytes_read < 0) { throw std::runtime_error( @@ -82,7 +82,7 @@ inline coro::AsyncGenerator async_plain_file_lines( current_line); } if (end_line > 0 && current_line >= end_line) { - co_await ::dftracer::utils::io::close(fd); + fd.reset(); co_return; } line_buffer.clear(); @@ -95,7 +95,6 @@ inline coro::AsyncGenerator async_plain_file_lines( ex = std::current_exception(); } - co_await ::dftracer::utils::io::close(fd); if (ex) { std::rethrow_exception(ex); } diff --git a/include/dftracer/utils/utilities/fileio/lines/sources/async_streaming_gz_line_generator.h b/include/dftracer/utils/utilities/fileio/lines/sources/async_streaming_gz_line_generator.h index 7769ba5f..aa3b2a2f 100644 --- a/include/dftracer/utils/utilities/fileio/lines/sources/async_streaming_gz_line_generator.h +++ b/include/dftracer/utils/utilities/fileio/lines/sources/async_streaming_gz_line_generator.h @@ -2,6 +2,7 @@ #define DFTRACER_UTILS_UTILITIES_FILEIO_LINES_SOURCES_ASYNC_STREAMING_GZ_LINE_GENERATOR_H #include +#include #include #include #include @@ -11,7 +12,6 @@ #include namespace dftracer::utils::utilities::fileio::lines::sources { - /** * @brief Async generator that yields lines from .gz files without an index. 
* @@ -30,7 +30,7 @@ inline coro::AsyncGenerator async_streaming_gz_lines( "Cannot open compressed file: " + file_path + " (errno=" + std::to_string(static_cast(-fd_result)) + ")"); } - int fd = static_cast(fd_result); + dftracer::utils::ScopedFd fd(static_cast(fd_result)); constexpr std::size_t READ_BUFFER_SIZE = 256 * 1024; // 256KB std::vector read_buffer(READ_BUFFER_SIZE); @@ -46,7 +46,7 @@ inline coro::AsyncGenerator async_streaming_gz_lines( try { while (true) { ssize_t bytes_read = co_await ::dftracer::utils::io::pread( - fd, read_buffer.data(), READ_BUFFER_SIZE, file_offset); + fd.get(), read_buffer.data(), READ_BUFFER_SIZE, file_offset); if (bytes_read < 0) { throw std::runtime_error( @@ -94,7 +94,7 @@ inline coro::AsyncGenerator async_streaming_gz_lines( current_line); } if (end_line > 0 && current_line >= end_line) { - co_await ::dftracer::utils::io::close(fd); + fd.reset(); co_return; } line_buffer.clear(); @@ -110,7 +110,6 @@ inline coro::AsyncGenerator async_streaming_gz_lines( ex = std::current_exception(); } - co_await ::dftracer::utils::io::close(fd); if (ex) { std::rethrow_exception(ex); } diff --git a/include/dftracer/utils/utilities/fileio/lines/streaming_line_reader.h b/include/dftracer/utils/utilities/fileio/lines/streaming_line_reader.h index c9f8b32f..8e669a53 100644 --- a/include/dftracer/utils/utilities/fileio/lines/streaming_line_reader.h +++ b/include/dftracer/utils/utilities/fileio/lines/streaming_line_reader.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -25,7 +26,7 @@ namespace dftracer::utils::utilities::fileio::lines { * @code * auto config = StreamingLineReaderConfig() * .with_file("file.gz") - * .with_index("file.gz.idx") + * .with_index("trace-root/.dftindex") * .with_line_range(1, 100); * * auto range = StreamingLineReader::read(config); @@ -71,7 +72,7 @@ class StreamingLineReaderConfig { * appropriate line iterator. 
It supports: * - Indexed compressed files (.gz, .tar.gz) via Reader * - Plain text files - * - Automatic index file detection + * - Automatic `.dftindex` detection for compressed files * * Usage: * @code @@ -91,10 +92,10 @@ class StreamingLineReaderConfig { class StreamingLineReader { public: /** - * @brief Read lines from a file, auto-detecting format and index. + * @brief Read lines from a file, auto-detecting format and `.dftindex`. * * This method automatically: - * 1. Detects if an index file exists (.idx) + * 1. Detects if a `.dftindex` store exists * 2. Creates appropriate reader (indexed or plain) * 3. Returns a LineRange for streaming iteration * @@ -105,16 +106,18 @@ class StreamingLineReader { const std::string& file_path = config.file_path(); std::size_t start_line = config.start_line(); std::size_t end_line = config.end_line(); - const std::string& idx_path = config.index_path(); - // Check if index file exists - std::string actual_idx_path = - idx_path.empty() ? file_path + ".idx" : idx_path; - bool has_index = fs::exists(actual_idx_path); + const std::string& index_path = config.index_path(); + std::string actual_index_path = index_path; + if (actual_index_path.empty()) { + actual_index_path = + composites::dft::internal::determine_index_path(file_path, ""); + } + bool has_index = fs::exists(actual_index_path); DFTRACER_UTILS_LOG_DEBUG( - "StreamingLineReader::read - file=%s, idx_path_param=%s, " - "actual_idx=%s, has_index=%d", - file_path.c_str(), idx_path.c_str(), actual_idx_path.c_str(), + "StreamingLineReader::read - file=%s, index_path_param=%s, " + "actual_index=%s, has_index=%d", + file_path.c_str(), index_path.c_str(), actual_index_path.c_str(), has_index); // Check file extension to determine if it's compressed @@ -123,7 +126,7 @@ class StreamingLineReader { if (is_compressed && has_index) { auto iter_config = sources::IndexedFileLineIteratorConfig().with_file( - file_path, actual_idx_path); + file_path, actual_index_path); if (start_line > 
0 && end_line > 0) { iter_config.with_line_range(start_line, end_line); } @@ -143,7 +146,7 @@ class StreamingLineReader { * @brief Read lines from a file using indexed reader. * * @param file_path Path to the compressed file - * @param idx_path Path to the index file + * @param config Indexed reader configuration * @param start_line Starting line (1-based, inclusive), 0 means start * @param end_line Ending line (1-based, inclusive), 0 means end * @return LineRange for streaming iteration @@ -185,24 +188,24 @@ class StreamingLineReader { static coro::AsyncGenerator read_async( const StreamingLineReaderConfig& config) { const std::string& file_path = config.file_path(); - const std::string& idx_path = config.index_path(); + const std::string& index_path = config.index_path(); bool is_compressed = is_compressed_format(file_path); // Only use the indexed path when an index was explicitly - // provided. Auto-discovering .idx files would silently + // provided. Auto-discovering `.dftindex` would silently // override callers that intentionally omit the index to // get single-pass streaming decompression. bool has_index = false; - std::string actual_idx_path; - if (!idx_path.empty()) { - actual_idx_path = idx_path; - has_index = fs::exists(actual_idx_path); + std::string actual_index_path; + if (!index_path.empty()) { + actual_index_path = index_path; + has_index = fs::exists(actual_index_path); } if (is_compressed && has_index) { auto iter_config = sources::IndexedFileLineIteratorConfig().with_file( - file_path, actual_idx_path); + file_path, actual_index_path); if (config.start_line() > 0 || config.end_line() > 0) { iter_config.with_line_range(config.start_line(), config.end_line()); @@ -238,7 +241,7 @@ class StreamingLineReader { * @brief Async read lines from compressed file without an index. * * Stream-decompresses the file and splits into lines in a single - * pass, avoiding the overhead of building a sidecar index. 
+ * pass, avoiding the overhead of building a `.dftindex` store. */ static coro::AsyncGenerator read_streaming_gz_async( const std::string& file_path, std::size_t start_line = 0, diff --git a/include/dftracer/utils/utilities/fileio/types/chunk_spec.h b/include/dftracer/utils/utilities/fileio/types/chunk_spec.h index c71e7532..7f17b36c 100644 --- a/include/dftracer/utils/utilities/fileio/types/chunk_spec.h +++ b/include/dftracer/utils/utilities/fileio/types/chunk_spec.h @@ -16,7 +16,7 @@ namespace dftracer::utils::utilities::fileio { */ struct ChunkSpec { std::string file_path; - std::string idx_path; // Empty for plain text files + std::string index_path; // Empty for plain text files double size_mb; std::size_t start_byte; // Starting byte offset (0-based) std::size_t end_byte; // Ending byte offset (exclusive) @@ -26,13 +26,13 @@ struct ChunkSpec { ChunkSpec(std::string path, std::string idx, double mb, std::size_t start, std::size_t end) : file_path(std::move(path)), - idx_path(std::move(idx)), + index_path(std::move(idx)), size_mb(mb), start_byte(start), end_byte(end) {} bool operator==(const ChunkSpec& other) const { - return file_path == other.file_path && idx_path == other.idx_path && + return file_path == other.file_path && index_path == other.index_path && size_mb == other.size_mb && start_byte == other.start_byte && end_byte == other.end_byte; } @@ -54,7 +54,7 @@ struct hash { spec) const noexcept { ::dftracer::utils::utilities::hash::HasherUtility hasher; hasher.update(spec.file_path); - hasher.update(spec.idx_path); + hasher.update(spec.index_path); hasher.update(spec.size_mb); hasher.update(spec.start_byte); hasher.update(spec.end_byte); diff --git a/include/dftracer/utils/utilities/indexer/index_builder_utility.h b/include/dftracer/utils/utilities/indexer/index_builder_utility.h index 1036ca6a..cf86a8af 100644 --- a/include/dftracer/utils/utilities/indexer/index_builder_utility.h +++ b/include/dftracer/utils/utilities/indexer/index_builder_utility.h 
@@ -43,7 +43,7 @@ struct IndexBuildConfig { struct IndexBuildResult { std::string file_path; - std::string idx_path; + std::string index_path; bool success = false; bool was_skipped = false; bool index_created = false; diff --git a/include/dftracer/utils/utilities/indexer/index_database.h b/include/dftracer/utils/utilities/indexer/index_database.h index 2be2cc31..76846a7c 100644 --- a/include/dftracer/utils/utilities/indexer/index_database.h +++ b/include/dftracer/utils/utilities/indexer/index_database.h @@ -1,13 +1,16 @@ #ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_DATABASE_H #define DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_DATABASE_H -#include +#include +#include #include #include #include #include +#include #include +#include #include #include #include @@ -18,8 +21,8 @@ namespace dftracer::utils::utilities::indexer { /** - * @brief Unified .idx SQLite database combining checkpoint, bloom filter, - * and manifest data in a single sidecar file. + * @brief Unified `.dftindex` RocksDB store combining checkpoint, bloom + * filter, manifest, and archive metadata. * * Schema is additive: call init_base_schema() always, then * init_bloom_schema() and/or init_manifest_schema() as needed. 
@@ -44,8 +47,27 @@ class IndexDatabase { using ChunkDimensionStats = composites::dft::indexing::ChunkDimensionStats; using ChunkDimensionStatsResult = composites::dft::indexing::ChunkDimensionStatsResult; - - explicit IndexDatabase(const std::string& idx_path); + using IndexerCheckpoint = internal::IndexerCheckpoint; + struct TarArchiveMetadata { + std::string archive_name; + std::uint64_t checkpoint_size = 0; + std::uint64_t total_lines = 0; + std::uint64_t total_uc_size = 0; + std::uint64_t total_files = 0; + }; + struct TarFileRecord { + std::string file_name; + std::uint64_t file_size = 0; + std::uint64_t file_mtime = 0; + char typeflag = '\0'; + std::uint64_t data_offset = 0; + std::uint64_t uncompressed_offset = 0; + }; + + explicit IndexDatabase( + const std::string& index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode open_mode = + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadWrite); IndexDatabase(const IndexDatabase&) = delete; IndexDatabase& operator=(const IndexDatabase&) = delete; @@ -66,11 +88,16 @@ class IndexDatabase { int get_or_create_file_info(std::string_view path, std::uint64_t file_hash); int get_file_info_id(std::string_view path) const; + std::optional get_file_hash(std::string_view path) const; // Convenience: resolve file path to file_id (handles logical path) int find_file(std::string_view file_path) const; // Metadata queries + void insert_file_metadata(int file_id, std::uint64_t checkpoint_size, + std::uint64_t total_lines, + std::uint64_t total_uc_size); + std::uint64_t get_checkpoint_size(int file_id) const; std::uint64_t get_num_lines(int file_id) const; std::uint64_t get_max_bytes(int file_id) const; @@ -80,12 +107,7 @@ class IndexDatabase { void begin_transaction(); void commit_transaction(); - - sqlite3* db() const { return db_.get(); } - dftracer::utils::sqlite::SqliteDatabase& sql_db() { return db_; } - const dftracer::utils::sqlite::SqliteDatabase& sql_db() const { - return db_; - } + void 
rollback_transaction() noexcept; // ----------------------------------------------------------------------- // Bloom insert operations @@ -111,6 +133,7 @@ class IndexDatabase { void insert_chunk_statistics(int file_id, std::uint64_t checkpoint_idx, const ChunkStatistics& stats); + void insert_checkpoint(int file_id, const IndexerCheckpoint& checkpoint); void insert_index_dimension(int file_id, std::string_view dimension); @@ -121,6 +144,12 @@ class IndexDatabase { void insert_chunk_dimension_stats(int file_id, std::uint64_t checkpoint_idx, const ChunkDimensionStats& stats, std::size_t value_counts_cap = 4096); + void insert_tar_archive_metadata(int file_id, std::string_view archive_name, + std::uint64_t checkpoint_size, + std::uint64_t total_lines, + std::uint64_t total_uc_size, + std::uint64_t total_files); + void insert_tar_file(int file_id, const TarFileRecord& record); // ----------------------------------------------------------------------- // Bloom query operations @@ -146,6 +175,19 @@ class IndexDatabase { std::vector query_chunk_statistics( int file_id) const; + bool find_checkpoint(int file_id, std::size_t target_offset, + IndexerCheckpoint& checkpoint) const; + std::vector query_checkpoints(int file_id) const; + std::vector query_checkpoints_for_line_range( + int file_id, std::uint64_t start_line, std::uint64_t end_line) const; + std::optional query_tar_archive_metadata( + int file_id) const; + std::vector query_tar_files(int file_id) const; + bool find_tar_file(int file_id, std::string_view file_name, + TarFileRecord& record) const; + std::vector query_tar_files_in_range( + int file_id, std::uint64_t start_offset, + std::uint64_t end_offset) const; TimeBounds query_time_bounds(int file_id) const; @@ -215,7 +257,12 @@ class IndexDatabase { void delete_metadata_lines(int file_id); private: - dftracer::utils::sqlite::SqliteDatabase db_; + void delete_file_data(int file_id); + + std::string db_path_; + dftracer::utils::rocksdb::RocksDatabase::OpenMode 
open_mode_; + std::shared_ptr db_; + std::unique_ptr txn_batch_; }; } // namespace dftracer::utils::utilities::indexer diff --git a/include/dftracer/utils/utilities/indexer/internal/indexer.h b/include/dftracer/utils/utilities/indexer/internal/indexer.h index 68016cb5..dbc28ff3 100644 --- a/include/dftracer/utils/utilities/indexer/internal/indexer.h +++ b/include/dftracer/utils/utilities/indexer/internal/indexer.h @@ -12,7 +12,7 @@ typedef void *dft_indexer_handle_t; // C API function declarations dft_indexer_handle_t dft_indexer_create(const char *gz_path, - const char *idx_path, + const char *index_path, uint64_t checkpoint_size, int force_rebuild); int dft_indexer_build(dft_indexer_handle_t indexer); @@ -65,7 +65,7 @@ class Indexer { virtual void set_visitors(VisitorList visitors) { (void)visitors; } // Metadata accessors - virtual const std::string &get_idx_path() const = 0; + virtual const std::string &get_index_path() const = 0; virtual const std::string &get_archive_path() const = 0; virtual std::uint64_t get_checkpoint_size() const = 0; virtual std::uint64_t get_max_bytes() const = 0; diff --git a/include/dftracer/utils/utilities/indexer/internal/indexer_factory.h b/include/dftracer/utils/utilities/indexer/internal/indexer_factory.h index 4ad42a21..63ec35ad 100644 --- a/include/dftracer/utils/utilities/indexer/internal/indexer_factory.h +++ b/include/dftracer/utils/utilities/indexer/internal/indexer_factory.h @@ -22,15 +22,15 @@ class IndexerFactory { * appropriate indexer. 
* * @param archive_path Path to the archive file (.gz or .tar.gz) - * @param idx_path Path to the index file (optional - will be auto-generated - * if empty) + * @param index_path Path to the `.dftindex` store (optional - will be + * auto-generated if empty) * @param checkpoint_size Checkpoint size in bytes * @param force Force rebuilding the index even if it exists * @return Shared pointer to the appropriate indexer, or nullptr if format * not supported */ static std::shared_ptr create( - const std::string &archive_path, const std::string &idx_path = "", + const std::string &archive_path, const std::string &index_path = "", std::uint64_t checkpoint_size = constants::indexer::DEFAULT_CHECKPOINT_SIZE, bool force = false); diff --git a/include/dftracer/utils/utilities/indexer/internal/scan_prefix.h b/include/dftracer/utils/utilities/indexer/internal/scan_prefix.h new file mode 100644 index 00000000..a0118303 --- /dev/null +++ b/include/dftracer/utils/utilities/indexer/internal/scan_prefix.h @@ -0,0 +1,38 @@ +#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_SCAN_PREFIX_H +#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_SCAN_PREFIX_H + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace dftracer::utils::utilities::indexer::internal { + +template +void scan_prefix_iterator(std::string_view error_message, + std::string_view prefix, + IteratorFactory&& make_iterator, Fn&& fn) { + auto it = make_iterator(); + for (it->Seek(::rocksdb::Slice(prefix.data(), prefix.size())); + it->Valid() && std::string_view(it->key().data(), it->key().size()) + .starts_with(prefix); + it->Next()) { + fn(*it); + } + + const auto status = it->status(); + if (!status.ok()) { + throw IndexerError( + IndexerError::Type::DATABASE_ERROR, + std::string(error_message) + ": " + status.ToString()); + } +} + +} // namespace dftracer::utils::utilities::indexer::internal + +#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_SCAN_PREFIX_H diff --git 
a/include/dftracer/utils/utilities/indexer/provenance_database.h b/include/dftracer/utils/utilities/indexer/provenance_database.h index 1b6dd75a..974eb771 100644 --- a/include/dftracer/utils/utilities/indexer/provenance_database.h +++ b/include/dftracer/utils/utilities/indexer/provenance_database.h @@ -1,10 +1,12 @@ #ifndef DFTRACER_UTILS_UTILITIES_INDEXER_PROVENANCE_DATABASE_H #define DFTRACER_UTILS_UTILITIES_INDEXER_PROVENANCE_DATABASE_H -#include +#include +#include #include #include +#include #include #include #include @@ -12,12 +14,11 @@ namespace dftracer::utils::utilities::indexer { /** - * @brief Manages the .pidx SQLite database for provenance indices. + * @brief Manages provenance data in the shared `.dftindex` RocksDB store. * - * Sidecar database that records the full reorganization provenance of + * Shared index data that records the full reorganization provenance of * an output file: which source files contributed, which checkpoints, * and which line ranges map to which output lines. 
- * Path convention: file.pfw.gz -> file.pfw.gz.pidx * * Schema: * - file_info: output file identity (path + hash) @@ -34,7 +35,10 @@ class ProvenanceDatabase { using ProvenanceSegment = composites::dft::indexing::queries::ProvenanceSegment; - explicit ProvenanceDatabase(const std::string& pidx_path); + explicit ProvenanceDatabase( + const std::string& provenance_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode open_mode = + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadWrite); ProvenanceDatabase(const ProvenanceDatabase&) = delete; ProvenanceDatabase& operator=(const ProvenanceDatabase&) = delete; @@ -49,24 +53,24 @@ class ProvenanceDatabase { int get_file_info_id(const std::string& path) const; - dftracer::utils::sqlite::SqliteDatabase& db() { return db_; } - const dftracer::utils::sqlite::SqliteDatabase& db() const { return db_; } - void begin_transaction(); void commit_transaction(); + void rollback_transaction() noexcept; // ----------------------------------------------------------------------- // Provenance insert operations // ----------------------------------------------------------------------- - void insert_info(std::string_view key, std::string_view value); + void insert_info(int file_info_id, std::string_view key, + std::string_view value); void insert_source(int file_info_id, int source_idx, std::string_view path, int num_checkpoints, std::string_view event_hash = ""); - void insert_group(std::string_view name, std::string_view predicate); + void insert_group(int file_info_id, std::string_view name, + std::string_view predicate); - void insert_segment(int source_idx, int source_checkpoint, + void insert_segment(int file_info_id, int source_idx, int source_checkpoint, int output_line_start, int output_line_end, int event_count); @@ -76,22 +80,26 @@ class ProvenanceDatabase { std::vector query_sources(int file_info_id) const; - std::vector query_segments(int source_idx) const; + std::vector query_segments(int file_info_id, + int 
source_idx) const; - std::vector query_all_segments() const; + std::vector query_all_segments(int file_info_id) const; - std::string query_info(std::string_view key) const; + std::string query_info(int file_info_id, std::string_view key) const; - std::string query_group_name() const; + std::string query_group_name(int file_info_id) const; - std::string query_group_predicate() const; + std::string query_group_predicate(int file_info_id) const; private: - dftracer::utils::sqlite::SqliteDatabase db_; + std::string db_path_; + dftracer::utils::rocksdb::RocksDatabase::OpenMode open_mode_; + std::shared_ptr db_; + std::unique_ptr txn_batch_; }; /** - * @brief Determine the provenance index (.pidx) path for a given data file. + * @brief Determine the shared `.dftindex` provenance root for a data file. */ std::string determine_provenance_index_path(const std::string& data_path, const std::string& index_dir = ""); diff --git a/include/dftracer/utils/utilities/reader/internal/reader.h b/include/dftracer/utils/utilities/reader/internal/reader.h index f23f88e6..bd464f00 100644 --- a/include/dftracer/utils/utilities/reader/internal/reader.h +++ b/include/dftracer/utils/utilities/reader/internal/reader.h @@ -18,7 +18,8 @@ typedef void *dft_indexer_handle_t; * Opaque handle for DFT reader */ typedef void *dft_reader_handle_t; -dft_reader_handle_t dft_reader_create(const char *gz_path, const char *idx_path, +dft_reader_handle_t dft_reader_create(const char *gz_path, + const char *index_path, size_t index_ckpt_size); dft_reader_handle_t dft_reader_create_with_indexer( dft_indexer_handle_t indexer); @@ -73,7 +74,7 @@ class Reader { virtual std::size_t get_max_bytes() const = 0; virtual std::size_t get_num_lines() const = 0; virtual const std::string &get_archive_path() const = 0; - virtual const std::string &get_idx_path() const = 0; + virtual const std::string &get_index_path() const = 0; virtual void set_buffer_size(std::size_t size) = 0; // Estimate line count for a byte range 
(for pre-allocation) diff --git a/include/dftracer/utils/utilities/reader/internal/reader_factory.h b/include/dftracer/utils/utilities/reader/internal/reader_factory.h index 9986467e..4413440b 100644 --- a/include/dftracer/utils/utilities/reader/internal/reader_factory.h +++ b/include/dftracer/utils/utilities/reader/internal/reader_factory.h @@ -19,7 +19,7 @@ class ReaderFactory { * Create a reader for any supported archive format (returns Reader) */ static std::shared_ptr create( - const std::string &archive_path, const std::string &idx_path, + const std::string &archive_path, const std::string &index_path, std::size_t index_ckpt_size = dftracer::utils::utilities::indexer:: internal::Indexer::DEFAULT_CHECKPOINT_SIZE); diff --git a/include/dftracer/utils/utilities/reader/trace_reader.h b/include/dftracer/utils/utilities/reader/trace_reader.h index b2723b0f..ccd2341a 100644 --- a/include/dftracer/utils/utilities/reader/trace_reader.h +++ b/include/dftracer/utils/utilities/reader/trace_reader.h @@ -20,7 +20,7 @@ using fileio::lines::Line; /// File-level configuration for TraceReader. struct TraceReaderConfig { std::string file_path; ///< Path to trace file (.pfw.gz or plain). - std::string index_dir; ///< Directory for .idx sidecar files. + std::string index_dir; ///< Directory containing `.dftindex` roots. std::size_t checkpoint_size = 32 * 1024 * 1024; ///< Checkpoint interval. bool auto_build_index = false; ///< Auto-build index if missing. std::size_t index_threshold = @@ -62,7 +62,7 @@ class TraceReader { coro::AsyncGenerator> read_raw( ReadConfig config = {}); - /// True if an .idx sidecar was found at construction time. + /// True if a `.dftindex` database was found at construction time. bool has_index() const; /// Decompressed size (0 if no index for compressed files). 
std::size_t get_max_bytes(); @@ -72,7 +72,7 @@ class TraceReader { private: TraceReaderConfig config_; bool has_index_ = false; - std::string idx_path_; + std::string index_path_; ArchiveFormat format_ = ArchiveFormat::UNKNOWN; std::size_t cached_max_bytes_ = 0; std::size_t cached_num_lines_ = 0; diff --git a/python/dftracer/utils/dftracer_utils_ext.pyi b/python/dftracer/utils/dftracer_utils_ext.pyi index b8c02533..c9324a74 100644 --- a/python/dftracer/utils/dftracer_utils_ext.pyi +++ b/python/dftracer/utils/dftracer_utils_ext.pyi @@ -17,12 +17,12 @@ class IndexerCheckpoint: num_lines: int class Indexer: - """Indexer for creating and managing gzip file indices.""" + """Indexer for creating and managing root-local ``.dftindex`` stores.""" def __init__( self, gz_path: str, - idx_path: Optional[str] = None, + index_path: Optional[str] = None, checkpoint_size: int = 1048576, force_rebuild: bool = False, build_bloom: bool = False, @@ -34,7 +34,8 @@ class Indexer: Args: gz_path: Path to the gzip trace file. - idx_path: Path to the index file. If None, uses gz_path + ".idx". + index_path: Path to the `.dftindex` store. If None, uses the + root-local `.dftindex` next to ``gz_path``. checkpoint_size: Checkpoint size in bytes for index building. force_rebuild: If True, rebuild the index even if it exists. build_bloom: If True, build bloom filter data in the index. @@ -56,7 +57,7 @@ class Indexer: ... def exists(self) -> bool: - """Check if the index file exists.""" + """Check if the `.dftindex` store exists.""" ... def get_max_bytes(self) -> int: @@ -75,14 +76,22 @@ class Indexer: """Find checkpoint for target offset.""" ... + def close(self) -> None: + """Release this Python wrapper's native indexer handle. + + This does not force-close the shared RocksDB instance for the same + ``.dftindex`` path. + """ + ... + @property def gz_path(self) -> str: """Get gzip path.""" ... 
@property - def idx_path(self) -> str: - """Get index path.""" + def index_path(self) -> str: + """Get the `.dftindex` path.""" ... @property @@ -92,12 +101,12 @@ class Indexer: @property def has_bloom(self) -> bool: - """Whether bloom filter data exists in the index sidecar.""" + """Whether bloom filter data exists in the `.dftindex` store.""" ... @property def has_manifest(self) -> bool: - """Whether manifest data exists in the index sidecar.""" + """Whether manifest data exists in the `.dftindex` store.""" ... def __enter__(self) -> "Indexer": @@ -110,7 +119,11 @@ class Indexer: exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> None: - """Exit the runtime context for the with statement.""" + """Release this Python wrapper on context exit. + + This does not force-close the shared RocksDB instance for the same + ``.dftindex`` path. + """ ... # ========== JSON ========== @@ -321,7 +334,7 @@ class TraceReader: Args: file_path: Path to the trace file (.pfw.gz or plain text). - index_dir: Directory to search for ``.idx`` sidecar files. + index_dir: Directory to search for ``.dftindex`` stores. Empty string (default) searches next to the trace file. checkpoint_size: Checkpoint interval in bytes for index building (default 32 MB). @@ -514,7 +527,7 @@ class TraceReader: @property def index_dir(self) -> str: - """Directory searched for index sidecar files.""" + """Directory searched for `.dftindex` stores.""" ... 
@property diff --git a/setup.py b/setup.py index 5b69509e..1087223d 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,6 @@ from __future__ import annotations from setuptools import setup - from setuptools_scm import ScmVersion diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7f68597a..14b9493a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,7 +5,8 @@ add_rpath() need_zlib() -need_sqlite3() +need_lz4() +need_rocksdb() need_argparse() need_ghc_filesystem() need_cpplogger() @@ -31,6 +32,7 @@ set(DFTRACER_UTILS_CORE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/common/constants.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/common/format_detector.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/common/filesystem.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/env.cpp # Utilities ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/utils/timer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/utils/string.cpp @@ -56,12 +58,12 @@ set(DFTRACER_UTILS_CORE_SOURCES # Tasks ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/tasks/task.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/tasks/task_result.cpp - # SQLite - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/sqlite/error.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/sqlite/database.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/sqlite/statement.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/sqlite/vfs.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/sqlite/async.cpp + # RocksDB + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/rocksdb/database.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/rocksdb/filesystem.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/rocksdb/key_codec.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/rocksdb/async.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/rocksdb/db_manager.cpp ) # Conditionally add io_uring backend sources @@ -126,37 +128,6 @@ set(DFTRACER_UTILS_UTILITIES_SOURCES 
${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/insert_chunk_bloom_filter.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/insert_file_bloom_filter.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/insert_chunk_statistics.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/insert_index_dimension.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/insert_hash_resolution.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_bloom_filters.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_bloom_filters_batch.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/query_file_bloom_filter.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/query_file_bloom_filters_batch.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/query_index_dimensions.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/query_hash_by_resolved.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/query_resolved_by_hash.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/delete_chunk_bloom_filters.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/delete_file_bloom_filter.cpp - 
${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_statistics.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/query_time_bounds.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/delete_chunk_statistics.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/delete_chunk_dimension_stats.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/insert_chunk_dimension_stats.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_dimension_stats.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/delete_hash_resolutions.cpp - # Manifest index queries - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/insert_event_range.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/insert_metadata_lines.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/query_event_ranges.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/query_metadata_lines.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/delete_event_ranges.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/delete_metadata_lines.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/insert_provenance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/queries/query_provenance.cpp - # DFT Reorganization ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/reconstruction_planner.cpp @@ -197,33 +168,12 @@ 
set(DFTRACER_UTILS_UTILITIES_SOURCES # Indexer factory ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/indexer_factory.cpp # GZIP indexer - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/gzip/queries/delete_file_record.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/gzip/queries/insert_checkpoint_record.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/gzip/queries/insert_file_metadata_record.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/gzip/queries/insert_file_record.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/gzip/queries/query_checkpoint.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/gzip/queries/query_checkpoints.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/gzip/queries/query_file_id.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/gzip/queries/query_max_bytes.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/gzip/queries/query_num_lines.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/gzip/queries/query_schema_validity.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/gzip/queries/query_stored_file_info.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/gzip/queries/query_checkpoint_size.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/gzip/constants.cpp # TAR indexer ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/tar/tar_parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/tar/constants.cpp - 
${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/tar/queries/insert_file_record.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/tar/queries/insert_archive_record.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/tar/queries/insert_archive_metadata_record.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/tar/queries/insert_tar_file_record.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/tar/queries/insert_tar_checkpoint_record.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/tar/queries/query_archive_id.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/tar/queries/query_tar_files.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/tar/queries/query_tar_checkpoints.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/tar/queries/query_metadata.cpp # Reader ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/reader/internal/reader_c.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/reader/internal/stream_c.cpp @@ -279,14 +229,6 @@ set(PKG_CONFIG_LIBS_PRIVATE "") # Add Threads (always required) set(PKG_CONFIG_LIBS_PRIVATE "${PKG_CONFIG_LIBS_PRIVATE} -lpthread") -# Only add sqlite3 to pkg-config requirements if it was found on system -if(SQLite3_FOUND AND NOT SQLite3_CPM) - set(PKG_CONFIG_REQUIRES "${PKG_CONFIG_REQUIRES} sqlite3") - set(PKG_CONFIG_LIBS_PRIVATE "${PKG_CONFIG_LIBS_PRIVATE} -lsqlite3") -else() - set(PKG_CONFIG_LIBS_PRIVATE "${PKG_CONFIG_LIBS_PRIVATE} -lsqlite3") -endif() - # Only add zlib to pkg-config requirements if it was found on system if(ZLIB_FOUND AND NOT ZLIB_CPM) set(PKG_CONFIG_LIBS_PRIVATE "${PKG_CONFIG_LIBS_PRIVATE} -lz") @@ -373,7 +315,7 @@ foreach(variant shared static) # Link dependencies using helper functions link_cpp_logger(dftracer_utils_core_${variant} ${VARIANT_UPPER}) 
link_yyjson(dftracer_utils_core_${variant} ${VARIANT_UPPER}) - link_sqlite3(dftracer_utils_core_${variant} ${VARIANT_UPPER}) + link_rocksdb(dftracer_utils_core_${variant} ${VARIANT_UPPER}) link_zlib(dftracer_utils_core_${variant} ${VARIANT_UPPER}) # Add stdfs if needed @@ -822,6 +764,7 @@ if(DFTRACER_UTILS_BUILD_BINARIES) set_target_properties( ${bin_exec} PROPERTIES OUTPUT_NAME "${bin_exec}" RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + target_add_rpath(${bin_exec}) # Link to unified library (which includes both core and utilities) target_link_libraries(${bin_exec} PRIVATE dftracer_utils argparse::argparse) @@ -867,6 +810,7 @@ if(DFTRACER_UTILS_BUILD_EXAMPLES) add_executable(${example_name} ${example}) set_target_properties(${example_name} PROPERTIES OUTPUT_NAME "${example_name}") + target_add_rpath(${example_name}) target_link_libraries( ${example_name} PRIVATE dftracer_utils argparse::argparse) diff --git a/src/dftracer/utils/binaries/dftracer_aggregator.cpp b/src/dftracer/utils/binaries/dftracer_aggregator.cpp index 1a86b51a..c511cabe 100644 --- a/src/dftracer/utils/binaries/dftracer_aggregator.cpp +++ b/src/dftracer/utils/binaries/dftracer_aggregator.cpp @@ -265,6 +265,17 @@ static coro::CoroTask run_aggregator(argparse::ArgumentParser& program) { EventAggregatorUtility merger; std::atomic global_chunk_idx{0}; + if (force_rebuild && !input_files.empty()) { + const std::string shared_index_path = + composites::dft::internal::determine_index_path(input_files.front(), + index_dir); + if (fs::exists(shared_index_path)) { + DFTRACER_UTILS_LOG_INFO("Clearing shared index store: %s", + shared_index_path.c_str()); + fs::remove_all(shared_index_path); + } + } + // Streaming aggregation: file producers -> chunk workers -> merger auto streaming_task = make_task( [&](CoroScope& ctx) -> coro::CoroTask { @@ -283,23 +294,24 @@ static coro::CoroTask run_aggregator(argparse::ArgumentParser& program) { -> coro::CoroTask { [[maybe_unused]] auto producer_guard = ch.guard(); 
// Build index - std::string idx_path = + std::string index_path = composites::dft::internal::determine_index_path( file_path, index_dir); auto idx_input = indexer::IndexBuildConfig::for_file(file_path) .with_checkpoint_size(checkpoint_size) - .with_force_rebuild(force_rebuild) + .with_force_rebuild(false) .with_index_dir(index_dir); - indexer::IndexBuilderUtility{}.process(idx_input); + co_await indexer::IndexBuilderUtility{}.process( + idx_input); // Collect metadata auto meta_input = composites::dft::MetadataCollectorUtilityInput:: from_file(file_path) .with_checkpoint_size(checkpoint_size) - .with_force_rebuild(force_rebuild) - .with_index(idx_path); + .with_force_rebuild(false) + .with_index(index_path); auto metadata = co_await composites::dft::MetadataCollectorUtility{} .process(meta_input); diff --git a/src/dftracer/utils/binaries/dftracer_comparator.cpp b/src/dftracer/utils/binaries/dftracer_comparator.cpp index 2b2743a5..a9cd9750 100644 --- a/src/dftracer/utils/binaries/dftracer_comparator.cpp +++ b/src/dftracer/utils/binaries/dftracer_comparator.cpp @@ -79,7 +79,7 @@ static coro::CoroTask run_aggregation( -> coro::CoroTask { [[maybe_unused]] auto producer_guard = ch.guard(); - std::string idx_path = + std::string index_path = composites::dft::internal::determine_index_path( file_path, index_dir); @@ -88,7 +88,7 @@ static coro::CoroTask run_aggregation( from_file(file_path) .with_checkpoint_size(checkpoint_size) .with_force_rebuild(force_rebuild) - .with_index(idx_path); + .with_index(index_path); auto metadata = co_await composites::dft::MetadataCollectorUtility{} .process(meta_input); @@ -288,8 +288,19 @@ static coro::CoroTask run_comparator(argparse::ArgumentParser& program) { co_return 1; } - // Build indexes upfront so parallel aggregation doesn't race on .idx + // Build indexes upfront so parallel aggregation doesn't race on + // `.dftindex`. 
{ + if (config.force_rebuild && !baseline_files.empty()) { + const std::string shared_index_path = + composites::dft::internal::determine_index_path( + baseline_files.front(), config.index_dir); + if (fs::exists(shared_index_path)) { + DFTRACER_UTILS_LOG_INFO("Clearing shared index store: %s", + shared_index_path.c_str()); + fs::remove_all(shared_index_path); + } + } std::unordered_set seen; std::vector all_files; for (const auto& f : baseline_files) { @@ -306,7 +317,7 @@ static coro::CoroTask run_comparator(argparse::ArgumentParser& program) { idx_configs.push_back( indexer::IndexBuildConfig::for_file(file_path) .with_checkpoint_size(config.checkpoint_size) - .with_force_rebuild(config.force_rebuild) + .with_force_rebuild(false) .with_index_dir(config.index_dir)); } std::vector> idx_tasks; diff --git a/src/dftracer/utils/binaries/dftracer_event_count.cpp b/src/dftracer/utils/binaries/dftracer_event_count.cpp index 27b00836..c5e91c21 100644 --- a/src/dftracer/utils/binaries/dftracer_event_count.cpp +++ b/src/dftracer/utils/binaries/dftracer_event_count.cpp @@ -127,7 +127,6 @@ static coro::CoroTask run_event_count(argparse::ArgumentParser& program) { auto* total_events_ptr = &total_events; auto* files_processed_ptr = &files_processed; auto* is_approximate_ptr = &is_approximate; - auto file_chan = coro::make_channel(executor_threads * 2); @@ -162,19 +161,19 @@ static coro::CoroTask run_event_count(argparse::ArgumentParser& program) { co_await builder.process(config); // Read event count from index - std::string idx_path = + std::string index_path = fp + constants::indexer::EXTENSION; if (!index_dir.empty()) { auto fname = fs::path(fp).filename(); - idx_path = + index_path = (fs::path(index_dir) / fname).string() + constants::indexer::EXTENSION; } - if (fs::exists(idx_path)) { + if (fs::exists(index_path)) { try { utilities::indexer::IndexDatabase db( - idx_path); + index_path); int fid = db.find_file(fp); if (fid >= 0) { if (!db.has_bloom_data(fid)) { diff --git 
a/src/dftracer/utils/binaries/dftracer_gen_fake_trace.cpp b/src/dftracer/utils/binaries/dftracer_gen_fake_trace.cpp index 19540409..eb3f4c45 100644 --- a/src/dftracer/utils/binaries/dftracer_gen_fake_trace.cpp +++ b/src/dftracer/utils/binaries/dftracer_gen_fake_trace.cpp @@ -241,7 +241,7 @@ static coro::CoroTask run_verify( std::string abs_path = fs::absolute(file_path).string(); // 1. Build gzip index - std::string idx_path = internal::determine_index_path(abs_path, ""); + std::string index_path = internal::determine_index_path(abs_path, ""); auto idx_input = IndexBuildConfig::for_file(abs_path) .with_checkpoint_size(ckpt_size) .with_force_rebuild(true); @@ -251,7 +251,7 @@ static coro::CoroTask run_verify( auto meta_input = MetadataCollectorUtilityInput::from_file(abs_path) .with_checkpoint_size(ckpt_size) .with_force_rebuild(false) - .with_index(idx_path); + .with_index(index_path); auto metadata = co_await MetadataCollectorUtility{}.process(meta_input); if (!metadata.success) { @@ -307,7 +307,7 @@ static coro::CoroTask run_verify( for (const auto& chunk : chunks) { ChunkIndexerInput ci; ci.with_file_path(abs_path) - .with_idx_path(idx_path) + .with_index_path(index_path) .with_checkpoint_size(ckpt_size) .with_checkpoint_idx(chunk.idx) .with_byte_range(chunk.start, chunk.end) diff --git a/src/dftracer/utils/binaries/dftracer_index.cpp b/src/dftracer/utils/binaries/dftracer_index.cpp index 47488222..249f262a 100644 --- a/src/dftracer/utils/binaries/dftracer_index.cpp +++ b/src/dftracer/utils/binaries/dftracer_index.cpp @@ -130,7 +130,6 @@ static coro::CoroTask run_index(argparse::ArgumentParser& program) { auto* all_dims_ptr = &all_dimensions; auto* files_ptr = &input_files; auto* index_dir_ptr = &index_dir; - // Bounded fan-out: channel limits concurrent file processing // to avoid memory pressure from unbounded coroutine spawning. 
auto file_chan = @@ -247,7 +246,7 @@ int main(int argc, char** argv) { DFTRACER_UTILS_PACKAGE_VERSION); program.add_description( "Build per-chunk bloom filter indices for DFTracer trace files. " - "Creates .idx sidecar databases enabling fast chunk-skipping " + "Creates root-local .dftindex databases enabling fast chunk-skipping " "queries."); program.add_argument("-d", "--directory") @@ -278,7 +277,7 @@ int main(int argc, char** argv) { static_cast(dftracer_utils_hardware_concurrency())); program.add_argument("--index-dir") - .help("Directory to store index files (default: same as data files)") + .help("Directory where .dftindex stores are created") .default_value(""); program.add_argument("--expected-entries") @@ -300,7 +299,7 @@ int main(int argc, char** argv) { program.add_argument("--manifest") .help( - "Also build .idx manifest index " + "Also build manifest data in the .dftindex store " "(per-checkpoint event line routing)") .flag(); diff --git a/src/dftracer/utils/binaries/dftracer_info.cpp b/src/dftracer/utils/binaries/dftracer_info.cpp index 7c2dc03d..7c4a7191 100644 --- a/src/dftracer/utils/binaries/dftracer_info.cpp +++ b/src/dftracer/utils/binaries/dftracer_info.cpp @@ -49,7 +49,7 @@ static std::string format_size(std::uint64_t bytes) { return oss.str(); } -/// Fast path: read metadata from the .idx database. +/// Fast path: read metadata from the `.dftindex` database. /// Returns success=false if index doesn't exist, letting the caller /// fall back to direct_scan_info for small/unindexed files. 
static MetadataCollectorUtilityOutput index_based_info( @@ -60,13 +60,13 @@ static MetadataCollectorUtilityOutput index_based_info( meta.file_path = file_path; try { - std::string idx_path = file_path + constants::indexer::EXTENSION; - if (!fs::exists(idx_path)) { + std::string index_path = file_path + constants::indexer::EXTENSION; + if (!fs::exists(index_path)) { meta.success = false; return meta; } - IndexDatabase db(idx_path); + IndexDatabase db(index_path); int fid = db.find_file(file_path); if (fid < 0) { meta.success = false; @@ -78,6 +78,7 @@ static MetadataCollectorUtilityOutput index_based_info( meta.num_lines = db.get_num_lines(fid); meta.uncompressed_size = db.get_max_bytes(fid); meta.valid_events = db.get_total_events(fid); + meta.index_path = index_path; meta.has_index = true; meta.index_valid = true; meta.size_mb = @@ -97,7 +98,7 @@ static MetadataCollectorUtilityOutput index_based_info( } /// One streaming decompress pass, count lines with JSON validation, -/// no sidecar index created. +/// without creating a `.dftindex` store. static coro::CoroTask direct_scan_info( std::string file_path) { using dftracer::utils::utilities::fileio::lines::sources:: @@ -214,9 +215,9 @@ static void print_file_info(const MetadataCollectorUtilityOutput& info, if (info.format == ArchiveFormat::GZIP || info.format == ArchiveFormat::TAR_GZ) { std::printf("\nIndex Information:\n"); - std::printf(" Index File: %s\n", info.idx_path.empty() - ? "(auto-generated)" - : info.idx_path.c_str()); + std::printf(" Index Store: %s\n", info.index_path.empty() + ? "(auto-generated)" + : info.index_path.c_str()); std::printf(" Index Status: %s\n", info.has_index ? (info.index_valid ? 
"Valid" : "Invalid") : "Not Created"); @@ -239,8 +240,8 @@ static void print_file_info(const MetadataCollectorUtilityOutput& info, (unsigned long long)lines_per_checkpoint); // Calculate index overhead - if (fs::exists(info.idx_path)) { - std::uint64_t index_size = fs::file_size(info.idx_path); + if (fs::exists(info.index_path)) { + std::uint64_t index_size = fs::file_size(info.index_path); double index_overhead = 100.0 * static_cast(index_size) / static_cast(info.compressed_size); @@ -404,7 +405,7 @@ int main(int argc, char** argv) { } } - // Small files skip indexing to avoid creating sidecar files on + // Small files skip indexing to avoid creating `.dftindex` stores on // metadata-sensitive filesystems (e.g. Lustre). static constexpr std::size_t INDEX_SIZE_THRESHOLD = constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD; diff --git a/src/dftracer/utils/binaries/dftracer_organize.cpp b/src/dftracer/utils/binaries/dftracer_organize.cpp index a08db38b..1a1f9259 100644 --- a/src/dftracer/utils/binaries/dftracer_organize.cpp +++ b/src/dftracer/utils/binaries/dftracer_organize.cpp @@ -185,11 +185,11 @@ coro::CoroTask run_organize(const std::string& output_dir, std::printf(" Source files processed: %zu\n", router_result.source_files_processed); - // Step 4: Build sidecars for output chunk files + // Step 4: Build `.dftindex` stores for output chunk files. 
if (!router_result.output_files.empty()) { - std::printf("Step 4: Building sidecars...\n"); + std::printf("Step 4: Building .dftindex stores...\n"); auto pipeline_config = PipelineConfig() - .with_name("Organize: Build Sidecars") + .with_name("Organize: Build Index Stores") .with_compute_threads(executor_threads) .with_watchdog(false); @@ -197,7 +197,7 @@ coro::CoroTask run_organize(const std::string& output_dir, auto* output_files_ptr = &router_result.output_files; - auto sidecar_task = make_task( + auto index_store_task = make_task( [output_files_ptr, output_dir, checkpoint_size](CoroScope& ctx) -> coro::CoroTask { co_await ctx.scope( @@ -224,8 +224,8 @@ coro::CoroTask run_organize(const std::string& output_dir, }, "BuildSidecars"); - pipeline.set_source(sidecar_task); - pipeline.set_destination(sidecar_task); + pipeline.set_source(index_store_task); + pipeline.set_destination(index_store_task); pipeline.execute(); } @@ -293,7 +293,7 @@ int main(int argc, char** argv) { indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE)); program.add_argument("--index-dir") - .help("Directory for sidecar files") + .help("Directory for .dftindex stores") .default_value(""); program.add_argument("-f", "--force") diff --git a/src/dftracer/utils/binaries/dftracer_reader.cpp b/src/dftracer/utils/binaries/dftracer_reader.cpp index a15d203d..2e923f24 100644 --- a/src/dftracer/utils/binaries/dftracer_reader.cpp +++ b/src/dftracer/utils/binaries/dftracer_reader.cpp @@ -24,31 +24,33 @@ using namespace dftracer::utils::utilities::indexer::internal; using namespace dftracer::utils::utilities::reader::internal; static coro::CoroTask run_reader(const std::string &gz_path, - const std::string &idx_path, + const std::string &index_path, std::size_t checkpoint_size, bool force_rebuild, bool check_rebuild, const std::string &read_mode, std::size_t read_buffer_size, int64_t start, int64_t end) { + const std::string index_root = normalize_index_root(index_path); + // Create indexer first 
std::shared_ptr indexer; try { - // check if idx file exists - if (!fs::exists(idx_path)) { + // Check whether the root-local .dftindex store already exists. + if (!fs::exists(index_root)) { if (check_rebuild) { DFTRACER_UTILS_LOG_ERROR( - "Index file '%s' does not exist, cannot check", - idx_path.c_str()); + "Index store '%s' does not exist, cannot check", + index_root.c_str()); co_return 1; } - DFTRACER_UTILS_LOG_DEBUG("Index file '%s' does not exist", - idx_path.c_str()); - DFTRACER_UTILS_LOG_DEBUG("%s", "Will create new index file"); + DFTRACER_UTILS_LOG_DEBUG("Index store '%s' does not exist", + index_root.c_str()); + DFTRACER_UTILS_LOG_DEBUG("%s", "Will create new index store"); force_rebuild = true; } // Use IndexerFactory to create appropriate indexer - indexer = IndexerFactory::create(gz_path, idx_path, checkpoint_size, + indexer = IndexerFactory::create(gz_path, index_path, checkpoint_size, force_rebuild); if (check_rebuild) { @@ -60,15 +62,15 @@ static coro::CoroTask run_reader(const std::string &gz_path, } if (force_rebuild) { - if (fs::exists(idx_path)) { - DFTRACER_UTILS_LOG_DEBUG("Removing existing index: %s", - idx_path.c_str()); - fs::remove(idx_path); + if (fs::exists(index_root)) { + DFTRACER_UTILS_LOG_DEBUG("Removing existing index store: %s", + index_root.c_str()); + fs::remove_all(index_root); } - // Recreate indexer after removing old index - indexer = IndexerFactory::create(gz_path, idx_path, checkpoint_size, - true); - DFTRACER_UTILS_LOG_INFO("Building index for file: %s", + // Recreate the store after removing the old .dftindex root. 
+ indexer = IndexerFactory::create(gz_path, index_path, + checkpoint_size, true); + DFTRACER_UTILS_LOG_INFO("Building index store for file: %s", gz_path.c_str()); co_await indexer->build_async(); } @@ -184,7 +186,7 @@ int main(int argc, char **argv) { .help("Compressed file to process (GZIP, TAR.GZ)") .required(); program.add_argument("-i", "--index") - .help("Index file to use") + .help("Path to the .dftindex store to use") .default_value(""); program.add_argument("-s", "--start") .help("Start position in bytes") @@ -201,9 +203,11 @@ int main(int argc, char **argv) { .default_value( static_cast(Indexer::DEFAULT_CHECKPOINT_SIZE)); program.add_argument("-f", "--force-rebuild") - .help("Force rebuild index") + .help("Force rebuild the .dftindex store") + .flag(); + program.add_argument("--check") + .help("Check if the .dftindex store is valid") .flag(); - program.add_argument("--check").help("Check if index is valid").flag(); program.add_argument("--read-buffer-size") .help("Size of the read buffer in bytes (default: 1MB)") .default_value(1 * 1024 * 1024) @@ -213,7 +217,7 @@ int main(int argc, char **argv) { .default_value("bytes") .choices("bytes", "line_bytes", "lines"); program.add_argument("--index-dir") - .help("Directory to store index files (default: system temp directory)") + .help("Directory to store root-local .dftindex directories") .default_value(""); try { @@ -260,11 +264,8 @@ int main(int argc, char **argv) { } ::close(test_fd); - std::string idx_path; - if (!index_path.empty()) { - idx_path = index_path; - } else { - idx_path = utilities::composites::dft::internal::determine_index_path( + if (index_path.empty()) { + index_path = utilities::composites::dft::internal::determine_index_path( gz_path, index_dir); } @@ -277,7 +278,7 @@ int main(int argc, char **argv) { : "UNKNOWN"); #endif - return run_reader(gz_path, idx_path, checkpoint_size, force_rebuild, + return run_reader(gz_path, index_path, checkpoint_size, force_rebuild, check_rebuild, read_mode, 
read_buffer_size, start, end) .get(); } diff --git a/src/dftracer/utils/binaries/dftracer_reconstruct.cpp b/src/dftracer/utils/binaries/dftracer_reconstruct.cpp index d72aaeed..f40d0db1 100644 --- a/src/dftracer/utils/binaries/dftracer_reconstruct.cpp +++ b/src/dftracer/utils/binaries/dftracer_reconstruct.cpp @@ -207,7 +207,7 @@ static coro::CoroTask run_reconstruct(const std::string& directory, -> coro::CoroTask { co_await s.receive(permits); try { - std::string idx_path = + std::string index_path = internal::determine_index_path(reorg_file_copy, ""); @@ -215,14 +215,14 @@ static coro::CoroTask run_reconstruct(const std::string& directory, auto meta_input = MetadataCollectorUtilityInput::from_file( reorg_file_copy) - .with_index(idx_path) + .with_index(index_path) .with_checkpoint_size(checkpoint_size); auto meta = co_await meta_collector.process(meta_input); auto reader_input = IndexedReadInput::from_file(reorg_file_copy) - .with_index(idx_path) + .with_index(index_path) .with_checkpoint_size(checkpoint_size); IndexedFileReaderUtility reader_utility; auto reader = diff --git a/src/dftracer/utils/binaries/dftracer_server.cpp b/src/dftracer/utils/binaries/dftracer_server.cpp index d4671143..c740b21c 100644 --- a/src/dftracer/utils/binaries/dftracer_server.cpp +++ b/src/dftracer/utils/binaries/dftracer_server.cpp @@ -34,7 +34,7 @@ static coro::CoroTask run_server(argparse::ArgumentParser& program) { program.get("--executor-threads"); // When no explicit index dir is given, default to the trace - // directory so sidecar files (.idx) persist across restarts + // directory so `.dftindex` stores persist across restarts // and don't need to be rebuilt every time. 
if (index_dir.empty()) { index_dir = directory; @@ -132,7 +132,7 @@ int main(int argc, char** argv) { program.add_argument("--index-dir") .help( - "Directory for bloom/checkpoint index files (default: same as " + "Directory for root-local .dftindex stores (default: same as " "--directory)") .default_value(""); diff --git a/src/dftracer/utils/binaries/dftracer_split.cpp b/src/dftracer/utils/binaries/dftracer_split.cpp index b57f54a8..8be38958 100644 --- a/src/dftracer/utils/binaries/dftracer_split.cpp +++ b/src/dftracer/utils/binaries/dftracer_split.cpp @@ -232,6 +232,17 @@ int main(int argc, char** argv) { DFTRACER_UTILS_LOG_INFO("Found %zu input files", input_files.size()); + if (force) { + const std::string shared_index_path = + utilities::composites::dft::internal::determine_index_path( + input_files.front(), index_dir); + if (fs::exists(shared_index_path)) { + DFTRACER_UTILS_LOG_INFO("Clearing shared index store: %s", + shared_index_path.c_str()); + fs::remove_all(shared_index_path); + } + } + // Phase 2: Build TaskGraph for file processing auto graph = TaskGraph::builder( {.name = "DFTracerSplit", .max_concurrency = executor_threads}); @@ -246,7 +257,7 @@ int main(int argc, char** argv) { const auto& file_path = (*input_files_ptr)[idx]; // Determine index path - std::string idx_path = + std::string index_path = utilities::composites::dft::internal::determine_index_path( file_path, index_dir); @@ -254,17 +265,18 @@ int main(int argc, char** argv) { auto idx_input = utilities::indexer::IndexBuildConfig::for_file(file_path) .with_checkpoint_size(checkpoint_size) - .with_force_rebuild(force) + .with_force_rebuild(false) .with_index_dir(index_dir); - utilities::indexer::IndexBuilderUtility{}.process(idx_input); + co_await utilities::indexer::IndexBuilderUtility{}.process( + idx_input); // Collect metadata auto meta_input = utilities::composites::dft::MetadataCollectorUtilityInput:: from_file(file_path) .with_checkpoint_size(checkpoint_size) - 
.with_force_rebuild(force) - .with_index(idx_path) + .with_force_rebuild(false) + .with_index(index_path) .with_compute_hash(verify); co_return co_await utilities::composites::dft:: diff --git a/src/dftracer/utils/binaries/dftracer_stats.cpp b/src/dftracer/utils/binaries/dftracer_stats.cpp index 4165d5ff..5b9e322d 100644 --- a/src/dftracer/utils/binaries/dftracer_stats.cpp +++ b/src/dftracer/utils/binaries/dftracer_stats.cpp @@ -55,7 +55,7 @@ using dftracer::utils::utilities::indexer::IndexBuilderUtility; using dftracer::utils::utilities::indexer::IndexDatabase; // Files below this compressed size are scanned directly without building -// sidecar index files (.idx). At 8 MB compressed (~160 MB +// `.dftindex` stores. At 8 MB compressed (~160 MB // uncompressed with typical 20x JSON compression), a file has only a // handful of 32 MB checkpoints — the indexing overhead exceeds the // benefit of bloom-filter skip. @@ -456,7 +456,7 @@ static void print_text_detailed( std::printf("\n"); } -// Direct-scan a small .pfw.gz file without any sidecar index. +// Direct-scan a small .pfw.gz file without any persisted index store. // Streams lines via async_streaming_gz_lines, parses each with yyjson, // and accumulates stats via ChunkStatistics::update_from_event(). static coro::CoroTask direct_scan_trace_statistics( @@ -657,7 +657,7 @@ static coro::CoroTask direct_scan_detailed_statistics( // Per-chunk scanning coroutine for parallel detailed stats. // Scans a single chunk and merges results into shared file_detailed. 
static coro::CoroTask scan_chunk_detailed( - std::string file_path, std::string idx_path, std::size_t checkpoint_size, + std::string file_path, std::string index_path, std::size_t checkpoint_size, std::size_t file_size, std::size_t num_ckpts, std::uint64_t ckpt_idx, const std::vector* filter_names_ptr, const std::vector* filter_cats_ptr, @@ -676,7 +676,7 @@ static coro::CoroTask scan_chunk_detailed( ChunkDetailScanInput scan_input; scan_input.file_path = file_path; - scan_input.idx_path = idx_path; + scan_input.index_path = index_path; scan_input.checkpoint_size = checkpoint_size; scan_input.start_byte = start_byte; scan_input.end_byte = end_byte; @@ -709,12 +709,13 @@ static coro::CoroTask process_file_detailed( DetailedStatistics* aggregate_detailed_ptr, std::mutex* aggregate_mutex_ptr, std::mutex* output_mutex_ptr, std::vector>* json_results_ptr) { - std::string idx_path = internal::determine_index_path(file_path, index_dir); + std::string index_path = + internal::determine_index_path(file_path, index_dir); auto meta_input = MetadataCollectorUtilityInput::from_file(file_path) .with_checkpoint_size(checkpoint_size) .with_force_rebuild(false) - .with_index(idx_path); + .with_index(index_path); auto metadata = co_await MetadataCollectorUtility{}.process(meta_input); if (!metadata.success) { @@ -731,9 +732,9 @@ static coro::CoroTask process_file_detailed( std::vector candidate_checkpoints; std::uint64_t total_checkpoints = (num_ckpts == 0) ? 
1 : num_ckpts; - if (query_ptr && fs::exists(idx_path)) { + if (query_ptr && fs::exists(index_path)) { try { - ChunkPrunerInput pruner_input{idx_path, file_path, *query_ptr, + ChunkPrunerInput pruner_input{index_path, file_path, *query_ptr, nullptr}; ChunkPrunerUtility pruner; auto pruner_output = co_await pruner.process(pruner_input); @@ -766,19 +767,19 @@ static coro::CoroTask process_file_detailed( total_checkpoints - candidate_checkpoints.size(); auto chunk_mutex = std::make_shared(); - co_await fctx.scope([file_path, idx_path, checkpoint_size, file_size, + co_await fctx.scope([file_path, index_path, checkpoint_size, file_size, num_ckpts, filter_names_ptr, filter_cats_ptr, group_by_ptr, file_detailed, chunk_mutex, candidates = std::move(candidate_checkpoints)]( CoroScope& chunk_scope) -> coro::CoroTask { for (auto ckpt_idx : candidates) { chunk_scope.spawn( - [file_path, idx_path, checkpoint_size, file_size, num_ckpts, + [file_path, index_path, checkpoint_size, file_size, num_ckpts, ckpt_idx, filter_names_ptr, filter_cats_ptr, group_by_ptr, file_detailed, chunk_mutex](CoroScope& /*cctx*/) -> coro::CoroTask { co_return co_await scan_chunk_detailed( - file_path, idx_path, checkpoint_size, file_size, + file_path, index_path, checkpoint_size, file_size, num_ckpts, ckpt_idx, filter_names_ptr, filter_cats_ptr, group_by_ptr, file_detailed, chunk_mutex); }); @@ -788,9 +789,9 @@ static coro::CoroTask process_file_detailed( // Hash resolution (sequential, all chunks done) std::unordered_map hash_resolutions; - if (needs_hash_resolution && fs::exists(idx_path)) { + if (needs_hash_resolution && fs::exists(index_path)) { try { - IndexDatabase idx_db(idx_path); + IndexDatabase idx_db(index_path); auto logical = utilities::indexer::internal::get_logical_path(file_path); int file_info_id = idx_db.get_file_info_id(logical); @@ -996,11 +997,11 @@ static coro::CoroTask run_stats(argparse::ArgumentParser& program) { std::vector files_needing_index; std::vector small_files; for 
(const auto& file_path : files) { - std::string idx_path = + std::string index_path = internal::determine_index_path(file_path, index_dir); - if (fs::exists(idx_path)) { + if (fs::exists(index_path)) { try { - IndexDatabase db(idx_path); + IndexDatabase db(index_path); auto logical = utilities::indexer::internal::get_logical_path(file_path); int fid = db.get_file_info_id(logical); @@ -1370,8 +1371,8 @@ int main(int argc, char** argv) { DFTRACER_UTILS_PACKAGE_VERSION); program.add_description( "Display statistics for DFTracer trace files from pre-built " - "index (.idx) databases. Auto-builds indices if missing. " - "Zero-cost reads: only SQLite metadata, no decompression."); + ".dftindex databases. Auto-builds indexes if missing. " + "Zero-cost reads from RocksDB metadata, no decompression."); program.add_argument("--files") .help("Trace files to inspect (.pfw, .pfw.gz)") @@ -1383,7 +1384,7 @@ int main(int argc, char** argv) { .default_value(""); program.add_argument("--index-dir") - .help("Directory where .idx index files are stored") + .help("Directory where .dftindex stores are created") .default_value(""); program.add_argument("--json").help("Output in JSON format").flag(); @@ -1407,7 +1408,7 @@ int main(int argc, char** argv) { .default_value(static_cast(10)); program.add_argument("--no-auto-index") - .help("Disable automatic index building for files missing .idx") + .help("Disable automatic index building for files missing .dftindex") .flag(); program.add_argument("--checkpoint-size") diff --git a/src/dftracer/utils/binaries/dftracer_tar.cpp b/src/dftracer/utils/binaries/dftracer_tar.cpp index 7209b67a..d3ca22ba 100644 --- a/src/dftracer/utils/binaries/dftracer_tar.cpp +++ b/src/dftracer/utils/binaries/dftracer_tar.cpp @@ -36,8 +36,8 @@ static coro::CoroTask run_tar(const std::string& archive_path, DFTRACER_UTILS_LOG_INFO("Detected format: %s", indexer->get_format_name()); - DFTRACER_UTILS_LOG_INFO("Index file: %s", - indexer->get_idx_path().c_str()); + 
DFTRACER_UTILS_LOG_INFO("Index store: %s", + indexer->get_index_path().c_str()); // Build index if needed if (force_rebuild || indexer->need_rebuild()) { @@ -57,7 +57,7 @@ static coro::CoroTask run_tar(const std::string& archive_path, printf("Archive Information:\n"); printf(" Format: %s\n", indexer->get_format_name()); printf(" Path: %s\n", indexer->get_archive_path().c_str()); - printf(" Index: %s\n", indexer->get_idx_path().c_str()); + printf(" Index Store: %s\n", indexer->get_index_path().c_str()); printf(" Total size: %" PRIu64 " bytes\n", static_cast(indexer->get_max_bytes())); printf(" Total lines: %" PRIu64 "\n", indexer->get_num_lines()); @@ -106,7 +106,7 @@ int main(int argc, char** argv) { "DFTracer utility for indexing and analyzing TAR.GZ archives"); program.add_argument("file").help("TAR.GZ file to process").required(); program.add_argument("-i", "--index") - .help("Index file to use (auto-generated if not specified)") + .help("Path to the .dftindex store to use (auto-generated if omitted)") .default_value(""); program.add_argument("-c", "--checkpoint-size") .help("Checkpoint size for indexing in bytes") @@ -114,7 +114,7 @@ int main(int argc, char** argv) { .default_value( static_cast(Indexer::DEFAULT_CHECKPOINT_SIZE)); program.add_argument("-f", "--force-rebuild") - .help("Force rebuild index") + .help("Force rebuild the .dftindex store") .flag(); program.add_argument("--list-files") .help("List all files in the TAR archive") diff --git a/src/dftracer/utils/binaries/dftracer_view.cpp b/src/dftracer/utils/binaries/dftracer_view.cpp index 61bc5e2c..49fcb15f 100644 --- a/src/dftracer/utils/binaries/dftracer_view.cpp +++ b/src/dftracer/utils/binaries/dftracer_view.cpp @@ -75,11 +75,11 @@ static coro::CoroTask index_single_file(const std::string& file_path, } static coro::CoroTask read_single_chunk( - const std::string& file_path, const std::string& idx_path, + const std::string& file_path, const std::string& index_path, const ViewChunkCandidate& 
candidate, const ViewContext& vctx, CoroScope&) { ViewReaderInput reader_input; reader_input.with_file_path(file_path) - .with_idx_path(idx_path) + .with_index_path(index_path) .with_checkpoint_size(vctx.checkpoint_size) .with_byte_range(candidate.start_byte, candidate.end_byte) .with_checkpoint_idx(candidate.checkpoint_idx) @@ -117,14 +117,14 @@ static coro::CoroTask read_single_chunk( static coro::CoroTask process_single_file(const std::string& file_path, const ViewContext& vctx, CoroScope& fctx) { - std::string idx_path = + std::string index_path = internal::determine_index_path(file_path, vctx.index_dir); // Collect metadata auto meta_input = MetadataCollectorUtilityInput::from_file(file_path) .with_checkpoint_size(vctx.checkpoint_size) .with_force_rebuild(false) - .with_index(idx_path); + .with_index(index_path); auto metadata = co_await MetadataCollectorUtility{}.process(meta_input); if (!metadata.success) { @@ -138,7 +138,7 @@ static coro::CoroTask process_single_file(const std::string& file_path, ViewBuilderInput builder_input; builder_input.with_view(vctx.view) .with_file_path(file_path) - .with_idx_path(fs::exists(idx_path) ? idx_path : "") + .with_index_path(fs::exists(index_path) ? 
index_path : "") .with_uncompressed_size(metadata.uncompressed_size) .with_num_checkpoints(metadata.num_checkpoints); @@ -159,13 +159,13 @@ static coro::CoroTask process_single_file(const std::string& file_path, // Process each candidate chunk auto& candidates = build_output.candidates; - co_await fctx.scope([&file_path, &idx_path, &vctx, &candidates]( + co_await fctx.scope([&file_path, &index_path, &vctx, &candidates]( CoroScope& chunk_scope) -> coro::CoroTask { for (const auto& candidate : candidates) { - chunk_scope.spawn([&file_path, &idx_path, &candidate, + chunk_scope.spawn([&file_path, &index_path, &candidate, &vctx](CoroScope& cctx) -> coro::CoroTask { - co_await read_single_chunk(file_path, idx_path, candidate, vctx, - cctx); + co_await read_single_chunk(file_path, index_path, candidate, + vctx, cctx); }); } co_return; @@ -331,9 +331,9 @@ static coro::CoroTask run_view(argparse::ArgumentParser& program) { std::vector files_needing_index; for (const auto& file_path : files) { - std::string idx_path = + std::string index_path = internal::determine_index_path(file_path, index_dir); - if (!fs::exists(idx_path)) { + if (!fs::exists(index_path)) { files_needing_index.push_back(file_path); } } @@ -341,7 +341,8 @@ static coro::CoroTask run_view(argparse::ArgumentParser& program) { if (!files_needing_index.empty()) { if (no_auto_index) { DFTRACER_UTILS_LOG_ERROR( - "Missing .idx index for %zu file(s) and --no-auto-index is " + "Missing .dftindex store for %zu file(s) and --no-auto-index " + "is " "set. 
Run dftracer_index first.", files_needing_index.size()); for (const auto& f : files_needing_index) { @@ -539,11 +540,11 @@ int main(int argc, char** argv) { // Indexing options program.add_argument("--index-dir") - .help("Directory where .idx index files are stored") + .help("Directory where .dftindex stores are created") .default_value(""); program.add_argument("--no-auto-index") - .help("Disable automatic index building for files missing .idx") + .help("Disable automatic index building for files missing .dftindex") .flag(); program.add_argument("--checkpoint-size") diff --git a/src/dftracer/utils/core/env.cpp b/src/dftracer/utils/core/env.cpp new file mode 100644 index 00000000..703751f3 --- /dev/null +++ b/src/dftracer/utils/core/env.cpp @@ -0,0 +1,57 @@ +#include + +#include +#include +#include + +namespace dftracer::utils { + +template <> +std::optional Env::get( + std::string_view name) { + std::string key(name); + const char* value = std::getenv(key.c_str()); + if (value == nullptr || value[0] == '\0') { + return std::nullopt; + } + return std::string_view(value); +} + +template <> +std::optional Env::get(std::string_view name) { + auto value = get(name); + if (!value.has_value()) { + return std::nullopt; + } + + int parsed = 0; + auto* begin = value->data(); + auto* end = begin + value->size(); + auto [ptr, ec] = std::from_chars(begin, end, parsed); + if (ec != std::errc{} || ptr != end) { + return std::nullopt; + } + return parsed; +} + +int Env::rocksdb_max_open_files() { + static const int cached_value = [] { + constexpr int default_max_open_files = 32; + constexpr std::string_view env_name = + "DFTRACER_UTILS_ROCKSDB_MAX_OPEN_FILES"; + + auto configured = get(env_name); + if (!configured.has_value()) { + return default_max_open_files; + } + + if (*configured == -1 || *configured > 0) { + return *configured; + } + return default_max_open_files; + }(); + + return cached_value; +} + +} // namespace dftracer::utils diff --git 
a/src/dftracer/utils/core/io/epoll_thread_pool_backend.cpp b/src/dftracer/utils/core/io/epoll_thread_pool_backend.cpp index 3a86ebc8..1f733957 100644 --- a/src/dftracer/utils/core/io/epoll_thread_pool_backend.cpp +++ b/src/dftracer/utils/core/io/epoll_thread_pool_backend.cpp @@ -161,6 +161,23 @@ IoAwaitable EpollThreadPoolBackend::submit_pread(int fd, void* buf, &executor_, &pool_); } +void EpollThreadPoolBackend::submit_pread_callback(int fd, void* buf, + std::size_t len, + off_t offset, + IoCompletionFn completion, + void* context) { + auto* req = new IoRequest{}; + req->op = IoOp::PREAD; + req->fd = fd; + req->buf = buf; + req->len = len; + req->offset = offset; + req->completion = completion; + req->completion_ctx = context; + req->pool = &pool_; + pool_.submit([req] { execute_request(req); }); +} + IoAwaitable EpollThreadPoolBackend::submit_pwrite(int fd, const void* buf, std::size_t len, off_t offset) { @@ -364,8 +381,12 @@ void EpollThreadPoolBackend::execute_request(IoRequest* req) { } if (result < 0) result = -errno; - req->awaitable->result_ = result; - req->executor->enqueue(req->awaitable->handle_); + if (req->awaitable != nullptr) { + req->awaitable->result_ = result; + req->executor->enqueue(req->awaitable->handle_); + } else if (req->completion != nullptr) { + req->completion(req->completion_ctx, result); + } delete req; } diff --git a/src/dftracer/utils/core/io/epoll_thread_pool_backend.h b/src/dftracer/utils/core/io/epoll_thread_pool_backend.h index 11cb2db4..cd9b50dd 100644 --- a/src/dftracer/utils/core/io/epoll_thread_pool_backend.h +++ b/src/dftracer/utils/core/io/epoll_thread_pool_backend.h @@ -37,6 +37,9 @@ class EpollThreadPoolBackend : public IoBackend { IoAwaitable submit_write(int fd, const void* buf, std::size_t len) override; IoAwaitable submit_pread(int fd, void* buf, std::size_t len, off_t offset) override; + void submit_pread_callback(int fd, void* buf, std::size_t len, off_t offset, + IoCompletionFn completion, + void* context) 
override; IoAwaitable submit_pwrite(int fd, const void* buf, std::size_t len, off_t offset) override; IoAwaitable submit_open(const char* path, int flags, mode_t mode) override; diff --git a/src/dftracer/utils/core/io/io_backend_sync.cpp b/src/dftracer/utils/core/io/io_backend_sync.cpp index 3dc32d88..23662bf8 100644 --- a/src/dftracer/utils/core/io/io_backend_sync.cpp +++ b/src/dftracer/utils/core/io/io_backend_sync.cpp @@ -9,8 +9,8 @@ namespace dftracer::utils::io { ssize_t IoBackend::submit_read_sync(int fd, void *buf, std::size_t len, off_t offset) { // For sync wrappers, we cannot use the normal IoAwaitable coroutine - // path. Instead, we directly call the POSIX syscall. The VFS runs - // on a dedicated SQLite thread (not an executor worker), so blocking + // path. Instead, we directly call the POSIX syscall. This synchronous + // path is used from dedicated blocking contexts, so blocking // is acceptable. ssize_t result = ::pread(fd, buf, len, offset); if (result < 0) result = -errno; diff --git a/src/dftracer/utils/core/io/io_uring_backend.cpp b/src/dftracer/utils/core/io/io_uring_backend.cpp index 70670172..f4c67bf1 100644 --- a/src/dftracer/utils/core/io/io_uring_backend.cpp +++ b/src/dftracer/utils/core/io/io_uring_backend.cpp @@ -266,6 +266,8 @@ void IoUringBackend::completion_loop() { if (req->awaitable) { req->awaitable->result_ = cqe->res; executor_.enqueue(req->awaitable->handle_); + } else if (req->completion != nullptr) { + req->completion(req->completion_ctx, cqe->res); } delete req; } @@ -305,26 +307,44 @@ void IoUringBackend::submit_fn(SubmitContext* ctx, IoAwaitable* awaitable) { case IoUringSubmitCtx::Op::FTRUNCATE: { ssize_t sync_result = ::ftruncate(uring_ctx->fd, uring_ctx->offset); if (sync_result < 0) sync_result = -errno; - awaitable->result_ = sync_result; + if (awaitable != nullptr) { + awaitable->result_ = sync_result; + } else if (uring_ctx->completion != nullptr) { + uring_ctx->completion(uring_ctx->completion_ctx, sync_result); + 
} delete uring_ctx; - backend->executor_.enqueue(awaitable->handle_); + if (awaitable != nullptr) { + backend->executor_.enqueue(awaitable->handle_); + } return; } case IoUringSubmitCtx::Op::FSTAT: { ssize_t sync_result = ::fstat(uring_ctx->fd, uring_ctx->stat_buf); if (sync_result < 0) sync_result = -errno; - awaitable->result_ = sync_result; + if (awaitable != nullptr) { + awaitable->result_ = sync_result; + } else if (uring_ctx->completion != nullptr) { + uring_ctx->completion(uring_ctx->completion_ctx, sync_result); + } delete uring_ctx; - backend->executor_.enqueue(awaitable->handle_); + if (awaitable != nullptr) { + backend->executor_.enqueue(awaitable->handle_); + } return; } case IoUringSubmitCtx::Op::LSEEK: { ssize_t sync_result = ::lseek(uring_ctx->fd, uring_ctx->offset, uring_ctx->whence); if (sync_result < 0) sync_result = -errno; - awaitable->result_ = sync_result; + if (awaitable != nullptr) { + awaitable->result_ = sync_result; + } else if (uring_ctx->completion != nullptr) { + uring_ctx->completion(uring_ctx->completion_ctx, sync_result); + } delete uring_ctx; - backend->executor_.enqueue(awaitable->handle_); + if (awaitable != nullptr) { + backend->executor_.enqueue(awaitable->handle_); + } return; } case IoUringSubmitCtx::Op::SENDFILE: { @@ -332,9 +352,15 @@ void IoUringBackend::submit_fn(SubmitContext* ctx, IoAwaitable* awaitable) { ssize_t sync_result = ::sendfile(uring_ctx->dest_fd, uring_ctx->fd, &off, uring_ctx->len); if (sync_result < 0) sync_result = -errno; - awaitable->result_ = sync_result; + if (awaitable != nullptr) { + awaitable->result_ = sync_result; + } else if (uring_ctx->completion != nullptr) { + uring_ctx->completion(uring_ctx->completion_ctx, sync_result); + } delete uring_ctx; - backend->executor_.enqueue(awaitable->handle_); + if (awaitable != nullptr) { + backend->executor_.enqueue(awaitable->handle_); + } return; } default: @@ -344,6 +370,8 @@ void IoUringBackend::submit_fn(SubmitContext* ctx, IoAwaitable* awaitable) { // 
Create the request object that will be stored in SQE user_data auto* req = new IoUringRequest{}; req->awaitable = awaitable; + req->completion = uring_ctx->completion; + req->completion_ctx = uring_ctx->completion_ctx; std::lock_guard lock(backend->submit_mutex_); @@ -412,10 +440,16 @@ void IoUringBackend::submit_fn(SubmitContext* ctx, IoAwaitable* awaitable) { break; } if (result < 0) result = -errno; - awaitable->result_ = result; + if (awaitable != nullptr) { + awaitable->result_ = result; + } else if (uring_ctx->completion != nullptr) { + uring_ctx->completion(uring_ctx->completion_ctx, result); + } delete req; delete uring_ctx; - backend->executor_.enqueue(awaitable->handle_); + if (awaitable != nullptr) { + backend->executor_.enqueue(awaitable->handle_); + } return; } @@ -538,6 +572,18 @@ IoAwaitable IoUringBackend::submit_pread(int fd, void* buf, std::size_t len, nullptr, 0, 0, this); } +void IoUringBackend::submit_pread_callback(int fd, void* buf, std::size_t len, + off_t offset, + IoCompletionFn completion, + void* context) { + auto awaitable = make_uring_request(IoUringSubmitCtx::Op::PREAD, fd, buf, + len, offset, nullptr, 0, 0, this); + auto* uring_ctx = static_cast(awaitable.submit_ctx_); + uring_ctx->completion = completion; + uring_ctx->completion_ctx = context; + submit_fn(uring_ctx, nullptr); +} + IoAwaitable IoUringBackend::submit_pwrite(int fd, const void* buf, std::size_t len, off_t offset) { return make_uring_request(IoUringSubmitCtx::Op::PWRITE, fd, diff --git a/src/dftracer/utils/core/io/io_uring_backend.h b/src/dftracer/utils/core/io/io_uring_backend.h index 3c8e5d3f..fc156c84 100644 --- a/src/dftracer/utils/core/io/io_uring_backend.h +++ b/src/dftracer/utils/core/io/io_uring_backend.h @@ -31,6 +31,8 @@ struct IoUringRequest { } IoAwaitable* awaitable = nullptr; + IoCompletionFn completion = nullptr; + void* completion_ctx = nullptr; }; /// io_uring I/O backend using raw syscalls (no liburing dependency). 
@@ -52,6 +54,9 @@ class IoUringBackend : public IoBackend { IoAwaitable submit_write(int fd, const void* buf, std::size_t len) override; IoAwaitable submit_pread(int fd, void* buf, std::size_t len, off_t offset) override; + void submit_pread_callback(int fd, void* buf, std::size_t len, off_t offset, + IoCompletionFn completion, + void* context) override; IoAwaitable submit_pwrite(int fd, const void* buf, std::size_t len, off_t offset) override; IoAwaitable submit_open(const char* path, int flags, mode_t mode) override; @@ -147,6 +152,8 @@ struct IoUringSubmitCtx : SubmitContext { int whence = 0; int dest_fd = -1; IoUringBackend* backend = nullptr; + IoCompletionFn completion = nullptr; + void* completion_ctx = nullptr; }; } // namespace dftracer::utils::io diff --git a/src/dftracer/utils/core/io/kqueue_thread_pool_backend.cpp b/src/dftracer/utils/core/io/kqueue_thread_pool_backend.cpp index 195810da..811a4afc 100644 --- a/src/dftracer/utils/core/io/kqueue_thread_pool_backend.cpp +++ b/src/dftracer/utils/core/io/kqueue_thread_pool_backend.cpp @@ -147,6 +147,23 @@ IoAwaitable KqueueThreadPoolBackend::submit_pread(int fd, void* buf, &executor_, &pool_); } +void KqueueThreadPoolBackend::submit_pread_callback(int fd, void* buf, + std::size_t len, + off_t offset, + IoCompletionFn completion, + void* context) { + auto* req = new IoRequest{}; + req->op = IoOp::PREAD; + req->fd = fd; + req->buf = buf; + req->len = len; + req->offset = offset; + req->completion = completion; + req->completion_ctx = context; + req->pool = &pool_; + pool_.submit([req] { execute_request(req); }); +} + IoAwaitable KqueueThreadPoolBackend::submit_pwrite(int fd, const void* buf, std::size_t len, off_t offset) { @@ -383,8 +400,12 @@ void KqueueThreadPoolBackend::execute_request(IoRequest* req) { } if (result < 0) result = -errno; - req->awaitable->result_ = result; - req->executor->enqueue(req->awaitable->handle_); + if (req->awaitable != nullptr) { + req->awaitable->result_ = result; + 
req->executor->enqueue(req->awaitable->handle_); + } else if (req->completion != nullptr) { + req->completion(req->completion_ctx, result); + } delete req; } diff --git a/src/dftracer/utils/core/io/kqueue_thread_pool_backend.h b/src/dftracer/utils/core/io/kqueue_thread_pool_backend.h index 78bf2933..34aae394 100644 --- a/src/dftracer/utils/core/io/kqueue_thread_pool_backend.h +++ b/src/dftracer/utils/core/io/kqueue_thread_pool_backend.h @@ -39,6 +39,9 @@ class KqueueThreadPoolBackend : public IoBackend { IoAwaitable submit_write(int fd, const void* buf, std::size_t len) override; IoAwaitable submit_pread(int fd, void* buf, std::size_t len, off_t offset) override; + void submit_pread_callback(int fd, void* buf, std::size_t len, off_t offset, + IoCompletionFn completion, + void* context) override; IoAwaitable submit_pwrite(int fd, const void* buf, std::size_t len, off_t offset) override; IoAwaitable submit_open(const char* path, int flags, mode_t mode) override; diff --git a/src/dftracer/utils/core/io/thread_pool_backend.cpp b/src/dftracer/utils/core/io/thread_pool_backend.cpp index b1a23b49..7a7df728 100644 --- a/src/dftracer/utils/core/io/thread_pool_backend.cpp +++ b/src/dftracer/utils/core/io/thread_pool_backend.cpp @@ -63,6 +63,22 @@ IoAwaitable ThreadPoolBackend::submit_pread(int fd, void* buf, std::size_t len, &executor_, &pool_); } +void ThreadPoolBackend::submit_pread_callback(int fd, void* buf, + std::size_t len, off_t offset, + IoCompletionFn completion, + void* context) { + auto* req = new IoRequest{}; + req->op = IoOp::PREAD; + req->fd = fd; + req->buf = buf; + req->len = len; + req->offset = offset; + req->completion = completion; + req->completion_ctx = context; + req->pool = &pool_; + pool_.submit([req] { execute_request(req); }); +} + IoAwaitable ThreadPoolBackend::submit_pwrite(int fd, const void* buf, std::size_t len, off_t offset) { return make_request(IoOp::PWRITE, fd, const_cast(buf), len, offset, @@ -300,8 +316,12 @@ void 
ThreadPoolBackend::execute_request(IoRequest* req) { } if (result < 0) result = -errno; - req->awaitable->result_ = result; - req->executor->enqueue(req->awaitable->handle_); + if (req->awaitable != nullptr) { + req->awaitable->result_ = result; + req->executor->enqueue(req->awaitable->handle_); + } else if (req->completion != nullptr) { + req->completion(req->completion_ctx, result); + } delete req; } @@ -313,4 +333,4 @@ int ThreadPoolBackend::flush() { return static_cast(pool_.flush()); } std::string ThreadPoolBackend::name() const { return "threadpool"; } -} // namespace dftracer::utils::io \ No newline at end of file +} // namespace dftracer::utils::io diff --git a/src/dftracer/utils/core/io/thread_pool_backend.h b/src/dftracer/utils/core/io/thread_pool_backend.h index 6d548069..cce372e2 100644 --- a/src/dftracer/utils/core/io/thread_pool_backend.h +++ b/src/dftracer/utils/core/io/thread_pool_backend.h @@ -65,6 +65,8 @@ struct IoRequest : SubmitContext { int whence = 0; int dest_fd = -1; IoAwaitable* awaitable = nullptr; + IoCompletionFn completion = nullptr; + void* completion_ctx = nullptr; Executor* executor = nullptr; IoThreadPool* pool = nullptr; }; @@ -83,6 +85,9 @@ class ThreadPoolBackend : public IoBackend { IoAwaitable submit_write(int fd, const void* buf, std::size_t len) override; IoAwaitable submit_pread(int fd, void* buf, std::size_t len, off_t offset) override; + void submit_pread_callback(int fd, void* buf, std::size_t len, off_t offset, + IoCompletionFn completion, + void* context) override; IoAwaitable submit_pwrite(int fd, const void* buf, std::size_t len, off_t offset) override; IoAwaitable submit_open(const char* path, int flags, mode_t mode) override; diff --git a/src/dftracer/utils/core/pipeline/executor.cpp b/src/dftracer/utils/core/pipeline/executor.cpp index 9eb68422..12029f51 100644 --- a/src/dftracer/utils/core/pipeline/executor.cpp +++ b/src/dftracer/utils/core/pipeline/executor.cpp @@ -4,7 +4,6 @@ #include #include #include 
-#include #include #include @@ -33,9 +32,7 @@ Executor* Executor::set_current(Executor* e) noexcept { return old; } -io::IoThreadPool* Executor::sqlite_pool() noexcept { - return sqlite_pool_.get(); -} +io::IoThreadPool* Executor::db_pool() noexcept { return db_pool_.get(); } // Thread-local list of coroutine handles to destroy after the current // resume() returns. FinalAwaiter pushes here instead of the shared @@ -70,7 +67,7 @@ Executor::Executor(const ExecutorConfig& config) io_pool_size_(config.io_pool_size), io_backend_type_(config.io_backend_type), io_batch_threshold_(config.io_batch_threshold), - sqlite_pool_size_(config.sqlite_pool_size) { + db_pool_size_(config.db_pool_size) { if (num_threads_ == 0) { num_threads_ = 2; // Fallback if hardware_concurrency returns 0 } @@ -102,10 +99,9 @@ void Executor::start() { io_backend_ = io::create_io_backend(*this, io_pool_size_, io_backend_type_, io_batch_threshold_); io_backend_->start(); - sqlite::register_dftracer_sqlite_vfs(io_backend_.get(), this); - sqlite_pool_ = std::make_unique(sqlite_pool_size_); - sqlite_pool_->start(); + db_pool_ = std::make_unique(db_pool_size_); + db_pool_->start(); // Create all worker contexts first so workers_ is stable before any // worker thread can try to iterate/steal from it. @@ -147,13 +143,11 @@ void Executor::shutdown() { // completion thread may still call enqueue() -> wake_all_workers() // which accesses WorkerContext cv/mutex, so workers_ must remain // alive until the completion thread has exited. - if (sqlite_pool_) { - sqlite_pool_->stop(); - sqlite_pool_.reset(); + if (db_pool_) { + db_pool_->stop(); + db_pool_.reset(); } - sqlite::unregister_dftracer_sqlite_vfs(); - if (io_backend_) { io_backend_->stop(); io_backend_.reset(); @@ -242,6 +236,7 @@ void Executor::worker_thread(WorkerContext* context) { // thread. Safe: resume() has fully returned, so the frame // is suspended at final_suspend and no code references it. 
drain_thread_local_destroys(); + drain_destroy_queue(); } // No work available -- sleep until signaled. else { @@ -258,9 +253,11 @@ void Executor::worker_thread(WorkerContext* context) { if (io_backend_) { auto reaped = io_backend_->poll(0); if (reaped > 0) { + drain_destroy_queue(); continue; } } + drain_destroy_queue(); std::unique_lock lock(context->queue_mutex); context->cv.wait(lock, [this, observed_signal] { return !running_.load(std::memory_order_acquire) || diff --git a/src/dftracer/utils/core/pipeline/pipeline.cpp b/src/dftracer/utils/core/pipeline/pipeline.cpp index 5fa3ec2b..b546974d 100644 --- a/src/dftracer/utils/core/pipeline/pipeline.cpp +++ b/src/dftracer/utils/core/pipeline/pipeline.cpp @@ -20,7 +20,7 @@ Pipeline::Pipeline(const PipelineConfig& config) exec_cfg.io_pool_size = config.io_thread_count; exec_cfg.io_backend_type = config.io_backend_type; exec_cfg.io_batch_threshold = config.io_batch_threshold; - exec_cfg.sqlite_pool_size = config.sqlite_pool_size; + exec_cfg.db_pool_size = config.db_pool_size; std::unique_ptr watchdog; if (config.enable_watchdog) { diff --git a/src/dftracer/utils/core/rocksdb/async.cpp b/src/dftracer/utils/core/rocksdb/async.cpp new file mode 100644 index 00000000..6d3b016b --- /dev/null +++ b/src/dftracer/utils/core/rocksdb/async.cpp @@ -0,0 +1,32 @@ +#include +#include +#include + +namespace dftracer::utils::rocksdb { + +io::IoThreadPool* get_db_pool() { + auto* exec = Executor::current(); + if (exec == nullptr) { + return nullptr; + } + return exec->db_pool(); +} + +void db_async_submit(io::IoThreadPool* pool, std::function fn) { + pool->submit(std::move(fn)); +} + +void db_async_resume_on(void* executor, std::coroutine_handle<> h) { + auto* exec = static_cast(executor); + if (exec != nullptr) { + exec->enqueue(h); + } else { + h.resume(); + } +} + +void* get_current_executor_opaque() { + return static_cast(Executor::current()); +} + +} // namespace dftracer::utils::rocksdb diff --git 
a/src/dftracer/utils/core/rocksdb/database.cpp b/src/dftracer/utils/core/rocksdb/database.cpp new file mode 100644 index 00000000..1b227a67 --- /dev/null +++ b/src/dftracer/utils/core/rocksdb/database.cpp @@ -0,0 +1,275 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace dftracer::utils::rocksdb { + +namespace { + +std::atomic& process_exiting_flag() { + static std::atomic flag{false}; + return flag; +} + +const ::rocksdb::ReadOptions& read_options() { + static const ::rocksdb::ReadOptions options; + return options; +} + +const ::rocksdb::WriteOptions& write_options() { + static const ::rocksdb::WriteOptions options; + return options; +} + +void cleanup_failed_open(::rocksdb::DB*& db, + std::vector<::rocksdb::ColumnFamilyHandle*>& handles) { + if (db != nullptr) { + for (auto* handle : handles) { + if (handle != nullptr) { + db->DestroyColumnFamilyHandle(handle); + } + } + static_cast(db->Close()); + delete db; + db = nullptr; + } + handles.clear(); +} + +} // namespace + +void mark_process_exiting_for_rocksdb() { + process_exiting_flag().store(true, std::memory_order_relaxed); +} + +RocksDatabase::RocksDatabase() = default; + +RocksDatabase::RocksDatabase(const std::string& db_path, OpenMode open_mode) { + open(db_path, open_mode); +} + +RocksDatabase::~RocksDatabase() { close(); } + +RocksDatabase::RocksDatabase(RocksDatabase&& other) noexcept + : db_path_(std::move(other.db_path_)), + open_mode_(other.open_mode_), + file_system_(std::move(other.file_system_)), + env_(std::move(other.env_)), + db_(std::exchange(other.db_, nullptr)), + column_families_(std::move(other.column_families_)) {} + +RocksDatabase& RocksDatabase::operator=(RocksDatabase&& other) noexcept { + if (this != &other) { + close(); + db_path_ = std::move(other.db_path_); + open_mode_ = other.open_mode_; + file_system_ = std::move(other.file_system_); + env_ = std::move(other.env_); + db_ = std::exchange(other.db_, nullptr); + 
column_families_ = std::move(other.column_families_); + } + return *this; +} + +std::vector RocksDatabase::default_column_families() { + return {"default", "checkpoints", "metadata", "chunk_bloom", + "file_bloom", "chunk_stats", "dimensions", "chunk_dim_stats", + "manifest", "provenance", "archives", "tar_files"}; +} + +::rocksdb::Options RocksDatabase::default_options() { + ::rocksdb::Options options; + options.create_if_missing = true; + options.create_missing_column_families = true; + options.allow_concurrent_memtable_write = true; + options.enable_pipelined_write = true; + options.max_open_files = Env::rocksdb_max_open_files(); + return options; +} + +::rocksdb::ColumnFamilyOptions RocksDatabase::default_column_family_options() { + ::rocksdb::ColumnFamilyOptions options; + options.compression = ::rocksdb::kLZ4Compression; + options.bottommost_compression = ::rocksdb::kZlibCompression; + return options; +} + +bool RocksDatabase::open(const std::string& db_path, OpenMode open_mode) { + close(); + db_path_ = db_path; + open_mode_ = open_mode; + + std::error_code ec; + if (open_mode_ == OpenMode::ReadWrite) { + fs::create_directories(fs::path(db_path_), ec); + } + + auto db_options = default_options(); + if (open_mode_ == OpenMode::ReadOnly) { + db_options.create_if_missing = false; + db_options.create_missing_column_families = false; + } + file_system_ = make_dftracer_file_system(); + env_ = make_dftracer_env(file_system_); + db_options.env = env_.get(); + auto cf_options = default_column_family_options(); + + std::vector column_family_names; + auto list_status = ::rocksdb::DB::ListColumnFamilies(db_options, db_path_, + &column_family_names); + if (!list_status.ok()) { + if (open_mode_ == OpenMode::ReadOnly) { + throw std::runtime_error( + "Failed to list RocksDB column families at '" + db_path_ + + "': " + list_status.ToString()); + } + column_family_names = default_column_families(); + } else { + if (open_mode_ == OpenMode::ReadWrite) { + for (const auto& name : 
default_column_families()) { + if (std::find(column_family_names.begin(), + column_family_names.end(), + name) == column_family_names.end()) { + column_family_names.push_back(name); + } + } + } + } + + std::vector<::rocksdb::ColumnFamilyDescriptor> descriptors; + descriptors.reserve(column_family_names.size()); + for (const auto& name : column_family_names) { + descriptors.emplace_back(name, cf_options); + } + + std::vector<::rocksdb::ColumnFamilyHandle*> handles; + auto status = + open_mode_ == OpenMode::ReadOnly + ? ::rocksdb::DB::OpenForReadOnly(db_options, db_path_, descriptors, + &handles, &db_, false) + : ::rocksdb::DB::Open(db_options, db_path_, descriptors, &handles, + &db_); + if (!status.ok()) { + cleanup_failed_open(db_, handles); + throw std::runtime_error("Failed to open RocksDB at '" + db_path_ + + "': " + status.ToString()); + } + + column_families_.clear(); + for (std::size_t i = 0; i < descriptors.size(); ++i) { + column_families_.emplace(descriptors[i].name, handles[i]); + } + + return true; +} + +void RocksDatabase::close() { + if (db_ == nullptr) { + column_families_.clear(); + return; + } + + if (process_exiting_flag().load(std::memory_order_relaxed)) { + db_ = nullptr; + column_families_.clear(); + env_.reset(); + file_system_.reset(); + db_path_.clear(); + return; + } + + for (auto& entry : column_families_) { + if (entry.second != nullptr) { + db_->DestroyColumnFamilyHandle(entry.second); + entry.second = nullptr; + } + } + column_families_.clear(); + + auto* db = db_; + db_ = nullptr; + static_cast(db->Close()); + delete db; + env_.reset(); + file_system_.reset(); + db_path_.clear(); +} + +bool RocksDatabase::is_open() const noexcept { return db_ != nullptr; } + +bool RocksDatabase::is_read_only() const noexcept { + return open_mode_ == OpenMode::ReadOnly; +} + +const std::string& RocksDatabase::path() const noexcept { return db_path_; } + +::rocksdb::DB* RocksDatabase::get() const noexcept { return db_; } + +::rocksdb::ColumnFamilyHandle* 
RocksDatabase::column_family_handle( + std::string_view column_family) const { + const auto name = column_family.empty() ? std::string("default") + : std::string(column_family); + const auto it = column_families_.find(name); + if (it == column_families_.end() || it->second == nullptr) { + throw std::invalid_argument("Unknown RocksDB column family: " + name); + } + return it->second; +} + +::rocksdb::Status RocksDatabase::put(std::string_view key, + std::string_view value, + std::string_view column_family) { + return db_->Put(write_options(), column_family_handle(column_family), + ::rocksdb::Slice(key.data(), key.size()), + ::rocksdb::Slice(value.data(), value.size())); +} + +::rocksdb::Status RocksDatabase::get(std::string_view key, std::string* value, + std::string_view column_family) const { + return db_->Get(read_options(), column_family_handle(column_family), + ::rocksdb::Slice(key.data(), key.size()), value); +} + +::rocksdb::Status RocksDatabase::del(std::string_view key, + std::string_view column_family) { + return db_->Delete(write_options(), column_family_handle(column_family), + ::rocksdb::Slice(key.data(), key.size())); +} + +::rocksdb::Status RocksDatabase::put(Batch& batch, + std::string_view column_family, + std::string_view key, + std::string_view value) { + return batch.Put(column_family_handle(column_family), + ::rocksdb::Slice(key.data(), key.size()), + ::rocksdb::Slice(value.data(), value.size())); +} + +::rocksdb::Status RocksDatabase::del(Batch& batch, + std::string_view column_family, + std::string_view key) { + return batch.Delete(column_family_handle(column_family), + ::rocksdb::Slice(key.data(), key.size())); +} + +RocksDatabase::Batch RocksDatabase::begin_batch() const { return Batch(); } + +::rocksdb::Status RocksDatabase::commit_batch(Batch& batch) { + return db_->Write(write_options(), &batch); +} + +std::unique_ptr<::rocksdb::Iterator> RocksDatabase::new_iterator( + std::string_view column_family) const { + return 
std::unique_ptr<::rocksdb::Iterator>( + db_->NewIterator(read_options(), column_family_handle(column_family))); +} + +} // namespace dftracer::utils::rocksdb diff --git a/src/dftracer/utils/core/rocksdb/db_manager.cpp b/src/dftracer/utils/core/rocksdb/db_manager.cpp new file mode 100644 index 00000000..a9ae5c57 --- /dev/null +++ b/src/dftracer/utils/core/rocksdb/db_manager.cpp @@ -0,0 +1,143 @@ +#include + +#include + +namespace dftracer::utils::rocksdb { + +RocksDBManager& RocksDBManager::instance() { + static RocksDBManager manager; + return manager; +} + +std::shared_ptr RocksDBManager::get_or_open( + const std::string& db_path, RocksDatabase::OpenMode open_mode) { + for (;;) { + bool needs_upgrade = false; + bool do_open = false; + + { + std::unique_lock lock(mutex_); + + for (;;) { + if (auto it = databases_.find(db_path); + it != databases_.end()) { + auto current = it->second.lock(); + if (!current) { + databases_.erase(it); + continue; + } + if (!(current->is_read_only() && + open_mode == RocksDatabase::OpenMode::ReadWrite)) { + return current; + } + + if (opening_.contains(db_path)) { + cv_.wait(lock, + [&] { return !opening_.contains(db_path); }); + continue; + } + + if (current.use_count() != 1) { + throw std::runtime_error( + "Cannot upgrade RocksDB instance at '" + db_path + + "' from read-only to read-write while it is still " + "in use"); + } + + needs_upgrade = true; + opening_.insert(db_path); + do_open = true; + break; + } + + if (opening_.contains(db_path)) { + cv_.wait(lock, [&] { return !opening_.contains(db_path); }); + continue; + } + + opening_.insert(db_path); + do_open = true; + break; + } + } + + if (!do_open) { + continue; + } + + std::shared_ptr database; + try { + database = std::make_shared( + db_path, + needs_upgrade ? RocksDatabase::OpenMode::ReadWrite : open_mode); + } catch (...) 
{ + std::lock_guard lock(mutex_); + opening_.erase(db_path); + cv_.notify_all(); + throw; + } + + { + std::lock_guard lock(mutex_); + auto it = databases_.find(db_path); + + if (it == databases_.end()) { + databases_[db_path] = database; + opening_.erase(db_path); + cv_.notify_all(); + return database; + } + + auto current = it->second.lock(); + if (!current) { + databases_[db_path] = database; + opening_.erase(db_path); + cv_.notify_all(); + return database; + } + + if (!(current->is_read_only() && + open_mode == RocksDatabase::OpenMode::ReadWrite)) { + opening_.erase(db_path); + cv_.notify_all(); + return current; + } + + if (current.use_count() != 1) { + opening_.erase(db_path); + cv_.notify_all(); + throw std::runtime_error( + "Cannot upgrade RocksDB instance at '" + db_path + + "' from read-only to read-write while it is still in use"); + } + + databases_[db_path] = database; + opening_.erase(db_path); + cv_.notify_all(); + return database; + } + } +} + +void RocksDBManager::reset(const std::string& db_path) { + std::unique_lock lock(mutex_); + + cv_.wait(lock, [&] { return !opening_.contains(db_path); }); + + auto it = databases_.find(db_path); + if (it == databases_.end()) { + return; + } + + databases_.erase(it); +} + +void RocksDBManager::shutdown() { + { + std::unique_lock lock(mutex_); + cv_.wait(lock, [&] { return opening_.empty(); }); + databases_.clear(); + } +} + +} // namespace dftracer::utils::rocksdb diff --git a/src/dftracer/utils/core/rocksdb/filesystem.cpp b/src/dftracer/utils/core/rocksdb/filesystem.cpp new file mode 100644 index 00000000..1d31f791 --- /dev/null +++ b/src/dftracer/utils/core/rocksdb/filesystem.cpp @@ -0,0 +1,849 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftracer::utils::rocksdb { + +namespace { + +io::IoBackend* current_io_backend() { + auto* executor = 
Executor::current(); + if (executor == nullptr || !executor->has_io_backend()) { + return nullptr; + } + return &executor->io_backend(); +} + +class DfTracerFileSystem; + +struct AsyncReadHandle { + explicit AsyncReadHandle(DfTracerFileSystem* owner_) : owner(owner_) {} + + static void* operator new(std::size_t size) { + return ObjectPool::instance().allocate(size); + } + + static void operator delete(void* ptr, std::size_t size) noexcept { + ObjectPool::instance().deallocate(ptr, size); + } + + DfTracerFileSystem* owner; + std::mutex mutex; + std::condition_variable cv; + bool finished = false; + bool callback_delivered = false; + bool aborted = false; + bool running = false; + std::string path; + std::uint64_t offset = 0; + std::size_t len = 0; + char* scratch = nullptr; + ::rocksdb::Slice result; + ::rocksdb::IOStatus status; + std::function callback; + void* callback_arg = nullptr; +}; + +::rocksdb::IOStatus io_error(std::string_view op, std::string_view path) { + return ::rocksdb::IOStatus::IOError( + std::string(path), std::string(op) + ": " + std::strerror(errno)); +} + +ssize_t pread_sync(int fd, void* buf, std::size_t len, off_t offset) { + if (auto* backend = current_io_backend(); backend != nullptr) { + return backend->submit_read_sync(fd, buf, len, offset); + } + return ::pread(fd, buf, len, offset); +} + +ssize_t pwrite_sync(int fd, const void* buf, std::size_t len, off_t offset) { + if (auto* backend = current_io_backend(); backend != nullptr) { + return backend->submit_write_sync(fd, buf, len, offset); + } + return ::pwrite(fd, buf, len, offset); +} + +int fsync_sync(int fd) { + if (auto* backend = current_io_backend(); backend != nullptr) { + return backend->submit_fsync_sync(fd); + } + return ::fsync(fd); +} + +int ftruncate_sync(int fd, off_t length) { + if (auto* backend = current_io_backend(); backend != nullptr) { + return backend->submit_ftruncate_sync(fd, length); + } + return ::ftruncate(fd, length); +} + +int fstat_sync(int fd, struct stat* 
st) { + if (auto* backend = current_io_backend(); backend != nullptr) { + return backend->submit_fstat_sync(fd, st); + } + return ::fstat(fd, st); +} + +class DfTracerSequentialFile final : public ::rocksdb::FSSequentialFile { + public: + DfTracerSequentialFile(std::string path, int fd) + : path_(std::move(path)), fd_(fd) {} + + ~DfTracerSequentialFile() override { + if (fd_ >= 0) { + ::close(fd_); + } + } + + ::rocksdb::IOStatus Read(std::size_t n, const ::rocksdb::IOOptions&, + ::rocksdb::Slice* result, char* scratch, + ::rocksdb::IODebugContext*) override { + std::lock_guard lock(mutex_); + const ssize_t bytes = + pread_sync(fd_, scratch, n, static_cast(offset_)); + if (bytes < 0) { + return io_error("read", path_); + } + offset_ += static_cast(bytes); + *result = ::rocksdb::Slice(scratch, static_cast(bytes)); + return ::rocksdb::IOStatus::OK(); + } + + ::rocksdb::IOStatus Skip(std::uint64_t n) override { + std::lock_guard lock(mutex_); + offset_ += n; + return ::rocksdb::IOStatus::OK(); + } + + ::rocksdb::IOStatus InvalidateCache(std::size_t, std::size_t) override { + return ::rocksdb::IOStatus::OK(); + } + + private: + std::string path_; + int fd_; + std::uint64_t offset_ = 0; + std::mutex mutex_; +}; + +class DfTracerRandomAccessFile final : public ::rocksdb::FSRandomAccessFile { + public: + DfTracerRandomAccessFile(DfTracerFileSystem* owner, std::string path, + int fd) + : owner_(owner), path_(std::move(path)), fd_(fd) {} + + ~DfTracerRandomAccessFile() override { + if (fd_ >= 0) { + ::close(fd_); + } + } + + ::rocksdb::IOStatus Read(std::uint64_t offset, std::size_t n, + const ::rocksdb::IOOptions&, + ::rocksdb::Slice* result, char* scratch, + ::rocksdb::IODebugContext*) const override { + const ssize_t bytes = + pread_sync(fd_, scratch, n, static_cast(offset)); + if (bytes < 0) { + return io_error("pread", path_); + } + *result = ::rocksdb::Slice(scratch, static_cast(bytes)); + return ::rocksdb::IOStatus::OK(); + } + + ::rocksdb::IOStatus 
Prefetch(std::uint64_t, std::size_t, + const ::rocksdb::IOOptions&, + ::rocksdb::IODebugContext*) override { + return ::rocksdb::IOStatus::OK(); + } + + ::rocksdb::IOStatus ReadAsync( + ::rocksdb::FSReadRequest& req, const ::rocksdb::IOOptions& opts, + std::function cb, void* cb_arg, + void** io_handle, ::rocksdb::IOHandleDeleter* del_fn, + ::rocksdb::IODebugContext* dbg) override; + + ::rocksdb::IOStatus InvalidateCache(std::size_t, std::size_t) override { + return ::rocksdb::IOStatus::OK(); + } + + ::rocksdb::IOStatus GetFileSize(std::uint64_t* result) override { + struct stat st{}; + if (fstat_sync(fd_, &st) != 0) { + return io_error("fstat", path_); + } + *result = static_cast(st.st_size); + return ::rocksdb::IOStatus::OK(); + } + + private: + DfTracerFileSystem* owner_; + std::string path_; + int fd_; +}; + +class DfTracerWritableFile final : public ::rocksdb::FSWritableFile { + public: + using ::rocksdb::FSWritableFile::Append; + using ::rocksdb::FSWritableFile::PositionedAppend; + + DfTracerWritableFile(std::string path, int fd, + const ::rocksdb::FileOptions& options) + : ::rocksdb::FSWritableFile(options), path_(std::move(path)), fd_(fd) { + struct stat st{}; + if (fstat_sync(fd_, &st) == 0) { + size_ = static_cast(st.st_size); + } + } + + ~DfTracerWritableFile() override { + if (fd_ >= 0) { + static_cast(close_fd()); + } + } + + ::rocksdb::IOStatus Append(const ::rocksdb::Slice& data, + const ::rocksdb::IOOptions&, + ::rocksdb::IODebugContext*) override { + std::lock_guard lock(mutex_); + return write_at(data, size_); + } + + ::rocksdb::IOStatus PositionedAppend(const ::rocksdb::Slice& data, + std::uint64_t offset, + const ::rocksdb::IOOptions&, + ::rocksdb::IODebugContext*) override { + std::lock_guard lock(mutex_); + return write_at(data, offset); + } + + ::rocksdb::IOStatus Truncate(std::uint64_t size, + const ::rocksdb::IOOptions&, + ::rocksdb::IODebugContext*) override { + std::lock_guard lock(mutex_); + if (ftruncate_sync(fd_, static_cast(size)) != 
0) { + return io_error("ftruncate", path_); + } + size_ = size; + return ::rocksdb::IOStatus::OK(); + } + + ::rocksdb::IOStatus Close(const ::rocksdb::IOOptions&, + ::rocksdb::IODebugContext*) override { + std::lock_guard lock(mutex_); + return close_fd(); + } + + ::rocksdb::IOStatus Flush(const ::rocksdb::IOOptions&, + ::rocksdb::IODebugContext*) override { + return ::rocksdb::IOStatus::OK(); + } + + ::rocksdb::IOStatus Sync(const ::rocksdb::IOOptions&, + ::rocksdb::IODebugContext*) override { + std::lock_guard lock(mutex_); + if (fd_ < 0) { + return ::rocksdb::IOStatus::OK(); + } + if (fsync_sync(fd_) != 0) { + return io_error("fsync", path_); + } + return ::rocksdb::IOStatus::OK(); + } + + bool IsSyncThreadSafe() const override { return true; } + + std::uint64_t GetFileSize(const ::rocksdb::IOOptions&, + ::rocksdb::IODebugContext*) override { + std::lock_guard lock(mutex_); + return size_; + } + + ::rocksdb::IOStatus InvalidateCache(std::size_t, std::size_t) override { + return ::rocksdb::IOStatus::OK(); + } + + ::rocksdb::IOStatus RangeSync(std::uint64_t, std::uint64_t, + const ::rocksdb::IOOptions& options, + ::rocksdb::IODebugContext* dbg) override { + return Sync(options, dbg); + } + + private: + ::rocksdb::IOStatus write_at(const ::rocksdb::Slice& data, + std::uint64_t offset) { + const ssize_t bytes = pwrite_sync(fd_, data.data(), data.size(), + static_cast(offset)); + if (bytes < 0 || static_cast(bytes) != data.size()) { + return io_error("pwrite", path_); + } + size_ = std::max(size_, offset + static_cast(bytes)); + return ::rocksdb::IOStatus::OK(); + } + + ::rocksdb::IOStatus close_fd() { + if (fd_ < 0) { + return ::rocksdb::IOStatus::OK(); + } + if (::close(fd_) != 0) { + return io_error("close", path_); + } + fd_ = -1; + return ::rocksdb::IOStatus::OK(); + } + + std::string path_; + int fd_; + std::uint64_t size_ = 0; + mutable std::mutex mutex_; +}; + +class LocalFileSystemWrapper : public ::rocksdb::FileSystem { + public: + explicit 
LocalFileSystemWrapper( + const std::shared_ptr<::rocksdb::FileSystem>& target) + : target_(target) {} + + ::rocksdb::FileSystem* target() const { return target_.get(); } + + ::rocksdb::IOStatus NewSequentialFile( + const std::string& f, const ::rocksdb::FileOptions& file_opts, + std::unique_ptr<::rocksdb::FSSequentialFile>* r, + ::rocksdb::IODebugContext* dbg) override { + return target_->NewSequentialFile(f, file_opts, r, dbg); + } + + ::rocksdb::IOStatus NewRandomAccessFile( + const std::string& f, const ::rocksdb::FileOptions& file_opts, + std::unique_ptr<::rocksdb::FSRandomAccessFile>* r, + ::rocksdb::IODebugContext* dbg) override { + return target_->NewRandomAccessFile(f, file_opts, r, dbg); + } + + ::rocksdb::IOStatus NewWritableFile( + const std::string& f, const ::rocksdb::FileOptions& file_opts, + std::unique_ptr<::rocksdb::FSWritableFile>* r, + ::rocksdb::IODebugContext* dbg) override { + return target_->NewWritableFile(f, file_opts, r, dbg); + } + + ::rocksdb::IOStatus ReopenWritableFile( + const std::string& fname, const ::rocksdb::FileOptions& file_opts, + std::unique_ptr<::rocksdb::FSWritableFile>* result, + ::rocksdb::IODebugContext* dbg) override { + return target_->ReopenWritableFile(fname, file_opts, result, dbg); + } + + ::rocksdb::IOStatus ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const ::rocksdb::FileOptions& file_opts, + std::unique_ptr<::rocksdb::FSWritableFile>* r, + ::rocksdb::IODebugContext* dbg) override { + return target_->ReuseWritableFile(fname, old_fname, file_opts, r, dbg); + } + + ::rocksdb::IOStatus NewRandomRWFile( + const std::string& fname, const ::rocksdb::FileOptions& file_opts, + std::unique_ptr<::rocksdb::FSRandomRWFile>* result, + ::rocksdb::IODebugContext* dbg) override { + return target_->NewRandomRWFile(fname, file_opts, result, dbg); + } + + ::rocksdb::IOStatus NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr<::rocksdb::MemoryMappedFileBuffer>* result) 
override { + return target_->NewMemoryMappedFileBuffer(fname, result); + } + + ::rocksdb::IOStatus NewDirectory( + const std::string& name, const ::rocksdb::IOOptions& io_opts, + std::unique_ptr<::rocksdb::FSDirectory>* result, + ::rocksdb::IODebugContext* dbg) override { + return target_->NewDirectory(name, io_opts, result, dbg); + } + + ::rocksdb::IOStatus FileExists(const std::string& f, + const ::rocksdb::IOOptions& io_opts, + ::rocksdb::IODebugContext* dbg) override { + return target_->FileExists(f, io_opts, dbg); + } + + ::rocksdb::IOStatus GetChildren(const std::string& dir, + const ::rocksdb::IOOptions& io_opts, + std::vector* r, + ::rocksdb::IODebugContext* dbg) override { + return target_->GetChildren(dir, io_opts, r, dbg); + } + + ::rocksdb::IOStatus GetChildrenFileAttributes( + const std::string& dir, const ::rocksdb::IOOptions& options, + std::vector<::rocksdb::FileAttributes>* result, + ::rocksdb::IODebugContext* dbg) override { + return target_->GetChildrenFileAttributes(dir, options, result, dbg); + } + + ::rocksdb::IOStatus DeleteFile(const std::string& f, + const ::rocksdb::IOOptions& options, + ::rocksdb::IODebugContext* dbg) override { + return target_->DeleteFile(f, options, dbg); + } + + ::rocksdb::IOStatus Truncate(const std::string& fname, size_t size, + const ::rocksdb::IOOptions& options, + ::rocksdb::IODebugContext* dbg) override { + return target_->Truncate(fname, size, options, dbg); + } + + ::rocksdb::IOStatus CreateDir(const std::string& d, + const ::rocksdb::IOOptions& options, + ::rocksdb::IODebugContext* dbg) override { + return target_->CreateDir(d, options, dbg); + } + + ::rocksdb::IOStatus CreateDirIfMissing( + const std::string& d, const ::rocksdb::IOOptions& options, + ::rocksdb::IODebugContext* dbg) override { + return target_->CreateDirIfMissing(d, options, dbg); + } + + ::rocksdb::IOStatus DeleteDir(const std::string& d, + const ::rocksdb::IOOptions& options, + ::rocksdb::IODebugContext* dbg) override { + return 
target_->DeleteDir(d, options, dbg); + } + + ::rocksdb::IOStatus GetFileSize(const std::string& f, + const ::rocksdb::IOOptions& options, + uint64_t* s, + ::rocksdb::IODebugContext* dbg) override { + return target_->GetFileSize(f, options, s, dbg); + } + + ::rocksdb::IOStatus GetFileModificationTime( + const std::string& fname, const ::rocksdb::IOOptions& options, + uint64_t* file_mtime, ::rocksdb::IODebugContext* dbg) override { + return target_->GetFileModificationTime(fname, options, file_mtime, + dbg); + } + + ::rocksdb::IOStatus GetAbsolutePath( + const std::string& db_path, const ::rocksdb::IOOptions& options, + std::string* output_path, ::rocksdb::IODebugContext* dbg) override { + return target_->GetAbsolutePath(db_path, options, output_path, dbg); + } + + ::rocksdb::IOStatus RenameFile(const std::string& s, const std::string& t, + const ::rocksdb::IOOptions& options, + ::rocksdb::IODebugContext* dbg) override { + return target_->RenameFile(s, t, options, dbg); + } + + ::rocksdb::IOStatus LinkFile(const std::string& s, const std::string& t, + const ::rocksdb::IOOptions& options, + ::rocksdb::IODebugContext* dbg) override { + return target_->LinkFile(s, t, options, dbg); + } + + ::rocksdb::IOStatus NumFileLinks(const std::string& fname, + const ::rocksdb::IOOptions& options, + uint64_t* count, + ::rocksdb::IODebugContext* dbg) override { + return target_->NumFileLinks(fname, options, count, dbg); + } + + ::rocksdb::IOStatus AreFilesSame(const std::string& first, + const std::string& second, + const ::rocksdb::IOOptions& options, + bool* res, + ::rocksdb::IODebugContext* dbg) override { + return target_->AreFilesSame(first, second, options, res, dbg); + } + + ::rocksdb::IOStatus LockFile(const std::string& f, + const ::rocksdb::IOOptions& options, + ::rocksdb::FileLock** l, + ::rocksdb::IODebugContext* dbg) override { + return target_->LockFile(f, options, l, dbg); + } + + ::rocksdb::IOStatus UnlockFile(::rocksdb::FileLock* l, + const ::rocksdb::IOOptions& 
options, + ::rocksdb::IODebugContext* dbg) override { + return target_->UnlockFile(l, options, dbg); + } + + ::rocksdb::IOStatus GetTestDirectory( + const ::rocksdb::IOOptions& options, std::string* path, + ::rocksdb::IODebugContext* dbg) override { + return target_->GetTestDirectory(options, path, dbg); + } + + ::rocksdb::IOStatus NewLogger(const std::string& fname, + const ::rocksdb::IOOptions& options, + std::shared_ptr<::rocksdb::Logger>* result, + ::rocksdb::IODebugContext* dbg) override { + return target_->NewLogger(fname, options, result, dbg); + } + + void SanitizeFileOptions(::rocksdb::FileOptions* opts) const override { + target_->SanitizeFileOptions(opts); + } + + ::rocksdb::FileOptions OptimizeForLogRead( + const ::rocksdb::FileOptions& file_options) const override { + return target_->OptimizeForLogRead(file_options); + } + + ::rocksdb::FileOptions OptimizeForManifestRead( + const ::rocksdb::FileOptions& file_options) const override { + return target_->OptimizeForManifestRead(file_options); + } + + ::rocksdb::FileOptions OptimizeForLogWrite( + const ::rocksdb::FileOptions& file_options, + const ::rocksdb::DBOptions& db_options) const override { + return target_->OptimizeForLogWrite(file_options, db_options); + } + + ::rocksdb::FileOptions OptimizeForManifestWrite( + const ::rocksdb::FileOptions& file_options) const override { + return target_->OptimizeForManifestWrite(file_options); + } + + ::rocksdb::FileOptions OptimizeForCompactionTableWrite( + const ::rocksdb::FileOptions& file_options, + const ::rocksdb::ImmutableDBOptions& immutable_opts) const override { + return target_->OptimizeForCompactionTableWrite(file_options, + immutable_opts); + } + + ::rocksdb::FileOptions OptimizeForCompactionTableRead( + const ::rocksdb::FileOptions& file_options, + const ::rocksdb::ImmutableDBOptions& db_options) const override { + return target_->OptimizeForCompactionTableRead(file_options, + db_options); + } + + ::rocksdb::FileOptions OptimizeForBlobFileRead( + 
const ::rocksdb::FileOptions& file_options, + const ::rocksdb::ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(file_options, db_options); + } + + ::rocksdb::IOStatus GetFreeSpace(const std::string& path, + const ::rocksdb::IOOptions& options, + uint64_t* diskfree, + ::rocksdb::IODebugContext* dbg) override { + return target_->GetFreeSpace(path, options, diskfree, dbg); + } + + ::rocksdb::IOStatus IsDirectory(const std::string& path, + const ::rocksdb::IOOptions& options, + bool* is_dir, + ::rocksdb::IODebugContext* dbg) override { + return target_->IsDirectory(path, options, is_dir, dbg); + } + + const ::rocksdb::Customizable* Inner() const override { + return target_.get(); + } + + ::rocksdb::Status PrepareOptions( + const ::rocksdb::ConfigOptions& options) override { + return target_->PrepareOptions(options); + } + + std::string SerializeOptions(const ::rocksdb::ConfigOptions& config_options, + const std::string& header) const override { + return ::rocksdb::FileSystem::SerializeOptions(config_options, header); + } + + ::rocksdb::IOStatus Poll(std::vector& io_handles, + size_t min_completions) override { + return target_->Poll(io_handles, min_completions); + } + + ::rocksdb::IOStatus AbortIO(std::vector& io_handles) override { + return target_->AbortIO(io_handles); + } + + void DiscardCacheForDirectory(const std::string& path) override { + target_->DiscardCacheForDirectory(path); + } + + void SupportedOps(int64_t& supported_ops) override { + target_->SupportedOps(supported_ops); + } + + protected: + std::shared_ptr<::rocksdb::FileSystem> target_; +}; + +class DfTracerFileSystem final : public LocalFileSystemWrapper { + public: + explicit DfTracerFileSystem( + const std::shared_ptr<::rocksdb::FileSystem>& target) + : LocalFileSystemWrapper(target), fallback_pool_(4) { + fallback_pool_.start(); + } + + ~DfTracerFileSystem() override { fallback_pool_.stop(); } + + static const char* kClassName() { return 
"DfTracerFileSystem"; } + + const char* Name() const override { return kClassName(); } + + bool IsInstanceOf(const std::string& name) const override { + return name == kClassName() || + LocalFileSystemWrapper::IsInstanceOf(name); + } + + void SupportedOps(int64_t& supported_ops) override { + supported_ops = 0; + supported_ops |= (1 << ::rocksdb::FSSupportedOps::kAsyncIO); + supported_ops |= (1 << ::rocksdb::FSSupportedOps::kFSPrefetch); + } + + ::rocksdb::IOStatus NewSequentialFile( + const std::string& fname, const ::rocksdb::FileOptions&, + std::unique_ptr<::rocksdb::FSSequentialFile>* result, + ::rocksdb::IODebugContext*) override { + int fd = ::open(fname.c_str(), O_RDONLY | O_CLOEXEC); + if (fd < 0) { + return io_error("open", fname); + } + result->reset(new DfTracerSequentialFile(fname, fd)); + return ::rocksdb::IOStatus::OK(); + } + + ::rocksdb::IOStatus NewRandomAccessFile( + const std::string& fname, const ::rocksdb::FileOptions&, + std::unique_ptr<::rocksdb::FSRandomAccessFile>* result, + ::rocksdb::IODebugContext*) override { + int fd = ::open(fname.c_str(), O_RDONLY | O_CLOEXEC); + if (fd < 0) { + return io_error("open", fname); + } + result->reset(new DfTracerRandomAccessFile(this, fname, fd)); + return ::rocksdb::IOStatus::OK(); + } + + ::rocksdb::IOStatus NewWritableFile( + const std::string& fname, const ::rocksdb::FileOptions& file_opts, + std::unique_ptr<::rocksdb::FSWritableFile>* result, + ::rocksdb::IODebugContext*) override { + int fd = + ::open(fname.c_str(), O_CREAT | O_TRUNC | O_RDWR | O_CLOEXEC, 0644); + if (fd < 0) { + return io_error("open", fname); + } + result->reset(new DfTracerWritableFile(fname, fd, file_opts)); + return ::rocksdb::IOStatus::OK(); + } + + ::rocksdb::IOStatus ReopenWritableFile( + const std::string& fname, const ::rocksdb::FileOptions& file_opts, + std::unique_ptr<::rocksdb::FSWritableFile>* result, + ::rocksdb::IODebugContext*) override { + int fd = ::open(fname.c_str(), O_CREAT | O_RDWR | O_CLOEXEC, 0644); + if (fd 
< 0) { + return io_error("open", fname); + } + result->reset(new DfTracerWritableFile(fname, fd, file_opts)); + return ::rocksdb::IOStatus::OK(); + } + + ::rocksdb::IOStatus ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const ::rocksdb::FileOptions& file_opts, + std::unique_ptr<::rocksdb::FSWritableFile>* result, + ::rocksdb::IODebugContext*) override { + ::unlink(fname.c_str()); + if (::rename(old_fname.c_str(), fname.c_str()) != 0) { + return io_error("rename", old_fname); + } + return ReopenWritableFile(fname, file_opts, result, nullptr); + } + + ::rocksdb::IOStatus Poll(std::vector& io_handles, + size_t min_completions) override { + const size_t target = std::min(min_completions, io_handles.size()); + if (target == 0) { + return ::rocksdb::IOStatus::OK(); + } + std::unique_lock lock(completions_mutex_); + completions_cv_.wait(lock, [&] { + size_t completed = 0; + for (void* io_handle : io_handles) { + auto* handle = static_cast(io_handle); + std::lock_guard handle_lock(handle->mutex); + if (handle->finished && !handle->callback_delivered) { + ++completed; + } + } + return completed >= target; + }); + lock.unlock(); + + for (void* io_handle : io_handles) { + auto* handle = static_cast(io_handle); + std::unique_lock handle_lock(handle->mutex); + if (!handle->finished || handle->callback_delivered || + handle->aborted) { + continue; + } + handle->callback_delivered = true; + auto callback = handle->callback; + auto callback_arg = handle->callback_arg; + ::rocksdb::FSReadRequest req; + req.offset = handle->offset; + req.len = handle->len; + req.scratch = handle->scratch; + req.result = handle->result; + req.status = handle->status; + handle_lock.unlock(); + callback(req, callback_arg); + } + return ::rocksdb::IOStatus::OK(); + } + + ::rocksdb::IOStatus AbortIO(std::vector& io_handles) override { + for (void* io_handle : io_handles) { + auto* handle = static_cast(io_handle); + std::lock_guard lock(handle->mutex); + handle->aborted = 
true; + } + + for (void* io_handle : io_handles) { + auto* handle = static_cast(io_handle); + std::unique_lock lock(handle->mutex); + handle->cv.wait(lock, [&] { return handle->finished; }); + handle->callback_delivered = true; + } + + return ::rocksdb::IOStatus::OK(); + } + + void submit_async_read(AsyncReadHandle* handle, int fd, std::string path, + ::rocksdb::IODebugContext* dbg) { + { + std::lock_guard lock(handle->mutex); + handle->running = true; + handle->path = path; + } + if (auto* backend = current_io_backend(); backend != nullptr) { + backend->submit_pread_callback(fd, handle->scratch, handle->len, + static_cast(handle->offset), + &DfTracerFileSystem::on_pread_done, + handle); + return; + } + + fallback_pool_.submit([this, handle, fd, path = std::move(path), dbg] { + ::rocksdb::Slice result; + auto status = read_async_impl(fd, path, handle->offset, handle->len, + &result, handle->scratch, dbg); + complete_async_read(handle, status, result); + }); + } + + static ::rocksdb::IOStatus read_async_impl( + int fd, std::string_view path, std::uint64_t offset, std::size_t n, + ::rocksdb::Slice* result, char* scratch, ::rocksdb::IODebugContext*) { + const ssize_t bytes = + ::pread(fd, scratch, n, static_cast(offset)); + if (bytes < 0) { + return io_error("pread", path); + } + *result = ::rocksdb::Slice(scratch, static_cast(bytes)); + return ::rocksdb::IOStatus::OK(); + } + + static void delete_async_read_handle(void* io_handle) { + delete static_cast(io_handle); + } + + private: + void complete_async_read(AsyncReadHandle* handle, + const ::rocksdb::IOStatus& status, + const ::rocksdb::Slice& result) { + { + std::lock_guard lock(handle->mutex); + handle->result = result; + handle->status = status; + handle->running = false; + handle->finished = true; + } + handle->cv.notify_all(); + + std::lock_guard lock(completions_mutex_); + completions_cv_.notify_all(); + } + + static void on_pread_done(void* context, ssize_t result) noexcept { + auto* handle = 
static_cast(context); + ::rocksdb::IOStatus status = ::rocksdb::IOStatus::OK(); + ::rocksdb::Slice slice; + if (result < 0) { + errno = static_cast(-result); + status = io_error("pread", handle->path); + } else { + slice = ::rocksdb::Slice(handle->scratch, + static_cast(result)); + } + handle->owner->complete_async_read(handle, status, slice); + } + + io::IoThreadPool fallback_pool_; + std::mutex completions_mutex_; + std::condition_variable completions_cv_; +}; + +::rocksdb::IOStatus DfTracerRandomAccessFile::ReadAsync( + ::rocksdb::FSReadRequest& req, const ::rocksdb::IOOptions&, + std::function cb, void* cb_arg, + void** io_handle, ::rocksdb::IOHandleDeleter* del_fn, + ::rocksdb::IODebugContext* dbg) { + auto* handle = new AsyncReadHandle(owner_); + handle->offset = req.offset; + handle->len = req.len; + handle->scratch = req.scratch; + handle->callback = std::move(cb); + handle->callback_arg = cb_arg; + *io_handle = static_cast(handle); + *del_fn = &DfTracerFileSystem::delete_async_read_handle; + owner_->submit_async_read(handle, fd_, path_, dbg); + return ::rocksdb::IOStatus::OK(); +} + +} // namespace + +std::shared_ptr<::rocksdb::FileSystem> make_dftracer_file_system() { + return std::make_shared( + ::rocksdb::FileSystem::Default()); +} + +std::unique_ptr<::rocksdb::Env> make_dftracer_env( + const std::shared_ptr<::rocksdb::FileSystem>& file_system) { + return ::rocksdb::NewCompositeEnv(file_system); +} + +} // namespace dftracer::utils::rocksdb diff --git a/src/dftracer/utils/core/rocksdb/key_codec.cpp b/src/dftracer/utils/core/rocksdb/key_codec.cpp new file mode 100644 index 00000000..15240cdd --- /dev/null +++ b/src/dftracer/utils/core/rocksdb/key_codec.cpp @@ -0,0 +1,88 @@ +#include + +#include + +namespace dftracer::utils::rocksdb { + +namespace { + +template +T decode_big_endian(std::string_view bytes) { + if (bytes.size() != sizeof(T)) { + throw std::invalid_argument( + "KeyCodec: invalid big-endian integer width"); + } + + T value = 0; + for 
(unsigned char byte : bytes) { + value = static_cast((value << 8U) | byte); + } + return value; +} + +} // namespace + +std::string KeyCodec::encode_be32(std::uint32_t value) { + std::string out; + out.reserve(sizeof(value)); + append_be32(out, value); + return out; +} + +std::string KeyCodec::encode_be64(std::uint64_t value) { + std::string out; + out.reserve(sizeof(value)); + append_be64(out, value); + return out; +} + +std::uint32_t KeyCodec::decode_be32(std::string_view bytes) { + return decode_big_endian(bytes); +} + +std::uint64_t KeyCodec::decode_be64(std::string_view bytes) { + return decode_big_endian(bytes); +} + +void KeyCodec::append_be32(std::string& out, std::uint32_t value) { + for (int shift = 24; shift >= 0; shift -= 8) { + out.push_back(static_cast((value >> shift) & 0xFFU)); + } +} + +void KeyCodec::append_be64(std::string& out, std::uint64_t value) { + for (int shift = 56; shift >= 0; shift -= 8) { + out.push_back(static_cast((value >> shift) & 0xFFU)); + } +} + +KeyBuilder& KeyBuilder::append_tag(std::string_view tag) { + key_.append(tag); + return *this; +} + +KeyBuilder& KeyBuilder::append_separator() { + key_.push_back('\0'); + return *this; +} + +KeyBuilder& KeyBuilder::append_string(std::string_view value) { + key_.append(value); + return *this; +} + +KeyBuilder& KeyBuilder::append_be32(std::uint32_t value) { + KeyCodec::append_be32(key_, value); + return *this; +} + +KeyBuilder& KeyBuilder::append_be64(std::uint64_t value) { + KeyCodec::append_be64(key_, value); + return *this; +} + +std::string KeyBuilder::build() const { return key_; } + +void KeyBuilder::clear() { key_.clear(); } + +} // namespace dftracer::utils::rocksdb diff --git a/src/dftracer/utils/core/runtime.cpp b/src/dftracer/utils/core/runtime.cpp index 8199f820..b09c113e 100644 --- a/src/dftracer/utils/core/runtime.cpp +++ b/src/dftracer/utils/core/runtime.cpp @@ -64,8 +64,12 @@ TaskHandle Runtime::submit(coro::CoroTask task, std::string name) { std::shared_ptr> task_id) -> 
coro::Coro { try { co_await std::move(t); + t = coro::CoroTask{ + std::coroutine_handle::promise_type>{}}; exec->mark_coro_completed(task_id->load(std::memory_order_acquire)); } catch (...) { + t = coro::CoroTask{ + std::coroutine_handle::promise_type>{}}; exec->mark_coro_completed(task_id->load(std::memory_order_acquire)); p->set_exception(std::current_exception()); co_return; diff --git a/src/dftracer/utils/core/sqlite/async.cpp b/src/dftracer/utils/core/sqlite/async.cpp deleted file mode 100644 index 04ba7a77..00000000 --- a/src/dftracer/utils/core/sqlite/async.cpp +++ /dev/null @@ -1,32 +0,0 @@ -#include -#include -#include - -namespace dftracer::utils::sqlite { - -io::IoThreadPool *get_sqlite_pool() { - auto *exec = Executor::current(); - if (exec == nullptr) { - return nullptr; - } - return exec->sqlite_pool(); -} - -void sqlite_async_submit(io::IoThreadPool *pool, std::function fn) { - pool->submit(std::move(fn)); -} - -void sqlite_async_resume_on(void *executor, std::coroutine_handle<> h) { - auto *exec = static_cast(executor); - if (exec != nullptr) { - exec->enqueue(h); - } else { - h.resume(); - } -} - -void *get_current_executor_opaque() { - return static_cast(Executor::current()); -} - -} // namespace dftracer::utils::sqlite diff --git a/src/dftracer/utils/core/sqlite/database.cpp b/src/dftracer/utils/core/sqlite/database.cpp deleted file mode 100644 index 0e8d24a7..00000000 --- a/src/dftracer/utils/core/sqlite/database.cpp +++ /dev/null @@ -1,85 +0,0 @@ -#include -#include -#include - -#include - -namespace dftracer::utils::sqlite { - -SqliteDatabase::SqliteDatabase() : db_path_(""), db_(nullptr) {} - -SqliteDatabase::SqliteDatabase(const std::string &db_path) - : db_path_(db_path), db_(nullptr) { - open(db_path); -} - -SqliteDatabase::~SqliteDatabase() { close(); } - -SqliteDatabase::SqliteDatabase(SqliteDatabase &&other) noexcept - : db_path_(std::move(other.db_path_)), db_(other.db_) { - other.db_ = nullptr; -} - -SqliteDatabase 
&SqliteDatabase::operator=(SqliteDatabase &&other) noexcept { - if (this != &other) { - close(); - db_path_ = std::move(other.db_path_); - db_ = other.db_; - other.db_ = nullptr; - } - return *this; -} - -bool SqliteDatabase::open(const std::string &db_path) { - if (is_open()) { - close(); - } - - db_path_ = db_path; - - // Ensure parent directory exists (SQLite cannot create it) - std::error_code ec; - fs::create_directories(fs::path(db_path_).parent_path(), ec); - - if (sqlite3_open(db_path_.c_str(), &db_) != SQLITE_OK) { - throw SqliteError( - SqliteError::Type::OPEN_ERROR, - "Failed to open database: " + std::string(sqlite3_errmsg(db_))); - } - return true; -} - -void SqliteDatabase::close() { - if (db_) { - sqlite3_close(db_); - db_ = nullptr; - } -} - -sqlite3 *SqliteDatabase::get() const { return db_; } - -bool SqliteDatabase::is_open() const { return db_ != nullptr; } - -bool SqliteDatabase::open_with_vfs(const std::string &db_path, - const char *vfs_name) { - if (is_open()) { - close(); - } - db_path_ = db_path; - - std::error_code ec; - fs::create_directories(fs::path(db_path_).parent_path(), ec); - - int rc = - sqlite3_open_v2(db_path_.c_str(), &db_, - SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, vfs_name); - if (rc != SQLITE_OK) { - throw SqliteError(SqliteError::Type::OPEN_ERROR, - "Failed to open database with VFS '" + - std::string(vfs_name) + - "': " + std::string(sqlite3_errmsg(db_))); - } - return true; -} - -} // namespace dftracer::utils::sqlite diff --git a/src/dftracer/utils/core/sqlite/error.cpp b/src/dftracer/utils/core/sqlite/error.cpp deleted file mode 100644 index 9945564e..00000000 --- a/src/dftracer/utils/core/sqlite/error.cpp +++ /dev/null @@ -1,25 +0,0 @@ -#include - -namespace dftracer::utils::sqlite { -std::string SqliteError::format_message(Type type, const std::string &message) { - const char *prefix = ""; - switch (type) { - case DATABASE_ERROR: - prefix = "SQLite database error"; - break; - case STATEMENT_ERROR: - prefix = 
"SQLite statement error"; - break; - case OPEN_ERROR: - prefix = "SQLite open error"; - break; - case VFS_ERROR: - prefix = "SQLite VFS error"; - break; - case UNKNOWN_ERROR: - prefix = "SQLite unknown error"; - break; - } - return std::string(prefix) + ": " + message; -} -} // namespace dftracer::utils::sqlite diff --git a/src/dftracer/utils/core/sqlite/statement.cpp b/src/dftracer/utils/core/sqlite/statement.cpp deleted file mode 100644 index b01c9b92..00000000 --- a/src/dftracer/utils/core/sqlite/statement.cpp +++ /dev/null @@ -1,175 +0,0 @@ -#include -#include -#include - -#include -#include - -namespace dftracer::utils::sqlite { - -SqliteStmt::SqliteStmt(const SqliteDatabase &db, const char *sql) { - sqlite3 *raw_db = db.get(); - if (sqlite3_prepare_v2(raw_db, sql, -1, &stmt_, nullptr) != SQLITE_OK) { - stmt_ = nullptr; - throw SqliteError(SqliteError::Type::STATEMENT_ERROR, - "Failed to prepare SQL statement: " + - std::string(sqlite3_errmsg(raw_db))); - } -} - -SqliteStmt::SqliteStmt(sqlite3 *db, const char *sql) { - if (sqlite3_prepare_v2(db, sql, -1, &stmt_, nullptr) != SQLITE_OK) { - stmt_ = nullptr; - throw SqliteError(SqliteError::Type::STATEMENT_ERROR, - "Failed to prepare SQL statement: " + - std::string(sqlite3_errmsg(db))); - } -} - -SqliteStmt::~SqliteStmt() { - if (stmt_) { - sqlite3_finalize(stmt_); - } -} - -SqliteStmt::operator sqlite3_stmt *() { return stmt_; } - -sqlite3_stmt *SqliteStmt::get() { return stmt_; } - -void SqliteStmt::reset() { sqlite3_reset(stmt_); } - -void SqliteStmt::bind_int(int index, int value) { - validate_parameter_index(index); - int rc = sqlite3_bind_int(stmt_, index, value); - if (rc != SQLITE_OK) { - throw SqliteError( - SqliteError::Type::STATEMENT_ERROR, - "Failed to bind int parameter at index " + std::to_string(index)); - } -} - -void SqliteStmt::bind_int64(int index, int64_t value) { - validate_parameter_index(index); - int rc = sqlite3_bind_int64(stmt_, index, value); - if (rc != SQLITE_OK) { - throw 
SqliteError( - SqliteError::Type::STATEMENT_ERROR, - "Failed to bind int64 parameter at index " + std::to_string(index)); - } -} - -void SqliteStmt::bind_double(int index, double value) { - validate_parameter_index(index); - int rc = sqlite3_bind_double(stmt_, index, value); - if (rc != SQLITE_OK) { - throw SqliteError(SqliteError::Type::STATEMENT_ERROR, - "Failed to bind double parameter at index " + - std::to_string(index)); - } -} - -void SqliteStmt::bind_text(int index, const std::string &text) { - validate_parameter_index(index); - int rc = - sqlite3_bind_text(stmt_, index, text.c_str(), - static_cast(text.length()), SQLITE_TRANSIENT); - if (rc != SQLITE_OK) { - throw SqliteError( - SqliteError::Type::STATEMENT_ERROR, - "Failed to bind text parameter at index " + std::to_string(index)); - } -} - -void SqliteStmt::bind_text(int index, std::string_view text) { - validate_parameter_index(index); - int rc = sqlite3_bind_text(stmt_, index, text.data(), - static_cast(text.size()), SQLITE_TRANSIENT); - if (rc != SQLITE_OK) { - throw SqliteError( - SqliteError::Type::STATEMENT_ERROR, - "Failed to bind text parameter at index " + std::to_string(index)); - } -} - -void SqliteStmt::bind_text(int index, const char *text, int length, - void (*destructor)(void *)) { - validate_parameter_index(index); - int rc = sqlite3_bind_text(stmt_, index, text, length, destructor); - if (rc != SQLITE_OK) { - throw SqliteError( - SqliteError::Type::STATEMENT_ERROR, - "Failed to bind text parameter at index " + std::to_string(index)); - } -} - -void SqliteStmt::bind_blob(int index, const void *blob, int length) { - validate_parameter_index(index); - int rc = sqlite3_bind_blob(stmt_, index, blob, length, SQLITE_TRANSIENT); - if (rc != SQLITE_OK) { - throw SqliteError( - SqliteError::Type::STATEMENT_ERROR, - "Failed to bind blob parameter at index " + std::to_string(index)); - } -} - -void SqliteStmt::bind_blob(int index, std::span data) { - bind_blob(index, data.data(), 
static_cast(data.size())); -} - -void SqliteStmt::bind_blob(int index, std::span data) { - bind_blob(index, data.data(), static_cast(data.size())); -} - -void SqliteStmt::bind_blob_static(int index, const void *blob, int length) { - validate_parameter_index(index); - int rc = sqlite3_bind_blob(stmt_, index, blob, length, SQLITE_STATIC); - if (rc != SQLITE_OK) { - throw SqliteError( - SqliteError::Type::STATEMENT_ERROR, - "Failed to bind blob parameter at index " + std::to_string(index)); - } -} - -void SqliteStmt::bind_text_static(int index, std::string_view text) { - validate_parameter_index(index); - int rc = sqlite3_bind_text(stmt_, index, text.data(), - static_cast(text.size()), SQLITE_STATIC); - if (rc != SQLITE_OK) { - throw SqliteError( - SqliteError::Type::STATEMENT_ERROR, - "Failed to bind text parameter at index " + std::to_string(index)); - } -} - -void SqliteStmt::bind_null(int index) { - validate_parameter_index(index); - int rc = sqlite3_bind_null(stmt_, index); - if (rc != SQLITE_OK) { - throw SqliteError( - SqliteError::Type::STATEMENT_ERROR, - "Failed to bind null parameter at index " + std::to_string(index)); - } -} - -void SqliteStmt::clear_bindings() { sqlite3_clear_bindings(stmt_); } - -int SqliteStmt::bind_parameter_count() { - return sqlite3_bind_parameter_count(stmt_); -} - -void SqliteStmt::validate_parameter_index(int index) { - if (index < 1) { - throw SqliteError( - SqliteError::Type::STATEMENT_ERROR, - "Parameter index must be >= 1 (got " + std::to_string(index) + ")"); - } - int param_count = sqlite3_bind_parameter_count(stmt_); - if (index > param_count) { - throw SqliteError(SqliteError::Type::STATEMENT_ERROR, - "Parameter index " + std::to_string(index) + - " exceeds parameter count " + - std::to_string(param_count)); - } -} - -} // namespace dftracer::utils::sqlite diff --git a/src/dftracer/utils/core/sqlite/vfs.cpp b/src/dftracer/utils/core/sqlite/vfs.cpp deleted file mode 100644 index 40fc122d..00000000 --- 
a/src/dftracer/utils/core/sqlite/vfs.cpp +++ /dev/null @@ -1,620 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace dftracer::utils::sqlite { - -// Forward declarations of all VFS methods -static int dftracer_sqlite_vfs_open(sqlite3_vfs *pVfs, const char *zName, - sqlite3_file *pFile, int flags, - int *pOutFlags); -static int dftracer_sqlite_vfs_delete(sqlite3_vfs *pVfs, const char *zPath, - int dirSync); -static int dftracer_sqlite_vfs_access(sqlite3_vfs *pVfs, const char *zPath, - int flags, int *pResOut); -static int dftracer_sqlite_vfs_fullpathname(sqlite3_vfs *pVfs, - const char *zName, int nOut, - char *zOut); -static int dftracer_sqlite_vfs_get_last_error(sqlite3_vfs *pVfs, int nBuf, - char *zBuf); - -// Forward declarations of all io_methods -static int dftracer_sqlite_vfs_close(sqlite3_file *pFile); -static int dftracer_sqlite_vfs_read(sqlite3_file *pFile, void *buf, int amt, - sqlite3_int64 offset); -static int dftracer_sqlite_vfs_write(sqlite3_file *pFile, const void *buf, - int amt, sqlite3_int64 offset); -static int dftracer_sqlite_vfs_truncate(sqlite3_file *pFile, - sqlite3_int64 size); -static int dftracer_sqlite_vfs_sync(sqlite3_file *pFile, int flags); -static int dftracer_sqlite_vfs_file_size(sqlite3_file *pFile, - sqlite3_int64 *pSize); -static int dftracer_sqlite_vfs_lock(sqlite3_file *pFile, int eLock); -static int dftracer_sqlite_vfs_unlock(sqlite3_file *pFile, int eLock); -static int dftracer_sqlite_vfs_check_reserved_lock(sqlite3_file *pFile, - int *pResOut); -static int dftracer_sqlite_vfs_file_control(sqlite3_file *pFile, int op, - void *pArg); -static int dftracer_sqlite_vfs_sector_size(sqlite3_file *pFile); -static int dftracer_sqlite_vfs_device_characteristics(sqlite3_file *pFile); -static int dftracer_sqlite_vfs_shm_map(sqlite3_file *pFile, int iRegion, - int szRegion, int bExtend, - void volatile **pp); -static int 
dftracer_sqlite_vfs_shm_lock(sqlite3_file *pFile, int offset, int n, - int flags); -static void dftracer_sqlite_vfs_shm_barrier(sqlite3_file *pFile); -static int dftracer_sqlite_vfs_shm_unmap(sqlite3_file *pFile, int deleteFlag); -static int dftracer_sqlite_vfs_fetch(sqlite3_file *pFile, sqlite3_int64 offset, - int amt, void **pp); -static int dftracer_sqlite_vfs_unfetch(sqlite3_file *pFile, - sqlite3_int64 offset, void *p); - -// Static io_methods struct (iVersion=3 for WAL + mmap) -static sqlite3_io_methods dftracer_sqlite_vfs_io_methods = { - 3, // iVersion - dftracer_sqlite_vfs_close, // xClose - dftracer_sqlite_vfs_read, // xRead - dftracer_sqlite_vfs_write, // xWrite - dftracer_sqlite_vfs_truncate, // xTruncate - dftracer_sqlite_vfs_sync, // xSync - dftracer_sqlite_vfs_file_size, // xFileSize - dftracer_sqlite_vfs_lock, // xLock - dftracer_sqlite_vfs_unlock, // xUnlock - dftracer_sqlite_vfs_check_reserved_lock, // xCheckReservedLock - dftracer_sqlite_vfs_file_control, // xFileControl - dftracer_sqlite_vfs_sector_size, // xSectorSize - dftracer_sqlite_vfs_device_characteristics, // xDeviceCharacteristics - dftracer_sqlite_vfs_shm_map, // xShmMap - dftracer_sqlite_vfs_shm_lock, // xShmLock - dftracer_sqlite_vfs_shm_barrier, // xShmBarrier - dftracer_sqlite_vfs_shm_unmap, // xShmUnmap - dftracer_sqlite_vfs_fetch, // xFetch - dftracer_sqlite_vfs_unfetch, // xUnfetch -}; - -// Static VFS instance and app data -static sqlite3_vfs dftracer_vfs_instance; -static DfTracerSqliteVfsAppData *dftracer_vfs_app_data = nullptr; -static bool dftracer_vfs_registered = false; - -// ============================================================================ -// sqlite3_io_methods implementations -// ============================================================================ - -static int dftracer_sqlite_vfs_close(sqlite3_file *pFile) { - auto *vf = reinterpret_cast(pFile); - - // Clean up SHM resources - for (int i = 0; i < vf->n_shm_region; ++i) { - if (vf->shm_regions[i] != 
nullptr) { - ::munmap(vf->shm_regions[i], 32768); - vf->shm_regions[i] = nullptr; - } - } - if (vf->shm_fd >= 0) { - ::close(vf->shm_fd); - vf->shm_fd = -1; - } - - if (vf->fd >= 0) { - ::close(vf->fd); - vf->fd = -1; - } - - vf->path[0] = '\0'; - - return SQLITE_OK; -} - -static int dftracer_sqlite_vfs_read(sqlite3_file *pFile, void *buf, int amt, - sqlite3_int64 offset) { - auto *vf = reinterpret_cast(pFile); - - if (vf->backend == nullptr) { - ssize_t n = ::pread(vf->fd, buf, static_cast(amt), - static_cast(offset)); - if (n == amt) return SQLITE_OK; - if (n >= 0) { - std::memset(static_cast(buf) + n, 0, amt - n); - return SQLITE_IOERR_SHORT_READ; - } - return SQLITE_IOERR_READ; - } - - ssize_t result = vf->backend->submit_read_sync( - vf->fd, buf, static_cast(amt), static_cast(offset)); - - if (result == amt) return SQLITE_OK; - if (result >= 0) { - std::memset(static_cast(buf) + result, 0, amt - result); - return SQLITE_IOERR_SHORT_READ; - } - return SQLITE_IOERR_READ; -} - -static int dftracer_sqlite_vfs_write(sqlite3_file *pFile, const void *buf, - int amt, sqlite3_int64 offset) { - auto *vf = reinterpret_cast(pFile); - - if (vf->backend == nullptr) { - ssize_t n = ::pwrite(vf->fd, buf, static_cast(amt), - static_cast(offset)); - if (n == amt) return SQLITE_OK; - return SQLITE_IOERR_WRITE; - } - - ssize_t result = vf->backend->submit_write_sync( - vf->fd, buf, static_cast(amt), static_cast(offset)); - - if (result == amt) return SQLITE_OK; - return SQLITE_IOERR_WRITE; -} - -static int dftracer_sqlite_vfs_truncate(sqlite3_file *pFile, - sqlite3_int64 size) { - auto *vf = reinterpret_cast(pFile); - - if (vf->backend == nullptr) { - if (::ftruncate(vf->fd, static_cast(size)) != 0) { - return SQLITE_IOERR_TRUNCATE; - } - return SQLITE_OK; - } - - int rc = - vf->backend->submit_ftruncate_sync(vf->fd, static_cast(size)); - if (rc != 0) return SQLITE_IOERR_TRUNCATE; - return SQLITE_OK; -} - -static int dftracer_sqlite_vfs_sync(sqlite3_file *pFile, int /*flags*/) { 
- auto *vf = reinterpret_cast(pFile); - - if (vf->backend == nullptr) { - if (::fsync(vf->fd) != 0) return SQLITE_IOERR_FSYNC; - return SQLITE_OK; - } - - int rc = vf->backend->submit_fsync_sync(vf->fd); - if (rc != 0) return SQLITE_IOERR_FSYNC; - return SQLITE_OK; -} - -static int dftracer_sqlite_vfs_file_size(sqlite3_file *pFile, - sqlite3_int64 *pSize) { - auto *vf = reinterpret_cast(pFile); - struct stat st; - - if (vf->backend == nullptr) { - if (::fstat(vf->fd, &st) != 0) return SQLITE_IOERR_FSTAT; - *pSize = st.st_size; - return SQLITE_OK; - } - - int rc = vf->backend->submit_fstat_sync(vf->fd, &st); - if (rc != 0) return SQLITE_IOERR_FSTAT; - *pSize = st.st_size; - return SQLITE_OK; -} - -static int dftracer_sqlite_vfs_lock(sqlite3_file *pFile, int eLock) { - auto *vf = reinterpret_cast(pFile); - - struct flock fl; - std::memset(&fl, 0, sizeof(fl)); - - if (eLock == SQLITE_LOCK_NONE) { - return SQLITE_OK; - } - - if (eLock == SQLITE_LOCK_SHARED) { - fl.l_type = F_RDLCK; - } else { - fl.l_type = F_WRLCK; - } - fl.l_whence = SEEK_SET; - fl.l_start = 0; - fl.l_len = 0; - - if (::fcntl(vf->fd, F_SETLK, &fl) == -1) { - if (errno == EACCES || errno == EAGAIN) { - return SQLITE_BUSY; - } - return SQLITE_IOERR_LOCK; - } - return SQLITE_OK; -} - -static int dftracer_sqlite_vfs_unlock(sqlite3_file *pFile, int /*eLock*/) { - auto *vf = reinterpret_cast(pFile); - - struct flock fl; - std::memset(&fl, 0, sizeof(fl)); - fl.l_type = F_UNLCK; - fl.l_whence = SEEK_SET; - fl.l_start = 0; - fl.l_len = 0; - - if (::fcntl(vf->fd, F_SETLK, &fl) == -1) { - return SQLITE_IOERR_UNLOCK; - } - return SQLITE_OK; -} - -static int dftracer_sqlite_vfs_check_reserved_lock(sqlite3_file *pFile, - int *pResOut) { - auto *vf = reinterpret_cast(pFile); - - struct flock fl; - std::memset(&fl, 0, sizeof(fl)); - fl.l_type = F_WRLCK; - fl.l_whence = SEEK_SET; - fl.l_start = 0; - fl.l_len = 1; - - if (::fcntl(vf->fd, F_GETLK, &fl) == -1) { - *pResOut = 0; - return SQLITE_IOERR_CHECKRESERVEDLOCK; - 
} - - *pResOut = (fl.l_type != F_UNLCK) ? 1 : 0; - return SQLITE_OK; -} - -static int dftracer_sqlite_vfs_file_control(sqlite3_file * /*pFile*/, int op, - void * /*pArg*/) { - if (op == SQLITE_FCNTL_LOCKSTATE) { - return SQLITE_OK; - } - return SQLITE_NOTFOUND; -} - -static int dftracer_sqlite_vfs_sector_size(sqlite3_file * /*pFile*/) { - return 4096; -} - -static int dftracer_sqlite_vfs_device_characteristics( - sqlite3_file * /*pFile*/) { - return SQLITE_IOCAP_ATOMIC512 | SQLITE_IOCAP_SAFE_APPEND; -} - -// ============================================================================ -// SHM methods (WAL shared memory) -// ============================================================================ - -static int dftracer_sqlite_vfs_shm_map(sqlite3_file *pFile, int iRegion, - int szRegion, int bExtend, - void volatile **pp) { - auto *vf = reinterpret_cast(pFile); - - if (iRegion >= 32) { - *pp = nullptr; - return SQLITE_IOERR; - } - - // Open SHM file if not yet opened - if (vf->shm_fd < 0) { - char shm_path[1024]; - std::snprintf(shm_path, sizeof(shm_path), "%s-shm", vf->path); - int oflags = O_RDWR | O_CREAT; - vf->shm_fd = ::open(shm_path, oflags, 0644); - if (vf->shm_fd < 0) { - *pp = nullptr; - return SQLITE_IOERR; - } - } - - // Extend file if needed - off_t required_size = - static_cast(iRegion + 1) * static_cast(szRegion); - struct stat st; - if (::fstat(vf->shm_fd, &st) != 0) { - *pp = nullptr; - return SQLITE_IOERR; - } - if (st.st_size < required_size) { - if (!bExtend) { - *pp = nullptr; - return SQLITE_OK; - } - if (::ftruncate(vf->shm_fd, required_size) != 0) { - *pp = nullptr; - return SQLITE_IOERR; - } - } - - // Map the region if not already mapped - if (iRegion >= vf->n_shm_region || vf->shm_regions[iRegion] == nullptr) { - off_t map_offset = - static_cast(iRegion) * static_cast(szRegion); - void *mapped = - ::mmap(nullptr, static_cast(szRegion), - PROT_READ | PROT_WRITE, MAP_SHARED, vf->shm_fd, map_offset); - if (mapped == MAP_FAILED) { - *pp = 
nullptr; - return SQLITE_IOERR; - } - vf->shm_regions[iRegion] = mapped; - if (iRegion >= vf->n_shm_region) { - vf->n_shm_region = iRegion + 1; - } - } - - *pp = vf->shm_regions[iRegion]; - return SQLITE_OK; -} - -static int dftracer_sqlite_vfs_shm_lock(sqlite3_file *pFile, int offset, int n, - int flags) { - auto *vf = reinterpret_cast(pFile); - - if (vf->shm_fd < 0) return SQLITE_IOERR; - - struct flock fl; - std::memset(&fl, 0, sizeof(fl)); - - if (flags & SQLITE_SHM_UNLOCK) { - fl.l_type = F_UNLCK; - } else if (flags & SQLITE_SHM_EXCLUSIVE) { - fl.l_type = F_WRLCK; - } else { - fl.l_type = F_RDLCK; - } - fl.l_whence = SEEK_SET; - fl.l_start = offset; - fl.l_len = n; - - if (::fcntl(vf->shm_fd, F_SETLK, &fl) == -1) { - if (errno == EACCES || errno == EAGAIN) { - return SQLITE_BUSY; - } - return SQLITE_IOERR; - } - return SQLITE_OK; -} - -static void dftracer_sqlite_vfs_shm_barrier(sqlite3_file * /*pFile*/) { - __atomic_thread_fence(__ATOMIC_SEQ_CST); -} - -static int dftracer_sqlite_vfs_shm_unmap(sqlite3_file *pFile, int deleteFlag) { - auto *vf = reinterpret_cast(pFile); - - for (int i = 0; i < vf->n_shm_region; ++i) { - if (vf->shm_regions[i] != nullptr) { - ::munmap(vf->shm_regions[i], 32768); - vf->shm_regions[i] = nullptr; - } - } - vf->n_shm_region = 0; - - if (vf->shm_fd >= 0) { - ::close(vf->shm_fd); - if (deleteFlag) { - char shm_path[1024]; - std::snprintf(shm_path, sizeof(shm_path), "%s-shm", vf->path); - ::unlink(shm_path); - } - vf->shm_fd = -1; - } - - return SQLITE_OK; -} - -// ============================================================================ -// mmap methods (version 3) -// ============================================================================ - -static int dftracer_sqlite_vfs_fetch(sqlite3_file * /*pFile*/, - sqlite3_int64 /*offset*/, int /*amt*/, - void **pp) { - // Disable mmap — returning nullptr tells SQLite to use - // xRead instead. This avoids tracking mmap sizes for munmap. 
- *pp = nullptr; - return SQLITE_OK; -} - -static int dftracer_sqlite_vfs_unfetch(sqlite3_file * /*pFile*/, - sqlite3_int64 /*offset*/, void *p) { - (void)p; - return SQLITE_OK; -} - -// ============================================================================ -// sqlite3_vfs implementations -// ============================================================================ - -static int dftracer_sqlite_vfs_open(sqlite3_vfs *pVfs, const char *zName, - sqlite3_file *pFile, int flags, - int *pOutFlags) { - auto *app = static_cast(pVfs->pAppData); - auto *vf = reinterpret_cast(pFile); - - // Zero the sqlite3_file base (C struct, safe to memset) - std::memset(&vf->base, 0, sizeof(vf->base)); - vf->backend = nullptr; - vf->executor = nullptr; - vf->fd = -1; - vf->read_only = false; - vf->shm_fd = -1; - vf->n_shm_region = 0; - for (int i = 0; i < 32; ++i) { - vf->shm_regions[i] = nullptr; - } - - std::snprintf(vf->path, sizeof(vf->path), "%s", zName ? zName : ""); - - vf->backend = app ? app->backend : nullptr; - vf->executor = app ? 
app->executor : nullptr; - vf->read_only = (flags & SQLITE_OPEN_READONLY) != 0; - - // Build open flags - int oflags = 0; - if (flags & SQLITE_OPEN_EXCLUSIVE) { - oflags |= O_EXCL; - } - if (flags & SQLITE_OPEN_CREATE) { - oflags |= O_CREAT; - } - if (flags & SQLITE_OPEN_READONLY) { - oflags = O_RDONLY; - } else if (flags & SQLITE_OPEN_READWRITE) { - oflags |= O_RDWR; - } - - // Handle temp/journal files without a name - if (zName == nullptr) { - const char *tmpdir = std::getenv("TMPDIR"); - if (!tmpdir) tmpdir = "/tmp"; - std::snprintf(vf->path, sizeof(vf->path), "%s/dftracer_sqlite_XXXXXX", - tmpdir); - vf->fd = ::mkstemp(vf->path); - if (vf->fd < 0) { - return SQLITE_CANTOPEN; - } - ::unlink(vf->path); - } else { - vf->fd = ::open(zName, oflags, 0644); - if (vf->fd < 0) { - return SQLITE_CANTOPEN; - } - } - - if (pOutFlags != nullptr) { - *pOutFlags = flags; - } - - pFile->pMethods = &dftracer_sqlite_vfs_io_methods; - return SQLITE_OK; -} - -static int dftracer_sqlite_vfs_delete(sqlite3_vfs * /*pVfs*/, const char *zPath, - int dirSync) { - if (::unlink(zPath) != 0) { - if (errno == ENOENT) return SQLITE_OK; - return SQLITE_IOERR_DELETE; - } - - if (dirSync) { - // Sync the parent directory - char dir[1024]; - std::snprintf(dir, sizeof(dir), "%s", zPath); - char *slash = std::strrchr(dir, '/'); - if (slash) { - if (slash == dir) { - dir[1] = '\0'; // root "/" - } else { - *slash = '\0'; - } - } else { - dir[0] = '.'; - dir[1] = '\0'; - } - int dfd = ::open(dir, O_RDONLY); - if (dfd >= 0) { - ::fsync(dfd); - ::close(dfd); - } - } - - return SQLITE_OK; -} - -static int dftracer_sqlite_vfs_access(sqlite3_vfs * /*pVfs*/, const char *zPath, - int flags, int *pResOut) { - int mode = F_OK; - if (flags == SQLITE_ACCESS_READWRITE) { - mode = R_OK | W_OK; - } else if (flags == SQLITE_ACCESS_READ) { - mode = R_OK; - } - - *pResOut = (::access(zPath, mode) == 0) ? 
1 : 0; - return SQLITE_OK; -} - -static int dftracer_sqlite_vfs_fullpathname(sqlite3_vfs * /*pVfs*/, - const char *zName, int nOut, - char *zOut) { - char *resolved = ::realpath(zName, nullptr); - if (resolved != nullptr) { - std::strncpy(zOut, resolved, static_cast(nOut)); - zOut[nOut - 1] = '\0'; - ::free(resolved); - } else { - // If realpath fails (file doesn't exist yet), copy as-is - std::strncpy(zOut, zName, static_cast(nOut)); - zOut[nOut - 1] = '\0'; - } - return SQLITE_OK; -} - -static int dftracer_sqlite_vfs_get_last_error(sqlite3_vfs * /*pVfs*/, int nBuf, - char *zBuf) { - if (nBuf > 0 && zBuf != nullptr) { - std::strncpy(zBuf, std::strerror(errno), - static_cast(nBuf)); - zBuf[nBuf - 1] = '\0'; - } - return errno; -} - -// ============================================================================ -// VFS Registration -// ============================================================================ - -void register_dftracer_sqlite_vfs(io::IoBackend *backend, Executor *executor) { - if (dftracer_vfs_registered) return; - - sqlite3_vfs *default_vfs = sqlite3_vfs_find(nullptr); - - dftracer_vfs_app_data = new DfTracerSqliteVfsAppData{backend, executor}; - - std::memset(&dftracer_vfs_instance, 0, sizeof(dftracer_vfs_instance)); - dftracer_vfs_instance.iVersion = 3; - dftracer_vfs_instance.szOsFile = - static_cast(sizeof(DfTracerSqliteVfsFile)); - dftracer_vfs_instance.mxPathname = VFS_MAX_PATHNAME; - dftracer_vfs_instance.pNext = nullptr; - dftracer_vfs_instance.zName = "dftracer_sqlite"; - dftracer_vfs_instance.pAppData = dftracer_vfs_app_data; - dftracer_vfs_instance.xOpen = dftracer_sqlite_vfs_open; - dftracer_vfs_instance.xDelete = dftracer_sqlite_vfs_delete; - dftracer_vfs_instance.xAccess = dftracer_sqlite_vfs_access; - dftracer_vfs_instance.xFullPathname = dftracer_sqlite_vfs_fullpathname; - dftracer_vfs_instance.xGetLastError = dftracer_sqlite_vfs_get_last_error; - - // Delegate time/random/sleep to default VFS - if (default_vfs != nullptr) { - 
dftracer_vfs_instance.xRandomness = default_vfs->xRandomness; - dftracer_vfs_instance.xSleep = default_vfs->xSleep; - dftracer_vfs_instance.xCurrentTime = default_vfs->xCurrentTime; - dftracer_vfs_instance.xCurrentTimeInt64 = - default_vfs->xCurrentTimeInt64; - } - - sqlite3_vfs_register(&dftracer_vfs_instance, 0); - dftracer_vfs_registered = true; -} - -void unregister_dftracer_sqlite_vfs() { - if (!dftracer_vfs_registered) return; - - sqlite3_vfs_unregister(&dftracer_vfs_instance); - - delete dftracer_vfs_app_data; - dftracer_vfs_app_data = nullptr; - dftracer_vfs_registered = false; -} - -} // namespace dftracer::utils::sqlite diff --git a/src/dftracer/utils/python/indexer.cpp b/src/dftracer/utils/python/indexer.cpp index c82a3525..071a6986 100644 --- a/src/dftracer/utils/python/indexer.cpp +++ b/src/dftracer/utils/python/indexer.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -11,14 +12,27 @@ static void Indexer_dealloc(IndexerObject *self) { if (self->handle) { + // The Python wrapper owns only the native indexer handle. The + // underlying RocksDB instance remains manager-owned and may continue to + // live process-wide for the same .dftindex path. dft_indexer_destroy(self->handle); + self->handle = NULL; } Py_XDECREF(self->gz_path); - Py_XDECREF(self->idx_path); + Py_XDECREF(self->index_path); Py_XDECREF(self->runtime_obj); Py_TYPE(self)->tp_free((PyObject *)self); } +static void Indexer_release_handle(IndexerObject *self) { + if (self->handle) { + // Releasing the handle drops this wrapper's native indexer state only. + // Shared RocksDB lifetime is managed separately by RocksDBManager. 
+ dft_indexer_destroy(self->handle); + self->handle = NULL; + } +} + static PyObject *Indexer_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { IndexerObject *self; @@ -26,7 +40,7 @@ static PyObject *Indexer_new(PyTypeObject *type, PyObject *args, if (self != NULL) { self->handle = NULL; self->gz_path = NULL; - self->idx_path = NULL; + self->index_path = NULL; self->checkpoint_size = 0; self->build_bloom = 0; self->build_manifest = 0; @@ -39,11 +53,11 @@ static PyObject *Indexer_new(PyTypeObject *type, PyObject *args, static int Indexer_init(IndexerObject *self, PyObject *args, PyObject *kwds) { static const char *kwlist[] = { - "gz_path", "idx_path", "checkpoint_size", + "gz_path", "index_path", "checkpoint_size", "force_rebuild", "build_bloom", "build_manifest", "index_threshold", "runtime", NULL}; const char *gz_path; - const char *idx_path = NULL; + const char *index_path = NULL; std::uint64_t checkpoint_size = dftracer::utils::constants::indexer::DEFAULT_CHECKPOINT_SIZE; int force_rebuild = 0; @@ -54,7 +68,7 @@ static int Indexer_init(IndexerObject *self, PyObject *args, PyObject *kwds) { PyObject *runtime_arg = NULL; if (!PyArg_ParseTupleAndKeywords( - args, kwds, "s|snpppnO", (char **)kwlist, &gz_path, &idx_path, + args, kwds, "s|snpppnO", (char **)kwlist, &gz_path, &index_path, &checkpoint_size, &force_rebuild, &build_bloom, &build_manifest, &index_threshold, &runtime_arg)) { return -1; @@ -82,15 +96,15 @@ static int Indexer_init(IndexerObject *self, PyObject *args, PyObject *kwds) { return -1; } - if (idx_path) { - self->idx_path = PyUnicode_FromString(idx_path); + if (index_path) { + self->index_path = PyUnicode_FromString(index_path); } else { - PyObject *gz_path_obj = PyUnicode_FromString(gz_path); - self->idx_path = PyUnicode_FromFormat("%U.idx", gz_path_obj); - Py_DECREF(gz_path_obj); + const std::string index_path = dftracer::utils::utilities::composites:: + dft::internal::determine_index_path(gz_path, ""); + self->index_path = 
PyUnicode_FromString(index_path.c_str()); } - if (!self->idx_path) { + if (!self->index_path) { Py_DECREF(self->gz_path); return -1; } @@ -100,12 +114,12 @@ static int Indexer_init(IndexerObject *self, PyObject *args, PyObject *kwds) { self->build_manifest = build_manifest; self->index_threshold = index_threshold; - const char *idx_path_str = PyUnicode_AsUTF8(self->idx_path); - if (!idx_path_str) { + const char *index_path_str = PyUnicode_AsUTF8(self->index_path); + if (!index_path_str) { return -1; } - self->handle = dft_indexer_create(gz_path, idx_path_str, checkpoint_size, + self->handle = dft_indexer_create(gz_path, index_path_str, checkpoint_size, force_rebuild); if (!self->handle) { PyErr_SetString(PyExc_RuntimeError, "Failed to create indexer"); @@ -133,7 +147,7 @@ static PyObject *Indexer_build(IndexerObject *self, using namespace dftracer::utils::utilities::indexer; const char *gz = PyUnicode_AsUTF8(self->gz_path); - const char *idx = PyUnicode_AsUTF8(self->idx_path); + const char *idx = PyUnicode_AsUTF8(self->index_path); if (!gz || !idx) { return NULL; } @@ -296,7 +310,7 @@ static PyObject *Indexer_get_checkpoints(IndexerObject *self, } static PyObject *Indexer_has_bloom(IndexerObject *self, void *closure) { - const char *idx = PyUnicode_AsUTF8(self->idx_path); + const char *idx = PyUnicode_AsUTF8(self->index_path); const char *gz = PyUnicode_AsUTF8(self->gz_path); if (!idx || !gz) { Py_RETURN_FALSE; @@ -304,7 +318,8 @@ static PyObject *Indexer_has_bloom(IndexerObject *self, void *closure) { try { using namespace dftracer::utils::utilities::indexer; using namespace dftracer::utils::utilities::indexer::internal; - IndexDatabase db(idx); + IndexDatabase db( + idx, dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); std::string logical = get_logical_path(gz); int fid = db.get_file_info_id(logical); if (fid >= 0 && db.has_bloom_data(fid)) { @@ -316,7 +331,7 @@ static PyObject *Indexer_has_bloom(IndexerObject *self, void *closure) { } static 
PyObject *Indexer_has_manifest(IndexerObject *self, void *closure) { - const char *idx = PyUnicode_AsUTF8(self->idx_path); + const char *idx = PyUnicode_AsUTF8(self->index_path); const char *gz = PyUnicode_AsUTF8(self->gz_path); if (!idx || !gz) { Py_RETURN_FALSE; @@ -324,7 +339,8 @@ static PyObject *Indexer_has_manifest(IndexerObject *self, void *closure) { try { using namespace dftracer::utils::utilities::indexer; using namespace dftracer::utils::utilities::indexer::internal; - IndexDatabase db(idx); + IndexDatabase db( + idx, dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); std::string logical = get_logical_path(gz); int fid = db.get_file_info_id(logical); if (fid >= 0 && db.has_manifest_data(fid)) { @@ -340,9 +356,9 @@ static PyObject *Indexer_gz_path(IndexerObject *self, void *closure) { return self->gz_path; } -static PyObject *Indexer_idx_path(IndexerObject *self, void *closure) { - Py_INCREF(self->idx_path); - return self->idx_path; +static PyObject *Indexer_index_path(IndexerObject *self, void *closure) { + Py_INCREF(self->index_path); + return self->index_path; } static PyObject *Indexer_checkpoint_size(IndexerObject *self, void *closure) { @@ -355,7 +371,14 @@ static PyObject *Indexer_enter(IndexerObject *self, return (PyObject *)self; } +static PyObject *Indexer_close(IndexerObject *self, + PyObject *Py_UNUSED(ignored)) { + Indexer_release_handle(self); + Py_RETURN_NONE; +} + static PyObject *Indexer_exit(IndexerObject *self, PyObject *args) { + Indexer_release_handle(self); Py_RETURN_NONE; } @@ -368,7 +391,7 @@ static PyMethodDef Indexer_methods[] = { {"need_rebuild", (PyCFunction)Indexer_need_rebuild, METH_NOARGS, "Check if a rebuild is needed."}, {"exists", (PyCFunction)Indexer_exists, METH_NOARGS, - "Check if the index file exists."}, + "Check if the .dftindex store exists."}, {"get_max_bytes", (PyCFunction)Indexer_get_max_bytes, METH_NOARGS, "Get the maximum uncompressed bytes in the indexed file."}, {"get_num_lines", 
(PyCFunction)Indexer_get_num_lines, METH_NOARGS, @@ -380,17 +403,25 @@ static PyMethodDef Indexer_methods[] = { " offset (int): Uncompressed byte offset.\n"}, {"get_checkpoints", (PyCFunction)Indexer_get_checkpoints, METH_NOARGS, "Get all checkpoints for this file as a list."}, + {"close", (PyCFunction)Indexer_close, METH_NOARGS, + "Release this Python wrapper's native indexer handle.\n" + "\n" + "The shared RocksDB instance for the same .dftindex path remains managed\n" + "by the native RocksDBManager cache."}, {"__enter__", (PyCFunction)Indexer_enter, METH_NOARGS, "Enter the runtime context for the with statement."}, {"__exit__", (PyCFunction)Indexer_exit, METH_VARARGS, - "Exit the runtime context for the with statement."}, + "Release this Python wrapper on context exit.\n" + "\n" + "This does not force-close the shared RocksDB instance for the same\n" + ".dftindex path."}, {NULL} /* Sentinel */ }; static PyGetSetDef Indexer_getsetters[] = { {"gz_path", (getter)Indexer_gz_path, NULL, "Path to the gzip file", NULL}, - {"idx_path", (getter)Indexer_idx_path, NULL, "Path to the index file", - NULL}, + {"index_path", (getter)Indexer_index_path, NULL, + "Path to the .dftindex store", NULL}, {"checkpoint_size", (getter)Indexer_checkpoint_size, NULL, "Checkpoint size in bytes", NULL}, {"has_bloom", (getter)Indexer_has_bloom, NULL, @@ -420,7 +451,7 @@ PyTypeObject IndexerType = { 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ - "Indexer(gz_path: str, idx_path: str | None = None,\n" + "Indexer(gz_path: str, index_path: str | None = None,\n" " checkpoint_size: int = 1048576,\n" " force_rebuild: bool = False, build_bloom: bool = False,\n" " build_manifest: bool = False,\n" @@ -428,20 +459,20 @@ PyTypeObject IndexerType = { " runtime: Runtime | None = None)\n" "--\n" "\n" - "Indexer for creating and managing gzip file indices.\n" + "Indexer for creating and managing gzip trace index stores.\n" "\n" "Args:\n" " gz_path 
(str): Path to the gzip trace file.\n" - " idx_path (str or None): Path to the index file. If None,\n" - " uses gz_path + \".idx\".\n" + " index_path (str or None): Path to the .dftindex store. If None,\n" + " uses the root-local \".dftindex\" next to gz_path.\n" " checkpoint_size (int): Checkpoint size in bytes for index\n" " building (default 1 MB).\n" " force_rebuild (bool): If True, rebuild the index even if it\n" " exists.\n" " build_bloom (bool): If True, build bloom filter data in the\n" - " index.\n" + " store.\n" " build_manifest (bool): If True, build manifest data in the\n" - " index.\n" + " store.\n" " index_threshold (int): Skip indexing for files smaller than\n" " this (default 1 MB).\n" " runtime (Runtime or None): Runtime instance for thread pool\n" diff --git a/src/dftracer/utils/python/indexer.h b/src/dftracer/utils/python/indexer.h index 536ca97e..d31d0ccf 100644 --- a/src/dftracer/utils/python/indexer.h +++ b/src/dftracer/utils/python/indexer.h @@ -9,7 +9,7 @@ typedef struct { PyObject_HEAD dft_indexer_handle_t handle; PyObject *gz_path; - PyObject *idx_path; + PyObject *index_path; std::uint64_t checkpoint_size; int build_bloom; int build_manifest; diff --git a/src/dftracer/utils/python/trace_reader.cpp b/src/dftracer/utils/python/trace_reader.cpp index 5d0dcb75..f50c0e33 100644 --- a/src/dftracer/utils/python/trace_reader.cpp +++ b/src/dftracer/utils/python/trace_reader.cpp @@ -895,7 +895,8 @@ static PyObject *TraceReader_iter_lines(TraceReaderObject *self, PyObject *args, Runtime *rt = get_runtime(self); try { - rt->submit(produce_lines(state, cfg, rc), "iter_lines"); + auto handle = rt->submit(produce_lines(state, cfg, rc), "iter_lines"); + state->task_future = handle.future; } catch (const std::exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL; @@ -954,7 +955,8 @@ static PyObject *TraceReader_iter_raw(TraceReaderObject *self, PyObject *args, Runtime *rt = get_runtime(self); try { - rt->submit(produce_raw(state, cfg, 
rc), "iter_raw"); + auto handle = rt->submit(produce_raw(state, cfg, rc), "iter_raw"); + state->task_future = handle.future; } catch (const std::exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL; @@ -1026,7 +1028,9 @@ static PyObject *TraceReader_iter_lines_json(TraceReaderObject *self, Runtime *rt = get_runtime(self); try { - rt->submit(produce_lines(state, cfg, rc), "iter_lines_json"); + auto handle = + rt->submit(produce_lines(state, cfg, rc), "iter_lines_json"); + state->task_future = handle.future; } catch (const std::exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL; @@ -1100,10 +1104,12 @@ static PyObject *TraceReader_iter_arrow(TraceReaderObject *self, PyObject *args, Runtime *rt = get_runtime(self); try { - rt->submit(produce_arrow_batches(state, cfg, rc, - static_cast(batch_size), - flatten_objects != 0, normalize != 0), - "iter_arrow"); + auto handle = + rt->submit(produce_arrow_batches( + state, cfg, rc, static_cast(batch_size), + flatten_objects != 0, normalize != 0), + "iter_arrow"); + state->task_future = handle.future; } catch (const std::exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL; @@ -1294,7 +1300,10 @@ static PyMethodDef TraceReader_methods[] = { {"__enter__", (PyCFunction)TraceReader_enter, METH_NOARGS, "Enter the runtime context for the with statement."}, {"__exit__", (PyCFunction)TraceReader_exit, METH_VARARGS, - "Exit the runtime context for the with statement."}, + "Exit the runtime context for the with statement.\n" + "\n" + "TraceReader does not own the shared RocksDB instance for an index path;\n" + "any shared DB lifetime remains manager-owned on the native side."}, {NULL}}; static PyGetSetDef TraceReader_getsetters[] = { @@ -1336,13 +1345,13 @@ PyTypeObject TraceReaderType = { "--\n" "\n" "Smart trace file reader that auto-selects sequential or indexed\n" - "reading based on whether an ``.idx`` sidecar exists.\n" + "reading based on whether a ``.dftindex`` 
store exists.\n" "\n" "Args:\n" " file_path (str): Path to the trace file (.pfw.gz or plain " "text).\n" - " index_dir (str): Directory to search for ``.idx`` sidecar " - "files.\n" + " index_dir (str): Directory to search for ``.dftindex`` " + "stores.\n" " Empty string (default) searches next to the trace file.\n" " checkpoint_size (int): Checkpoint interval in bytes for index\n" " building (default 32 MB).\n" diff --git a/src/dftracer/utils/python/trace_reader_iterator.cpp b/src/dftracer/utils/python/trace_reader_iterator.cpp index f3a669db..87bf54a5 100644 --- a/src/dftracer/utils/python/trace_reader_iterator.cpp +++ b/src/dftracer/utils/python/trace_reader_iterator.cpp @@ -138,17 +138,37 @@ PyTypeObject ArrowBatchCapsuleType = { static void TraceReaderIterator_dealloc(TraceReaderIteratorObject *self) { #ifdef DFTRACER_UTILS_ENABLE_ARROW if (self->arrow_state) { + auto task_future = self->arrow_state->task_future; self->arrow_state->cancelled.store(true, std::memory_order_release); self->arrow_state->cv_producer.notify_all(); self->arrow_state->cv_consumer.notify_all(); // wake blocked __next__ - self->arrow_state.reset(); + Py_BEGIN_ALLOW_THREADS { + std::unique_lock lock(self->arrow_state->mtx); + self->arrow_state->cv_consumer.wait(lock, [self] { + return self->arrow_state->done.load(std::memory_order_acquire); + }); + } + if (task_future.valid()) { + task_future.wait(); + } + Py_END_ALLOW_THREADS self->arrow_state.reset(); } #endif if (self->state) { + auto task_future = self->state->task_future; self->state->cancelled.store(true, std::memory_order_release); self->state->cv_producer.notify_all(); self->state->cv_consumer.notify_all(); // wake blocked __next__ - self->state.reset(); + Py_BEGIN_ALLOW_THREADS { + std::unique_lock lock(self->state->mtx); + self->state->cv_consumer.wait(lock, [self] { + return self->state->done.load(std::memory_order_acquire); + }); + } + if (task_future.valid()) { + task_future.wait(); + } + Py_END_ALLOW_THREADS 
self->state.reset(); } Py_TYPE(self)->tp_free((PyObject *)self); } diff --git a/src/dftracer/utils/python/trace_reader_iterator.h b/src/dftracer/utils/python/trace_reader_iterator.h index 8c985721..11941fd8 100644 --- a/src/dftracer/utils/python/trace_reader_iterator.h +++ b/src/dftracer/utils/python/trace_reader_iterator.h @@ -2,6 +2,7 @@ #define DFTRACER_UTILS_PYTHON_TRACE_READER_ITERATOR_H #include +#include #include #include @@ -40,6 +41,7 @@ struct IteratorState { std::atomic cancelled{false}; std::atomic done{false}; std::size_t max_queue_size = 64; + std::shared_future task_future; }; #ifdef DFTRACER_UTILS_ENABLE_ARROW @@ -54,6 +56,7 @@ struct ArrowIteratorState { std::atomic cancelled{false}; std::atomic done{false}; std::size_t max_queue_size = 8; + std::shared_future task_future; }; #endif diff --git a/src/dftracer/utils/python/utilities/aggregator.cpp b/src/dftracer/utils/python/utilities/aggregator.cpp index 02154697..6f7799d4 100644 --- a/src/dftracer/utils/python/utilities/aggregator.cpp +++ b/src/dftracer/utils/python/utilities/aggregator.cpp @@ -302,7 +302,7 @@ static PyMethodDef Aggregator_methods[] = { " group_keys (list[str] or None): Extra grouping dims (default None).\n" " categories (list[str] or None): Category filter (default None).\n" " names (list[str] or None): Name filter (default None).\n" - " index_dir (str): Index sidecar directory (default '').\n" + " index_dir (str): Directory for .dftindex stores (default '').\n" " checkpoint_size (int): Checkpoint size (default 33554432).\n" " force_rebuild (bool): Force index rebuild (default False).\n" " chunk_size_mb (int): Target chunk size in MB (default 64).\n" @@ -334,7 +334,7 @@ static PyMethodDef Aggregator_methods[] = { " group_keys (list[str] or None): Extra grouping dims (default None).\n" " categories (list[str] or None): Category filter (default None).\n" " names (list[str] or None): Name filter (default None).\n" - " index_dir (str): Index sidecar directory (default '').\n" + " 
index_dir (str): Directory for .dftindex stores (default '').\n" " checkpoint_size (int): Checkpoint size (default 33554432).\n" " force_rebuild (bool): Force index rebuild (default False).\n" " chunk_size_mb (int): Target chunk size in MB (default 64).\n" diff --git a/src/dftracer/utils/python/utilities/comparator.cpp b/src/dftracer/utils/python/utilities/comparator.cpp index b9b842b4..c377fac7 100644 --- a/src/dftracer/utils/python/utilities/comparator.cpp +++ b/src/dftracer/utils/python/utilities/comparator.cpp @@ -195,7 +195,7 @@ CoroTask run_aggregation( -> CoroTask { [[maybe_unused]] auto producer_guard = ch.guard(); - std::string idx_path = + std::string index_path = composites::dft::internal::determine_index_path( file_path, index_dir); @@ -204,7 +204,7 @@ CoroTask run_aggregation( from_file(file_path) .with_checkpoint_size(checkpoint_size) .with_force_rebuild(force_rebuild) - .with_index(idx_path); + .with_index(index_path); auto metadata = co_await composites::dft::MetadataCollectorUtility{} .process(meta_input); @@ -392,6 +392,14 @@ static int run_comparison_pipeline(ComparatorObject *self, // Build indexes upfront { + if (config.force_rebuild && !baseline_files.empty()) { + const std::string shared_index_path = + composites::dft::internal::determine_index_path( + baseline_files.front(), config.index_dir); + if (fs::exists(shared_index_path)) { + fs::remove_all(shared_index_path); + } + } std::unordered_set seen; std::vector all_files; for (const auto &f : baseline_files) { @@ -406,7 +414,7 @@ static int run_comparison_pipeline(ComparatorObject *self, idx_configs.push_back( indexer::IndexBuildConfig::for_file(file_path) .with_checkpoint_size(config.checkpoint_size) - .with_force_rebuild(config.force_rebuild) + .with_force_rebuild(false) .with_index_dir(config.index_dir)); } std::vector> idx_tasks; @@ -646,7 +654,7 @@ static const char *COMPARE_DOC = "(default 5000).\n" " threshold (float): Hide changes below this pct.\n" " executor_threads (int): 
Parallel threads (0=auto).\n" - " index_dir (str): Index sidecar directory.\n" + " index_dir (str): Directory for .dftindex stores.\n" " force_rebuild (bool): Force index rebuild.\n" " config (str): JSON config file path.\n" "\n" @@ -673,7 +681,7 @@ static const char *COMPARE_JSON_DOC = "(default 5000).\n" " threshold (float): Hide changes below this pct.\n" " executor_threads (int): Parallel threads (0=auto).\n" - " index_dir (str): Index sidecar directory.\n" + " index_dir (str): Directory for .dftindex stores.\n" " force_rebuild (bool): Force index rebuild.\n" " config (str): JSON config file path.\n" "\n" @@ -700,7 +708,7 @@ static const char *COMPARE_TABLE_DOC = "(default 5000).\n" " threshold (float): Hide changes below this pct.\n" " executor_threads (int): Parallel threads (0=auto).\n" - " index_dir (str): Index sidecar directory.\n" + " index_dir (str): Directory for .dftindex stores.\n" " force_rebuild (bool): Force index rebuild.\n" " config (str): JSON config file path.\n" "\n" diff --git a/src/dftracer/utils/python/utilities/metadata_collector.cpp b/src/dftracer/utils/python/utilities/metadata_collector.cpp index 529728a3..7776c399 100644 --- a/src/dftracer/utils/python/utilities/metadata_collector.cpp +++ b/src/dftracer/utils/python/utilities/metadata_collector.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -74,6 +75,7 @@ static PyObject *MetadataCollector_collect(MetadataCollectorObject *self, return NULL; std::string file_path_str(file_path); + std::string index_dir_str(index_dir); std::string error_msg; MetadataCollectorUtilityOutput output; @@ -82,7 +84,8 @@ static PyObject *MetadataCollector_collect(MetadataCollectorObject *self, MetadataCollectorUtilityInput input; input.file_path = file_path_str; - input.idx_path = file_path_str + ".idx"; + input.index_path = dftracer::utils::utilities::composites::dft:: + internal::determine_index_path(file_path_str, index_dir_str); auto *out_p = &output; auto input_copy = input; 
@@ -160,7 +163,7 @@ static PyObject *MetadataCollector_collect(MetadataCollectorObject *self, } while (0) SET_STR("file_path", output.file_path.c_str()); - SET_STR("idx_path", output.idx_path.c_str()); + SET_STR("index_path", output.index_path.c_str()); SET_DBL("size_mb", output.size_mb); SET_SZT("start_line", output.start_line); SET_SZT("end_line", output.end_line); @@ -199,7 +202,7 @@ static PyMethodDef MetadataCollector_methods[] = { "\n" "Args:\n" " file_path (str): Path to the trace file.\n" - " index_dir (str): Directory for index sidecars.\n"}, + " index_dir (str): Directory for .dftindex stores.\n"}, {NULL}}; PyTypeObject MetadataCollectorType = { @@ -231,7 +234,7 @@ PyTypeObject MetadataCollectorType = { "\n" "process(file_path, index_dir='') -> dict\n" " file_path (str): Path to the trace file.\n" - " index_dir (str): Directory for index sidecar files.\n", + " index_dir (str): Directory for .dftindex stores.\n", 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ diff --git a/src/dftracer/utils/python/utilities/reconstruction_planner.cpp b/src/dftracer/utils/python/utilities/reconstruction_planner.cpp index 47c6e30a..38100877 100644 --- a/src/dftracer/utils/python/utilities/reconstruction_planner.cpp +++ b/src/dftracer/utils/python/utilities/reconstruction_planner.cpp @@ -210,7 +210,7 @@ static PyMethodDef ReconstructionPlanner_methods[] = { "\n" "Args:\n" " reorganized_files (list[str]): Paths to reorganized files.\n" - " index_dir (str): Directory for index sidecars (default '').\n" + " index_dir (str): Directory for .dftindex stores (default '').\n" "\n" "Returns:\n" " dict: Reconstruction plan.\n"}, @@ -250,7 +250,7 @@ PyTypeObject ReconstructionPlannerType = { "\n" "process(reorganized_files, index_dir='') -> dict\n" " reorganized_files (list[str]): Paths to reorganized trace files.\n" - " index_dir (str): Directory containing provenance index sidecars.\n", + " index_dir (str): Directory containing `.dftindex` stores.\n", /* tp_doc */ 0, 
/* tp_traverse */ 0, /* tp_clear */ diff --git a/src/dftracer/utils/python/utilities/reorganization_planner.cpp b/src/dftracer/utils/python/utilities/reorganization_planner.cpp index 848f0bd5..929bfe79 100644 --- a/src/dftracer/utils/python/utilities/reorganization_planner.cpp +++ b/src/dftracer/utils/python/utilities/reorganization_planner.cpp @@ -178,8 +178,8 @@ static PyObject *ReorganizationPlanner_plan(ReorganizationPlannerObject *self, } PyDict_SetItemString(entry, "file_path", PyUnicode_FromString(sf.file_path.c_str())); - PyDict_SetItemString(entry, "idx_path", - PyUnicode_FromString(sf.idx_path.c_str())); + PyDict_SetItemString(entry, "index_path", + PyUnicode_FromString(sf.index_path.c_str())); PyDict_SetItemString(entry, "num_checkpoints", PyLong_FromSize_t(sf.num_checkpoints)); PyDict_SetItemString(entry, "uncompressed_size", @@ -254,7 +254,7 @@ static PyMethodDef ReorganizationPlanner_methods[] = { " source_files (list[str]): Paths to source trace files.\n" " groups (list[dict] or None): Predicate group definitions\n" " (default None).\n" - " index_dir (str): Directory for index sidecars (default '').\n" + " index_dir (str): Directory for .dftindex stores (default '').\n" "\n" "Returns:\n" " dict: Extraction plan.\n"}, diff --git a/src/dftracer/utils/python/utilities/statistics_aggregator.cpp b/src/dftracer/utils/python/utilities/statistics_aggregator.cpp index 8ea1ab4b..660d6149 100644 --- a/src/dftracer/utils/python/utilities/statistics_aggregator.cpp +++ b/src/dftracer/utils/python/utilities/statistics_aggregator.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -83,7 +84,8 @@ static PyObject *StatisticsAggregator_compute(StatisticsAggregatorObject *self, StatisticsAggregatorInput input; input.file_path = file_path_str; input.index_dir = index_dir_str; - input.idx_path = file_path_str + ".idx"; + input.index_path = dftracer::utils::utilities::composites::dft:: + internal::determine_index_path(file_path_str, 
index_dir_str); auto *stats_p = &stats; auto input_copy = input; @@ -199,7 +201,7 @@ static PyMethodDef StatisticsAggregator_methods[] = { "\n" "Args:\n" " file_path (str): Path to the trace file.\n" - " index_dir (str): Directory for index sidecars (default '').\n" + " index_dir (str): Directory for .dftindex stores (default '').\n" "\n" "Returns:\n" " dict: Aggregated statistics.\n"}, diff --git a/src/dftracer/utils/python/utilities/statistics_query.cpp b/src/dftracer/utils/python/utilities/statistics_query.cpp index a7e27265..95a6c173 100644 --- a/src/dftracer/utils/python/utilities/statistics_query.cpp +++ b/src/dftracer/utils/python/utilities/statistics_query.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -117,7 +118,8 @@ static PyObject *StatisticsQuery_query(StatisticsQueryObject *self, StatisticsAggregatorInput agg_input; agg_input.file_path = file_path_str; agg_input.index_dir = index_dir_str; - agg_input.idx_path = file_path_str + ".idx"; + agg_input.index_path = dftracer::utils::utilities::composites::dft:: + internal::determine_index_path(file_path_str, index_dir_str); auto *stats_p = &stats; auto agg_task = [stats_p, agg_input]() -> CoroTask { @@ -251,7 +253,7 @@ static PyMethodDef StatisticsQuery_methods[] = { " 'time_range', 'duration_stats', 'top_n_names',\n" " 'top_n_categories', 'detailed'.\n" " top_n (int): Top results for ranked queries (default 10).\n" - " index_dir (str): Directory for index sidecars (default '').\n" + " index_dir (str): Directory for .dftindex stores (default '').\n" "\n" "Returns:\n" " dict: Query results.\n"}, diff --git a/src/dftracer/utils/server/trace_api.cpp b/src/dftracer/utils/server/trace_api.cpp index 9663ed24..1b8f3ca0 100644 --- a/src/dftracer/utils/server/trace_api.cpp +++ b/src/dftracer/utils/server/trace_api.cpp @@ -76,7 +76,7 @@ static const std::unordered_set HASH_METADATA_NAMES = {"FH", "HH", using dftracer::utils::utilities::common::json::JsonDocGuard; using 
dftracer::utils::utilities::common::query::Query; -/// Direct-scan a small file without any sidecar index. +/// Direct-scan a small file without any `.dftindex` store. /// Streams via async_streaming_gz_lines(), parses JSON, applies /// predicate filters, collects matching events as raw JSON strings. static coro::CoroTask direct_scan_events( @@ -413,7 +413,8 @@ static coro::AsyncGenerator stream_events( ViewBuilderInput builder_input; builder_input.with_view(ev_view) .with_file_path(file_info->path) - .with_idx_path(file_info->has_bloom_data ? file_info->idx_path : "") + .with_index_path(file_info->has_bloom_data ? file_info->index_path + : "") .with_uncompressed_size(file_info->uncompressed_size) .with_num_checkpoints(file_info->num_checkpoints) .with_bloom_cache(bloom_cache) @@ -428,7 +429,7 @@ static coro::AsyncGenerator stream_events( ViewReaderInput reader_input; reader_input.with_file_path(file_info->path) - .with_idx_path(file_info->idx_path) + .with_index_path(file_info->index_path) .with_byte_range(candidate.start_byte, candidate.end_byte) .with_checkpoint_idx(candidate.checkpoint_idx) .with_view(ev_view); @@ -529,7 +530,7 @@ static coro::CoroTask handle_stats(const HttpRequest& req, for (auto* file_info : stat_files) { StatisticsAggregatorInput agg_input; agg_input.file_path = file_info->path; - agg_input.idx_path = file_info->idx_path; + agg_input.index_path = file_info->index_path; agg_input.index_dir = index.index_dir(); StatisticsAggregatorUtility aggregator; @@ -569,7 +570,7 @@ static coro::CoroTask handle_stats(const HttpRequest& req, StatisticsAggregatorInput agg_input; agg_input.file_path = file_info->path; - agg_input.idx_path = file_info->idx_path; + agg_input.index_path = file_info->index_path; agg_input.index_dir = *index_dir_ptr; StatisticsAggregatorUtility aggregator; diff --git a/src/dftracer/utils/server/trace_index.cpp b/src/dftracer/utils/server/trace_index.cpp index e521e3c7..9ccde5d3 100644 --- 
a/src/dftracer/utils/server/trace_index.cpp +++ b/src/dftracer/utils/server/trace_index.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -50,7 +51,7 @@ coro::CoroTask TraceIndex::initialize() { for (const auto& entry : entries) { FileInfo info; info.path = entry.path.string(); - info.idx_path = internal::determine_index_path(info.path, index_dir_); + info.index_path = internal::determine_index_path(info.path, index_dir_); std::error_code ec; auto fsize = fs::file_size(info.path, ec); @@ -68,8 +69,8 @@ coro::CoroTask TraceIndex::initialize() { static_cast(info.compressed_size) / (1024.0 * 1024.0); small_count++; } else { - info.has_bloom_data = fs::exists(info.idx_path); - info.has_checkpoint_index = fs::exists(info.idx_path); + info.has_bloom_data = fs::exists(info.index_path); + info.has_checkpoint_index = fs::exists(info.index_path); if (!info.has_bloom_data) { needs_build.push_back(idx); } else { @@ -83,7 +84,7 @@ coro::CoroTask TraceIndex::initialize() { if (small_count > 0) { DFTRACER_UTILS_LOG_INFO( "TraceIndex: %zu small file(s) (< %zu bytes) will be " - "streamed directly (no sidecar indexes)", + "streamed directly (no .dftindex database)", small_count, INDEX_SIZE_THRESHOLD); } @@ -155,12 +156,12 @@ coro::CoroTask TraceIndex::initialize() { co_await builder.process(config); if (result.success) { - info->idx_path = + info->index_path = internal::determine_index_path( info->path, *index_dir_ptr); info->has_bloom_data = true; info->has_checkpoint_index = - fs::exists(info->idx_path); + fs::exists(info->index_path); } else { DFTRACER_UTILS_LOG_WARN( "TraceIndex: failed to " @@ -209,29 +210,43 @@ coro::CoroTask TraceIndex::initialize() { if (info->has_bloom_data) { try { - indexer::IndexDatabase idx_db( - info->idx_path); - auto logical = indexer::internal:: - get_logical_path(info->path); - int fid = idx_db.get_file_info_id( - logical); - if (fid >= 0) { - auto tb = - idx_db.query_time_bounds( - fid); - if (tb.valid) { - 
info->min_timestamp_us = - tb.min_timestamp_us; - info->max_timestamp_us = - tb.max_timestamp_us; - } + const std::string path = info->path; + const std::string index_path = + info->index_path; + const auto* path_ptr = &path; + const auto* index_path_ptr = + &index_path; + auto bounds = co_await rocksdb::run( + [path_ptr, index_path_ptr] { + indexer::IndexDatabase + idx_db(*index_path_ptr); + auto logical = + indexer::internal:: + get_logical_path( + *path_ptr); + int fid = + idx_db.get_file_info_id( + logical); + if (fid < 0) { + return indexer:: + IndexDatabase:: + TimeBounds{}; + } + return idx_db + .query_time_bounds(fid); + }); + if (bounds.valid) { + info->min_timestamp_us = + bounds.min_timestamp_us; + info->max_timestamp_us = + bounds.max_timestamp_us; } } catch (const std::exception& e) { DFTRACER_UTILS_LOG_WARN( "TraceIndex: failed to " "read time bounds from " "%s: %s", - info->idx_path.c_str(), + info->index_path.c_str(), e.what()); } } @@ -239,7 +254,7 @@ coro::CoroTask TraceIndex::initialize() { auto meta_input = MetadataCollectorUtilityInput:: from_file(info->path) - .with_index(info->idx_path); + .with_index(info->index_path); auto metadata = co_await MetadataCollectorUtility{} .process(meta_input); diff --git a/src/dftracer/utils/server/viz_api.cpp b/src/dftracer/utils/server/viz_api.cpp index 5df0efbb..9917c765 100644 --- a/src/dftracer/utils/server/viz_api.cpp +++ b/src/dftracer/utils/server/viz_api.cpp @@ -209,7 +209,7 @@ static void apply_filters(std::string& dsl, std::string_view filters_str) { } } -/// Direct-scan a small file without any sidecar index. +/// Direct-scan a small file without any `.dftindex` store. /// Streams via async_streaming_gz_lines(), parses JSON, applies /// predicate filters, collects matching events as raw JSON strings. 
static coro::CoroTask direct_scan_events( @@ -447,8 +447,8 @@ static coro::CoroTask handle_viz_events( ViewBuilderInput builder_input; builder_input.with_view(view) .with_file_path(file_info->path) - .with_idx_path( - file_info->has_bloom_data ? file_info->idx_path : "") + .with_index_path( + file_info->has_bloom_data ? file_info->index_path : "") .with_uncompressed_size(file_info->uncompressed_size) .with_num_checkpoints(file_info->num_checkpoints) .with_bloom_cache(&index.bloom_cache()) @@ -467,7 +467,7 @@ static coro::CoroTask handle_viz_events( } ViewReaderInput reader_input; reader_input.with_file_path(file_info->path) - .with_idx_path(file_info->idx_path) + .with_index_path(file_info->index_path) .with_byte_range(candidate.start_byte, candidate.end_byte) .with_checkpoint_idx(candidate.checkpoint_idx) @@ -555,9 +555,9 @@ static coro::CoroTask handle_viz_events( ViewBuilderInput builder_input; builder_input.with_view(*view_ptr) .with_file_path(file_info->path) - .with_idx_path(file_info->has_bloom_data - ? file_info->idx_path - : "") + .with_index_path(file_info->has_bloom_data + ? 
file_info->index_path + : "") .with_uncompressed_size( file_info->uncompressed_size) .with_num_checkpoints(file_info->num_checkpoints) @@ -577,7 +577,7 @@ static coro::CoroTask handle_viz_events( ViewReaderInput reader_input; reader_input.with_file_path(file_info->path) - .with_idx_path(file_info->idx_path) + .with_index_path(file_info->index_path) .with_byte_range(candidate.start_byte, candidate.end_byte) .with_checkpoint_idx(candidate.checkpoint_idx) diff --git a/src/dftracer/utils/utilities/call_tree/call_tree_internal.cpp b/src/dftracer/utils/utilities/call_tree/call_tree_internal.cpp index 98bfbaca..a81d2c23 100644 --- a/src/dftracer/utils/utilities/call_tree/call_tree_internal.cpp +++ b/src/dftracer/utils/utilities/call_tree/call_tree_internal.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -300,7 +301,7 @@ bool TraceReader::read_with_reader(const std::string& trace_file, auto format = dftracer::utils::FormatDetector::detect(trace_file); // For GZIP files, skip Reader API and use direct zlib decompression - // since Reader API requires .idx files + // since this path expects a prebuilt `.dftindex` store. 
if (format == dftracer::utils::ArchiveFormat::GZIP) { return false; // Will trigger fallback to read_direct which handles // gzip @@ -313,13 +314,13 @@ bool TraceReader::read_with_reader(const std::string& trace_file, return false; } - // Generate index file path - std::string idx_file = trace_file + ".idx"; + std::string index_path = dftracer::utils::utilities::composites::dft:: + internal::determine_index_path(trace_file, ""); // Create reader (this will auto-build index if needed) auto reader = dftracer::utils::utilities::reader::internal::ReaderFactory::create( - trace_file, idx_file); + trace_file, index_path); if (!reader || !reader->is_valid()) { DFTRACER_UTILS_LOG_ERROR("Failed to create reader for %s", trace_file.c_str()); @@ -881,4 +882,4 @@ void CallTree::print_calls_recursive(const ProcessCallTree& graph, } } // namespace internal -} // namespace dftracer::utils::call_tree \ No newline at end of file +} // namespace dftracer::utils::call_tree diff --git a/src/dftracer/utils/utilities/call_tree/call_tree_mpi.cpp b/src/dftracer/utils/utilities/call_tree/call_tree_mpi.cpp index ff7ea5e9..8a6fe6de 100644 --- a/src/dftracer/utils/utilities/call_tree/call_tree_mpi.cpp +++ b/src/dftracer/utils/utilities/call_tree/call_tree_mpi.cpp @@ -241,10 +241,11 @@ bool MPIFilteredTraceReader::read(const std::string& trace_file, ArchiveFormat format = FormatDetector::detect(trace_file); if (format == ArchiveFormat::GZIP) { - // Try to use indexer - std::string idx_file = trace_file + ".idx"; - if (fs::exists(idx_file)) { - return read_with_indexer(trace_file, idx_file, graph); + std::string index_path = + utilities::composites::dft::internal::determine_index_path( + trace_file, ""); + if (fs::exists(index_path)) { + return read_with_indexer(trace_file, index_path, graph); } } @@ -610,12 +611,14 @@ std::set MPICallTreeBuilder::scan_file_for_pids( // Check if it's a gzip file with an index ArchiveFormat format = FormatDetector::detect(trace_file); - std::string idx_file = 
trace_file + ".idx"; + std::string index_path = + utilities::composites::dft::internal::determine_index_path(trace_file, + ""); - if (format == ArchiveFormat::GZIP && fs::exists(idx_file)) { + if (format == ArchiveFormat::GZIP && fs::exists(index_path)) { try { auto reader = utilities::reader::internal::ReaderFactory::create( - trace_file, idx_file); + trace_file, index_path); if (reader && reader->is_valid()) { // Read first N lines to discover PIDs std::size_t num_lines = reader->get_num_lines(); diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.cpp index c4bec8af..465d4029 100644 --- a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.cpp @@ -257,6 +257,15 @@ coro::AsyncGenerator AggregatorUtility::process( EventAggregatorUtility merger; std::atomic global_chunk_idx{0}; + if (input.force_rebuild && !input_files.empty()) { + const std::string shared_index_path = + composites::dft::internal::determine_index_path( + input_files.front(), effective_index_dir); + if (fs::exists(shared_index_path)) { + fs::remove_all(shared_index_path); + } + } + for (const auto& file_path : input_files) { bool is_compressed = file_path.size() >= 3 && @@ -268,7 +277,7 @@ coro::AsyncGenerator AggregatorUtility::process( file_path, effective_index_dir); auto idx_input = indexer::IndexBuildConfig::for_file(file_path) .with_checkpoint_size(input.checkpoint_size) - .with_force_rebuild(input.force_rebuild) + .with_force_rebuild(false) .with_index_dir(effective_index_dir); co_await indexer::IndexBuilderUtility{}.process(idx_input); } @@ -277,7 +286,7 @@ coro::AsyncGenerator AggregatorUtility::process( auto meta_input = composites::dft::MetadataCollectorUtilityInput::from_file(file_path) .with_checkpoint_size(input.checkpoint_size) - 
.with_force_rebuild(input.force_rebuild) + .with_force_rebuild(false) .with_index(idx_path); auto metadata = co_await composites::dft::MetadataCollectorUtility{}.process( diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.cpp index c156263c..4618338d 100644 --- a/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.cpp @@ -206,9 +206,9 @@ coro::CoroTask ChunkAggregatorUtility::process( TraceReaderConfig reader_cfg; reader_cfg.file_path = input.file_path; - if (!input.idx_path.empty()) { + if (!input.index_path.empty()) { reader_cfg.index_dir = - input.idx_path.substr(0, input.idx_path.rfind('/')); + input.index_path.substr(0, input.index_path.rfind('/')); } reader_cfg.checkpoint_size = input.checkpoint_size; TraceReader trace_reader(reader_cfg); diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_mapper_utility.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_mapper_utility.cpp index 085e95d1..a516fa0f 100644 --- a/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_mapper_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_mapper_utility.cpp @@ -28,7 +28,7 @@ coro::CoroTask FileChunkMapperUtility::process( FileChunkMapperOutput chunks; ChunkAggregatorInput chunk; chunk.with_file_path(meta.file_path) - .with_idx_path(meta.idx_path) + .with_index_path(meta.index_path) .with_byte_range(0, 0) .with_line_range(0, 0) .with_chunk_index(input.start_chunk_index) @@ -65,7 +65,7 @@ coro::CoroTask FileChunkMapperUtility::process( ChunkAggregatorInput chunk; chunk.with_file_path(meta.file_path) - .with_idx_path(meta.idx_path) + .with_index_path(meta.index_path) .with_byte_range(start_byte, end_byte) .with_line_range(start_line, end_line) 
.with_chunk_index(input.start_chunk_index + static_cast(i)) diff --git a/src/dftracer/utils/utilities/composites/dft/chunk_extractor_utility.cpp b/src/dftracer/utils/utilities/composites/dft/chunk_extractor_utility.cpp index f5931984..f3c752c9 100644 --- a/src/dftracer/utils/utilities/composites/dft/chunk_extractor_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/chunk_extractor_utility.cpp @@ -60,7 +60,7 @@ ChunkExtractorUtility::extract_and_write( auto reader_config = StreamingLineReaderConfig() .with_file(spec.file_path) - .with_index(spec.idx_path) + .with_index(spec.index_path) .with_line_range(spec.start_line, spec.end_line); auto line_gen = StreamingLineReader::read_async(reader_config); @@ -84,9 +84,9 @@ ChunkExtractorUtility::extract_and_write( } } } else { - if (!spec.idx_path.empty()) { + if (!spec.index_path.empty()) { auto reader = reader::internal::ReaderFactory::create( - spec.file_path, spec.idx_path); + spec.file_path, spec.index_path); auto line_gen = sources::async_indexed_file_bytes( reader, spec.start_byte, spec.end_byte); diff --git a/src/dftracer/utils/utilities/composites/dft/chunk_manifest_mapper_utility.cpp b/src/dftracer/utils/utilities/composites/dft/chunk_manifest_mapper_utility.cpp index af440ec2..b0261dd5 100644 --- a/src/dftracer/utils/utilities/composites/dft/chunk_manifest_mapper_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/chunk_manifest_mapper_utility.cpp @@ -78,7 +78,7 @@ ChunkManifestMapperUtility::process( static_cast(line_end - file.start_line + 1) * bytes_per_line); - internal::DFTracerChunkSpec spec(file.file_path, file.idx_path, + internal::DFTracerChunkSpec spec(file.file_path, file.index_path, size_to_take, start_byte, end_byte, current_start, line_end); diff --git a/src/dftracer/utils/utilities/composites/dft/event_collector_utility.cpp b/src/dftracer/utils/utilities/composites/dft/event_collector_utility.cpp index 84743909..5a6ebee1 100644 --- 
a/src/dftracer/utils/utilities/composites/dft/event_collector_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/event_collector_utility.cpp @@ -95,10 +95,10 @@ EventCollectorFromMetadataUtility::process( #endif EventIdCollector collector(events, input.trim_commas); - if (!file.idx_path.empty()) { + if (!file.index_path.empty()) { // Indexed/compressed file auto reader = reader::internal::ReaderFactory::create( - file.file_path, file.idx_path); + file.file_path, file.index_path); if (!reader) { DFTRACER_UTILS_LOG_ERROR("Failed to create reader for file: %s", file.file_path.c_str()); diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.cpp index 13e18d43..8f9ce32a 100644 --- a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.cpp @@ -153,7 +153,7 @@ coro::CoroTask ChunkIndexerUtility::process( auto reader_input = composites::IndexedReadInput::from_file(input.file_path) .with_checkpoint_size(input.checkpoint_size) - .with_index(input.idx_path); + .with_index(input.index_path); composites::IndexedFileReaderUtility reader_utility; reader = co_await reader_utility.process(reader_input); diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.cpp index 051e95b6..b30ecbf7 100644 --- a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include @@ -48,11 +48,11 @@ struct PrunerContext { // Hash resolution: human-readable value → hash strings std::unordered_map> hash_cache; - const sqlite::SqliteDatabase* db = nullptr; + const IndexDatabase* db = nullptr; int 
fid = -1; BloomFilterCache* cache; - std::string idx_path; + std::string index_path; // Resolve a value for a hash dimension. // Returns the hash strings if the dimension is a hash dim and @@ -67,7 +67,7 @@ struct PrunerContext { if (it != hash_cache.end()) return it->second; if (db) { - auto hashes = queries::query_hash_by_resolved(*db, dim, val); + auto hashes = db->query_hash_by_resolved(dim, val); auto& cached = hash_cache[key]; cached = std::move(hashes); return cached; @@ -347,7 +347,7 @@ coro::CoroTask ChunkPrunerUtility::process( out.file_may_match = false; try { - IndexDatabase idx_db(input.idx_path); + IndexDatabase idx_db(input.index_path); int fid = idx_db.get_file_info_id(get_logical_path(input.file_path)); if (fid < 0) { @@ -357,14 +357,13 @@ coro::CoroTask ChunkPrunerUtility::process( } // Load chunk dimension stats - auto dim_stats_rows = - queries::query_chunk_dimension_stats(idx_db.sql_db(), fid); + auto dim_stats_rows = idx_db.query_chunk_dimension_stats(fid); PrunerContext ctx; ctx.file_info_id = fid; ctx.cache = input.cache; - ctx.idx_path = input.idx_path; - ctx.db = &idx_db.sql_db(); + ctx.index_path = input.index_path; + ctx.db = &idx_db; ctx.fid = fid; for (const auto& ds : dim_stats_rows) { @@ -373,10 +372,9 @@ coro::CoroTask ChunkPrunerUtility::process( } // Load bloom filters for all dimensions - auto indexed_dims = - queries::query_index_dimensions(idx_db.sql_db(), fid); - auto all_chunk_blooms = queries::query_chunk_bloom_filters_batch( - idx_db.sql_db(), fid, indexed_dims); + auto indexed_dims = idx_db.query_index_dimensions(fid); + auto all_chunk_blooms = + idx_db.query_chunk_bloom_filters_batch(fid, indexed_dims); for (const auto& [dim, chunk_blooms] : all_chunk_blooms) { for (const auto& cb : chunk_blooms) { @@ -384,8 +382,8 @@ coro::CoroTask ChunkPrunerUtility::process( BloomFilter bf = BloomFilter::from_blob( cb.bloom_data.data(), cb.bloom_data.size()); if (input.cache) { - input.cache->put(input.idx_path, dim, cb.checkpoint_idx, 
- bf); + input.cache->put(input.index_path, dim, + cb.checkpoint_idx, bf); } ctx.bloom_filters[dim][cb.checkpoint_idx] = std::move(bf); } @@ -428,7 +426,7 @@ coro::CoroTask ChunkPrunerUtility::process( return out; }; - co_return co_await sqlite::run(do_query); + co_return co_await rocksdb::run(do_query); } } // namespace dftracer::utils::utilities::composites::dft::indexing diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.cpp index 9ee51be6..6aa594a9 100644 --- a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.cpp +++ b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include namespace dftracer::utils::utilities::composites::dft::indexing { @@ -15,11 +16,18 @@ void ChunkStatistics::update_from_event(std::string_view name, std::uint64_t dur) { ++total_events; - // 20 digits max per uint64 + ':' separator = 41 chars max - char pt_buf[52]; + constexpr std::size_t pid_tid_buf_size = + (2 * std::numeric_limits::digits10) + 3; + char pt_buf[pid_tid_buf_size]; auto [pp, ec1] = std::to_chars(pt_buf, pt_buf + sizeof(pt_buf), pid); + if (ec1 != std::errc{} || pp == pt_buf + sizeof(pt_buf)) { + throw std::runtime_error("failed to format pid"); + } *pp++ = ':'; auto [tp, ec2] = std::to_chars(pp, pt_buf + sizeof(pt_buf), tid); + if (ec2 != std::errc{}) { + throw std::runtime_error("failed to format tid"); + } std::string_view pt_sv(pt_buf, tp - pt_buf); category_counts[std::string(cat)]++; diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_chunk_bloom_filters.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_chunk_bloom_filters.cpp deleted file mode 100644 index 45c6b7b8..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_chunk_bloom_filters.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include 
-#include -#include - -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -void delete_chunk_bloom_filters(const SqliteDatabase& db, int file_info_id, - std::string_view dimension) { - SqliteStmt stmt(db, - "DELETE FROM chunk_bloom_filters " - "WHERE file_info_id = ? AND dimension = ?;"); - - stmt.bind_int(1, file_info_id); - stmt.bind_text(2, dimension); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to delete chunk bloom filters: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_chunk_dimension_stats.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_chunk_dimension_stats.cpp deleted file mode 100644 index f804a3ff..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_chunk_dimension_stats.cpp +++ /dev/null @@ -1,23 +0,0 @@ -#include -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -void delete_chunk_dimension_stats(const SqliteDatabase& db, int file_info_id) { - SqliteStmt stmt( - db, "DELETE FROM chunk_dimension_stats WHERE file_info_id = ?;"); - stmt.bind_int(1, file_info_id); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to delete chunk dimension stats: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_chunk_statistics.cpp 
b/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_chunk_statistics.cpp deleted file mode 100644 index 801ac157..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_chunk_statistics.cpp +++ /dev/null @@ -1,23 +0,0 @@ -#include -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -void delete_chunk_statistics(const SqliteDatabase& db, int file_info_id) { - SqliteStmt stmt(db, "DELETE FROM chunk_statistics WHERE file_info_id = ?;"); - - stmt.bind_int(1, file_info_id); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to delete chunk statistics: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_event_ranges.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_event_ranges.cpp deleted file mode 100644 index f7228e0f..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_event_ranges.cpp +++ /dev/null @@ -1,24 +0,0 @@ -#include -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -void delete_event_ranges(const SqliteDatabase& db, int file_info_id) { - SqliteStmt stmt(db, - "DELETE FROM checkpoint_event_ranges " - "WHERE file_info_id = ?;"); - stmt.bind_int(1, file_info_id); - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to delete event ranges: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -} // namespace - // dftracer::utils::utilities::composites::dft::indexing::queries 
diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_file_bloom_filter.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_file_bloom_filter.cpp deleted file mode 100644 index ca480c1d..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_file_bloom_filter.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include -#include -#include - -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -void delete_file_bloom_filter(const SqliteDatabase& db, int file_info_id, - std::string_view dimension) { - SqliteStmt stmt(db, - "DELETE FROM file_bloom_filters " - "WHERE file_info_id = ? AND dimension = ?;"); - - stmt.bind_int(1, file_info_id); - stmt.bind_text(2, dimension); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to delete file bloom filter: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_hash_resolutions.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_hash_resolutions.cpp deleted file mode 100644 index 5b5a58ad..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_hash_resolutions.cpp +++ /dev/null @@ -1,23 +0,0 @@ -#include -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -void delete_hash_resolutions(const SqliteDatabase& db, int file_info_id) { - SqliteStmt stmt(db, "DELETE FROM hash_resolutions WHERE file_info_id = ?;"); - - stmt.bind_int(1, file_info_id); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw 
IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to delete hash resolutions: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_metadata_lines.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_metadata_lines.cpp deleted file mode 100644 index 8972a08f..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/delete_metadata_lines.cpp +++ /dev/null @@ -1,24 +0,0 @@ -#include -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -void delete_metadata_lines(const SqliteDatabase& db, int file_info_id) { - SqliteStmt stmt(db, - "DELETE FROM checkpoint_metadata_lines " - "WHERE file_info_id = ?;"); - stmt.bind_int(1, file_info_id); - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to delete metadata lines: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -} // namespace - // dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_chunk_bloom_filter.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_chunk_bloom_filter.cpp deleted file mode 100644 index c389141d..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_chunk_bloom_filter.cpp +++ /dev/null @@ -1,75 +0,0 @@ -#include -#include -#include - -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -void insert_chunk_bloom_filter(const SqliteDatabase& db, int file_info_id, - std::uint64_t checkpoint_idx, - 
std::string_view dimension, - const void* blob_data, int blob_size, - std::uint64_t num_entries) { - SqliteStmt stmt( - db, - "INSERT OR REPLACE INTO chunk_bloom_filters" - "(file_info_id, checkpoint_idx, dimension, bloom_data, num_entries) " - "VALUES(?, ?, ?, ?, ?);"); - - stmt.bind_int(1, file_info_id); - stmt.bind_int64(2, static_cast(checkpoint_idx)); - stmt.bind_text(3, dimension); - stmt.bind_blob(4, blob_data, blob_size); - stmt.bind_int64(5, static_cast(num_entries)); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert chunk bloom filter: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -void insert_chunk_bloom_filter(const SqliteDatabase& db, int file_info_id, - std::uint64_t checkpoint_idx, - std::string_view dimension, - std::span blob_data, - std::uint64_t num_entries) { - insert_chunk_bloom_filter(db, file_info_id, checkpoint_idx, dimension, - blob_data.data(), - static_cast(blob_data.size()), num_entries); -} - -SqliteStmt prepare_insert_chunk_bloom_filter(const SqliteDatabase& db) { - return SqliteStmt( - db, - "INSERT OR REPLACE INTO chunk_bloom_filters" - "(file_info_id, checkpoint_idx, dimension, bloom_data, num_entries) " - "VALUES(?, ?, ?, ?, ?);"); -} - -void insert_chunk_bloom_filter(SqliteStmt& stmt, int file_info_id, - std::uint64_t checkpoint_idx, - std::string_view dimension, - const void* blob_data, int blob_size, - std::uint64_t num_entries) { - stmt.reset(); - stmt.bind_int(1, file_info_id); - stmt.bind_int64(2, static_cast(checkpoint_idx)); - stmt.bind_text_static(3, dimension); - stmt.bind_blob_static(4, blob_data, blob_size); - stmt.bind_int64(5, static_cast(num_entries)); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert chunk bloom filter"); - } -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git 
a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_chunk_dimension_stats.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_chunk_dimension_stats.cpp deleted file mode 100644 index 453612e6..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_chunk_dimension_stats.cpp +++ /dev/null @@ -1,84 +0,0 @@ -#include -#include -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -void insert_chunk_dimension_stats(const SqliteDatabase& db, int file_info_id, - std::uint64_t checkpoint_idx, - const ChunkDimensionStats& stats, - std::size_t value_counts_cap) { - SqliteStmt stmt(db, - "INSERT OR REPLACE INTO chunk_dimension_stats" - "(file_info_id, checkpoint_idx, dimension, distinct_count, " - "value_counts, min_value, max_value, value_type) " - "VALUES(?, ?, ?, ?, ?, ?, ?, ?);"); - - stmt.bind_int(1, file_info_id); - stmt.bind_int64(2, static_cast(checkpoint_idx)); - stmt.bind_text_static(3, stats.dimension); - stmt.bind_int64(4, static_cast(stats.distinct_count)); - - auto compressed = stats.compress_value_counts(value_counts_cap); - if (compressed) { - stmt.bind_blob_static(5, compressed->data(), - static_cast(compressed->size())); - } else { - stmt.bind_null(5); - } - - stmt.bind_text_static(6, stats.min_value); - stmt.bind_text_static(7, stats.max_value); - stmt.bind_text_static(8, stats.value_type); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert chunk dimension stats: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -SqliteStmt prepare_insert_chunk_dimension_stats(const SqliteDatabase& db) { - return SqliteStmt( - db, - "INSERT OR REPLACE INTO chunk_dimension_stats" - "(file_info_id, checkpoint_idx, dimension, distinct_count, " - "value_counts, min_value, max_value, 
value_type) " - "VALUES(?, ?, ?, ?, ?, ?, ?, ?);"); -} - -void insert_chunk_dimension_stats(SqliteStmt& stmt, int file_info_id, - std::uint64_t checkpoint_idx, - const ChunkDimensionStats& stats, - std::size_t value_counts_cap) { - stmt.reset(); - stmt.bind_int(1, file_info_id); - stmt.bind_int64(2, static_cast(checkpoint_idx)); - stmt.bind_text_static(3, stats.dimension); - stmt.bind_int64(4, static_cast(stats.distinct_count)); - - auto compressed = stats.compress_value_counts(value_counts_cap); - if (compressed) { - stmt.bind_blob_static(5, compressed->data(), - static_cast(compressed->size())); - } else { - stmt.bind_null(5); - } - - stmt.bind_text_static(6, stats.min_value); - stmt.bind_text_static(7, stats.max_value); - stmt.bind_text_static(8, stats.value_type); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert chunk dimension stats"); - } -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_chunk_statistics.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_chunk_statistics.cpp deleted file mode 100644 index ddca1618..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_chunk_statistics.cpp +++ /dev/null @@ -1,86 +0,0 @@ -#include -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -void insert_chunk_statistics(const SqliteDatabase& db, int file_info_id, - std::uint64_t checkpoint_idx, - const ChunkStatistics& stats) { - SqliteStmt stmt( - db, - "INSERT OR REPLACE INTO chunk_statistics" - "(file_info_id, checkpoint_idx, total_events, " - "min_timestamp_us, max_timestamp_us, " - "duration_sum_us, duration_min_us, duration_max_us, duration_count, " - "duration_m2, 
duration_sketch, duration_histogram, " - "name_duration_sketches, name_duration_histograms, " - "name_duration_sums, name_duration_sum_sqs, name_category) " - "VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"); - - stmt.bind_int(1, file_info_id); - stmt.bind_int64(2, static_cast(checkpoint_idx)); - stmt.bind_int64(3, static_cast(stats.total_events)); - - if (stats.min_timestamp_us != std::numeric_limits::max()) { - stmt.bind_int64(4, static_cast(stats.min_timestamp_us)); - } else { - stmt.bind_null(4); - } - - if (stats.max_timestamp_us != 0) { - stmt.bind_int64(5, static_cast(stats.max_timestamp_us)); - } else { - stmt.bind_null(5); - } - - stmt.bind_int64(6, stats.duration_sum_us); - - if (stats.duration_min_us != std::numeric_limits::max()) { - stmt.bind_int64(7, static_cast(stats.duration_min_us)); - } else { - stmt.bind_null(7); - } - - if (stats.duration_max_us != 0) { - stmt.bind_int64(8, static_cast(stats.duration_max_us)); - } else { - stmt.bind_null(8); - } - - stmt.bind_int64(9, static_cast(stats.duration_count)); - stmt.bind_double(10, stats.duration_m2); - - if (!stats.duration_sketch.empty()) { - auto blob = stats.duration_sketch.serialize(); - stmt.bind_blob(11, blob.data(), static_cast(blob.size())); - } else { - stmt.bind_null(11); - } - - stmt.bind_text(12, stats.duration_histogram.to_json()); - - if (!stats.name_duration_sketches.empty()) { - auto blob = stats.serialize_name_duration_sketches(); - stmt.bind_blob(13, blob.data(), static_cast(blob.size())); - } else { - stmt.bind_null(13); - } - - stmt.bind_text(14, stats.name_duration_histograms_json()); - stmt.bind_text(15, stats.name_duration_sums_json()); - stmt.bind_text(16, stats.name_duration_sum_sqs_json()); - stmt.bind_text(17, stats.name_category_json()); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert chunk statistics: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -} // 
namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_event_range.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_event_range.cpp deleted file mode 100644 index f44fcf70..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_event_range.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -std::vector pack_line_numbers( - const std::vector& lines) { - std::vector blob(lines.size() * sizeof(std::uint32_t)); - std::memcpy(blob.data(), lines.data(), blob.size()); - return blob; -} - -std::vector unpack_line_numbers(const unsigned char* data, - std::size_t size) { - std::size_t count = size / sizeof(std::uint32_t); - std::vector lines(count); - std::memcpy(lines.data(), data, size); - return lines; -} - -void insert_event_range(const SqliteDatabase& db, int file_info_id, - std::uint64_t checkpoint_idx, std::string_view cat, - std::string_view name, - const std::vector& line_numbers) { - auto blob = pack_line_numbers(line_numbers); - - SqliteStmt stmt(db, - "INSERT OR REPLACE INTO checkpoint_event_ranges" - "(checkpoint_idx, file_info_id, cat, name, " - "line_numbers, event_count) " - "VALUES(?, ?, ?, ?, ?, ?);"); - - stmt.bind_int64(1, static_cast(checkpoint_idx)); - stmt.bind_int(2, file_info_id); - stmt.bind_text(3, cat); - stmt.bind_text(4, name); - stmt.bind_blob(5, blob.data(), static_cast(blob.size())); - stmt.bind_int64(6, static_cast(line_numbers.size())); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert event range: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -void insert_event_range(const 
SqliteDatabase& db, int file_info_id, - std::uint64_t checkpoint_idx, std::string_view cat, - std::string_view name, - std::span line_numbers) { - std::vector vec(line_numbers.begin(), line_numbers.end()); - insert_event_range(db, file_info_id, checkpoint_idx, cat, name, vec); -} - -} // namespace - // dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_file_bloom_filter.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_file_bloom_filter.cpp deleted file mode 100644 index cf7c0ff8..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_file_bloom_filter.cpp +++ /dev/null @@ -1,42 +0,0 @@ -#include -#include -#include - -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -void insert_file_bloom_filter(const SqliteDatabase& db, int file_info_id, - std::string_view dimension, const void* blob_data, - int blob_size, std::uint64_t num_entries) { - SqliteStmt stmt(db, - "INSERT OR REPLACE INTO file_bloom_filters" - "(file_info_id, dimension, bloom_data, num_entries) " - "VALUES(?, ?, ?, ?);"); - - stmt.bind_int(1, file_info_id); - stmt.bind_text(2, dimension); - stmt.bind_blob(3, blob_data, blob_size); - stmt.bind_int64(4, static_cast(num_entries)); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert file bloom filter: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -void insert_file_bloom_filter(const SqliteDatabase& db, int file_info_id, - std::string_view dimension, - std::span blob_data, - std::uint64_t num_entries) { - insert_file_bloom_filter(db, file_info_id, dimension, blob_data.data(), - static_cast(blob_data.size()), num_entries); -} - -} // namespace 
dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_hash_resolution.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_hash_resolution.cpp deleted file mode 100644 index f96fdb99..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_hash_resolution.cpp +++ /dev/null @@ -1,58 +0,0 @@ -#include -#include -#include - -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -void insert_hash_resolution(const SqliteDatabase& db, int file_info_id, - std::string_view dimension, - std::string_view hash_value, - std::string_view resolved_value) { - SqliteStmt stmt(db, - "INSERT OR IGNORE INTO hash_resolutions" - "(file_info_id, dimension, hash_value, resolved_value) " - "VALUES(?, ?, ?, ?);"); - - stmt.bind_int(1, file_info_id); - stmt.bind_text(2, dimension); - stmt.bind_text(3, hash_value); - stmt.bind_text(4, resolved_value); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert hash resolution: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -SqliteStmt prepare_insert_hash_resolution(const SqliteDatabase& db) { - return SqliteStmt(db, - "INSERT OR IGNORE INTO hash_resolutions" - "(file_info_id, dimension, hash_value, resolved_value) " - "VALUES(?, ?, ?, ?);"); -} - -void insert_hash_resolution(SqliteStmt& stmt, int file_info_id, - std::string_view dimension, - std::string_view hash_value, - std::string_view resolved_value) { - stmt.reset(); - stmt.bind_int(1, file_info_id); - stmt.bind_text_static(2, dimension); - stmt.bind_text_static(3, hash_value); - stmt.bind_text_static(4, resolved_value); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw 
IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert hash resolution"); - } -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_index_dimension.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_index_dimension.cpp deleted file mode 100644 index 48801f10..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_index_dimension.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include -#include -#include - -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -void insert_index_dimension(const SqliteDatabase& db, int file_info_id, - std::string_view dimension) { - SqliteStmt stmt(db, - "INSERT OR IGNORE INTO index_dimensions" - "(file_info_id, dimension) VALUES(?, ?);"); - - stmt.bind_int(1, file_info_id); - stmt.bind_text(2, dimension); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert index dimension: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_metadata_lines.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_metadata_lines.cpp deleted file mode 100644 index a13546d7..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_metadata_lines.cpp +++ /dev/null @@ -1,47 +0,0 @@ -#include -#include -#include - -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -void insert_metadata_lines(const SqliteDatabase& db, int file_info_id, - std::uint64_t 
checkpoint_idx, - std::string_view meta_type, - const std::vector& line_numbers) { - auto blob = pack_line_numbers(line_numbers); - - SqliteStmt stmt(db, - "INSERT OR REPLACE INTO checkpoint_metadata_lines" - "(checkpoint_idx, file_info_id, meta_type, " - "line_numbers) " - "VALUES(?, ?, ?, ?);"); - - stmt.bind_int64(1, static_cast(checkpoint_idx)); - stmt.bind_int(2, file_info_id); - stmt.bind_text(3, meta_type); - stmt.bind_blob(4, blob.data(), static_cast(blob.size())); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert metadata lines: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -void insert_metadata_lines(const SqliteDatabase& db, int file_info_id, - std::uint64_t checkpoint_idx, - std::string_view meta_type, - std::span line_numbers) { - std::vector vec(line_numbers.begin(), line_numbers.end()); - insert_metadata_lines(db, file_info_id, checkpoint_idx, meta_type, vec); -} - -} // namespace - // dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_provenance.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_provenance.cpp deleted file mode 100644 index f7e6d314..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/insert_provenance.cpp +++ /dev/null @@ -1,95 +0,0 @@ -#include -#include -#include - -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; -using indexer::internal::IndexerError; - -void insert_provenance_info(const SqliteDatabase& db, std::string_view key, - std::string_view value) { - SqliteStmt stmt(db, - "INSERT OR REPLACE INTO provenance_info(key, value) " - "VALUES(?, ?);"); - - stmt.bind_text(1, key); - stmt.bind_text(2, value); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw 
IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert provenance info: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -void insert_provenance_source(const SqliteDatabase& db, int file_info_id, - int source_idx, std::string_view path, - int num_checkpoints, - std::string_view event_hash) { - SqliteStmt stmt(db, - "INSERT OR REPLACE INTO provenance_sources" - "(source_idx, file_info_id, path, " - "num_checkpoints, event_hash) " - "VALUES(?, ?, ?, ?, ?);"); - - stmt.bind_int(1, source_idx); - stmt.bind_int(2, file_info_id); - stmt.bind_text(3, path); - stmt.bind_int(4, num_checkpoints); - stmt.bind_text(5, event_hash); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert provenance source: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -void insert_provenance_group(const SqliteDatabase& db, std::string_view name, - std::string_view predicate) { - SqliteStmt stmt(db, - "INSERT INTO provenance_group(name, predicate) " - "VALUES(?, ?);"); - - stmt.bind_text(1, name); - stmt.bind_text(2, predicate); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert provenance group: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -void insert_provenance_segment(const SqliteDatabase& db, int source_idx, - int source_checkpoint, int output_line_start, - int output_line_end, int event_count) { - SqliteStmt stmt(db, - "INSERT INTO provenance_segments" - "(source_idx, source_checkpoint, " - "output_line_start, output_line_end, " - "event_count) " - "VALUES(?, ?, ?, ?, ?);"); - - stmt.bind_int(1, source_idx); - stmt.bind_int(2, source_checkpoint); - stmt.bind_int(3, output_line_start); - stmt.bind_int(4, output_line_end); - stmt.bind_int(5, event_count); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw 
IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert provenance segment: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -} // namespace - // dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/manifest_queries.h b/src/dftracer/utils/utilities/composites/dft/indexing/queries/manifest_queries.h index b5648fdd..4c3af623 100644 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/manifest_queries.h +++ b/src/dftracer/utils/utilities/composites/dft/indexing/queries/manifest_queries.h @@ -1,49 +1,32 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_MANIFEST_QUERIES_H #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_MANIFEST_QUERIES_H -#include - #include -#include +#include #include -#include #include namespace dftracer::utils::utilities::composites::dft::indexing::queries { -using dftracer::utils::sqlite::SqliteDatabase; - // --- Packed line numbers helpers --- -std::vector pack_line_numbers( - const std::vector& lines); - -std::vector unpack_line_numbers(const unsigned char* data, - std::size_t size); - -// --- Insert operations --- - -void insert_event_range(const SqliteDatabase& db, int file_info_id, - std::uint64_t checkpoint_idx, std::string_view cat, - std::string_view name, - const std::vector& line_numbers); - -void insert_event_range(const SqliteDatabase& db, int file_info_id, - std::uint64_t checkpoint_idx, std::string_view cat, - std::string_view name, - std::span line_numbers); - -void insert_metadata_lines(const SqliteDatabase& db, int file_info_id, - std::uint64_t checkpoint_idx, - std::string_view meta_type, - const std::vector& line_numbers); - -void insert_metadata_lines(const SqliteDatabase& db, int file_info_id, - std::uint64_t checkpoint_idx, - std::string_view meta_type, - std::span line_numbers); - -// --- Query operations --- +inline std::vector pack_line_numbers( + const std::vector& lines) { + std::vector 
blob(lines.size() * sizeof(std::uint32_t)); + if (!blob.empty()) { + std::memcpy(blob.data(), lines.data(), blob.size()); + } + return blob; +} + +inline std::vector unpack_line_numbers(const unsigned char* data, + std::size_t size) { + std::vector lines(size / sizeof(std::uint32_t)); + if (!lines.empty()) { + std::memcpy(lines.data(), data, lines.size() * sizeof(std::uint32_t)); + } + return lines; +} struct EventRangeResult { std::uint64_t checkpoint_idx; @@ -53,46 +36,12 @@ struct EventRangeResult { std::uint64_t event_count; }; -std::vector query_event_ranges(const SqliteDatabase& db, - int file_info_id); - -std::vector query_event_ranges_for_checkpoint( - const SqliteDatabase& db, int file_info_id, std::uint64_t checkpoint_idx); - struct MetadataLinesResult { std::uint64_t checkpoint_idx; std::string meta_type; std::vector line_numbers; }; -std::vector query_metadata_lines(const SqliteDatabase& db, - int file_info_id); - -std::vector query_metadata_lines_for_checkpoint( - const SqliteDatabase& db, int file_info_id, std::uint64_t checkpoint_idx); - -// --- Delete operations --- - -void delete_event_ranges(const SqliteDatabase& db, int file_info_id); - -void delete_metadata_lines(const SqliteDatabase& db, int file_info_id); - -// --- Provenance operations --- - -void insert_provenance_info(const SqliteDatabase& db, std::string_view key, - std::string_view value); - -void insert_provenance_source(const SqliteDatabase& db, int file_info_id, - int source_idx, std::string_view path, - int num_checkpoints, std::string_view event_hash); - -void insert_provenance_group(const SqliteDatabase& db, std::string_view name, - std::string_view predicate); - -void insert_provenance_segment(const SqliteDatabase& db, int source_idx, - int source_checkpoint, int output_line_start, - int output_line_end, int event_count); - struct ProvenanceSource { int source_idx; std::string path; @@ -100,9 +49,6 @@ struct ProvenanceSource { std::string event_hash; }; -std::vector 
query_provenance_sources(const SqliteDatabase& db, - int file_info_id); - struct ProvenanceSegment { int source_idx; int source_checkpoint; @@ -111,19 +57,6 @@ struct ProvenanceSegment { int event_count; }; -std::vector query_provenance_segments( - const SqliteDatabase& db, int source_idx); - -std::vector query_all_provenance_segments( - const SqliteDatabase& db); - -std::string query_provenance_info(const SqliteDatabase& db, - std::string_view key); - -std::string query_provenance_group_name(const SqliteDatabase& db); - -std::string query_provenance_group_predicate(const SqliteDatabase& db); - } // namespace // dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/queries.h b/src/dftracer/utils/utilities/composites/dft/indexing/queries/queries.h index 09dcd6fc..2e392a40 100644 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/queries.h +++ b/src/dftracer/utils/utilities/composites/dft/indexing/queries/queries.h @@ -1,168 +1,38 @@ #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_QUERIES_H #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_QUERIES_H -#include -#include #include #include #include #include -#include -#include #include -#include -#include #include namespace dftracer::utils::utilities::composites::dft::indexing::queries { -using dftracer::utils::sqlite::SqliteDatabase; -using dftracer::utils::sqlite::SqliteStmt; - -// --- Insert operations --- - -void insert_chunk_bloom_filter(const SqliteDatabase& db, int file_info_id, - std::uint64_t checkpoint_idx, - std::string_view dimension, - const void* blob_data, int blob_size, - std::uint64_t num_entries); - -void insert_chunk_bloom_filter(const SqliteDatabase& db, int file_info_id, - std::uint64_t checkpoint_idx, - std::string_view dimension, - std::span blob_data, - std::uint64_t num_entries); - -void insert_file_bloom_filter(const SqliteDatabase& db, int file_info_id, - std::string_view 
dimension, const void* blob_data, - int blob_size, std::uint64_t num_entries); - -void insert_file_bloom_filter(const SqliteDatabase& db, int file_info_id, - std::string_view dimension, - std::span blob_data, - std::uint64_t num_entries); - -void insert_chunk_statistics(const SqliteDatabase& db, int file_info_id, - std::uint64_t checkpoint_idx, - const ChunkStatistics& stats); - -void insert_index_dimension(const SqliteDatabase& db, int file_info_id, - std::string_view dimension); - -void insert_hash_resolution(const SqliteDatabase& db, int file_info_id, - std::string_view dimension, - std::string_view hash_value, - std::string_view resolved_value); - -SqliteStmt prepare_insert_chunk_bloom_filter(const SqliteDatabase& db); -void insert_chunk_bloom_filter(SqliteStmt& stmt, int file_info_id, - std::uint64_t checkpoint_idx, - std::string_view dimension, - const void* blob_data, int blob_size, - std::uint64_t num_entries); - -SqliteStmt prepare_insert_chunk_dimension_stats(const SqliteDatabase& db); -void insert_chunk_dimension_stats(SqliteStmt& stmt, int file_info_id, - std::uint64_t checkpoint_idx, - const ChunkDimensionStats& stats, - std::size_t value_counts_cap); - -SqliteStmt prepare_insert_hash_resolution(const SqliteDatabase& db); -void insert_hash_resolution(SqliteStmt& stmt, int file_info_id, - std::string_view dimension, - std::string_view hash_value, - std::string_view resolved_value); - -// --- Query operations --- - struct ChunkBloomResult { std::uint64_t checkpoint_idx; std::vector bloom_data; std::uint64_t num_entries; }; -std::vector query_chunk_bloom_filters( - const SqliteDatabase& db, int file_info_id, std::string_view dimension); - -/// Fetch chunk bloom filters for ALL specified dimensions in one query. 
-std::unordered_map> -query_chunk_bloom_filters_batch(const SqliteDatabase& db, int file_info_id, - const std::vector& dimensions); - struct FileBloomResult { std::vector bloom_data; std::uint64_t num_entries; }; -std::optional query_file_bloom_filter( - const SqliteDatabase& db, int file_info_id, std::string_view dimension); - -/// Fetch file-level bloom filters for ALL specified dimensions in one query. -std::unordered_map query_file_bloom_filters_batch( - const SqliteDatabase& db, int file_info_id, - const std::vector& dimensions); - -std::vector query_index_dimensions(const SqliteDatabase& db, - int file_info_id); - -bool has_index_dimension(const SqliteDatabase& db, int file_info_id, - std::string_view dimension); - struct ChunkStatisticsResult { std::uint64_t checkpoint_idx; ChunkStatistics stats; }; -std::vector query_chunk_statistics( - const SqliteDatabase& db, int file_info_id); - struct TimeBounds { std::uint64_t min_timestamp_us = std::numeric_limits::max(); std::uint64_t max_timestamp_us = 0; bool valid = false; }; -/// Fast aggregate query: single-row SELECT MIN/MAX on chunk_statistics. 
-TimeBounds query_time_bounds(const SqliteDatabase& db, int file_info_id); - -std::vector query_hash_by_resolved( - const SqliteDatabase& db, std::string_view dimension, - std::string_view resolved_value); - -std::optional query_resolved_by_hash(const SqliteDatabase& db, - std::string_view dimension, - std::string_view hash_value); - -// --- Chunk dimension stats --- - -void insert_chunk_dimension_stats(const SqliteDatabase& db, int file_info_id, - std::uint64_t checkpoint_idx, - const ChunkDimensionStats& stats, - std::size_t value_counts_cap = 4096); - -std::vector query_chunk_dimension_stats( - const SqliteDatabase& db, int file_info_id); - -std::vector -query_chunk_dimension_stats_for_dimension(const SqliteDatabase& db, - int file_info_id, - std::string_view dimension); - -void delete_chunk_dimension_stats(const SqliteDatabase& db, int file_info_id); - -// --- Delete operations --- - -void delete_chunk_bloom_filters(const SqliteDatabase& db, int file_info_id, - std::string_view dimension); - -void delete_file_bloom_filter(const SqliteDatabase& db, int file_info_id, - std::string_view dimension); - -void delete_chunk_statistics(const SqliteDatabase& db, int file_info_id); - -void delete_hash_resolutions(const SqliteDatabase& db, int file_info_id); - } // namespace dftracer::utils::utilities::composites::dft::indexing::queries #endif // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_QUERIES_H diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_bloom_filters.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_bloom_filters.cpp deleted file mode 100644 index 7c940afb..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_bloom_filters.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include -#include -#include - -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; - -std::vector 
query_chunk_bloom_filters( - const SqliteDatabase& db, int file_info_id, std::string_view dimension) { - SqliteStmt stmt(db, - "SELECT checkpoint_idx, bloom_data, num_entries " - "FROM chunk_bloom_filters " - "WHERE file_info_id = ? AND dimension = ? " - "ORDER BY checkpoint_idx;"); - - stmt.bind_int(1, file_info_id); - stmt.bind_text(2, dimension); - - std::vector results; - while (sqlite3_step(stmt) == SQLITE_ROW) { - ChunkBloomResult r; - r.checkpoint_idx = - static_cast(sqlite3_column_int64(stmt, 0)); - - const void* blob = sqlite3_column_blob(stmt, 1); - int blob_size = sqlite3_column_bytes(stmt, 1); - if (blob && blob_size > 0) { - r.bloom_data.resize(static_cast(blob_size)); - std::memcpy(r.bloom_data.data(), blob, - static_cast(blob_size)); - } - - r.num_entries = - static_cast(sqlite3_column_int64(stmt, 2)); - results.push_back(std::move(r)); - } - - return results; -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_bloom_filters_batch.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_bloom_filters_batch.cpp deleted file mode 100644 index d8ef11ed..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_bloom_filters_batch.cpp +++ /dev/null @@ -1,58 +0,0 @@ -#include -#include -#include - -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; - -std::unordered_map> -query_chunk_bloom_filters_batch(const SqliteDatabase& db, int file_info_id, - const std::vector& dimensions) { - std::unordered_map> results; - if (dimensions.empty()) return results; - - std::string sql = - "SELECT dimension, checkpoint_idx, bloom_data, num_entries " - "FROM chunk_bloom_filters " - "WHERE file_info_id = ? 
AND dimension IN ("; - for (std::size_t i = 0; i < dimensions.size(); ++i) { - if (i > 0) sql += ','; - sql += '?'; - } - sql += ") ORDER BY dimension, checkpoint_idx;"; - - SqliteStmt stmt(db, sql.c_str()); - stmt.bind_int(1, file_info_id); - for (std::size_t i = 0; i < dimensions.size(); ++i) { - stmt.bind_text(static_cast(i + 2), dimensions[i]); - } - - while (sqlite3_step(stmt) == SQLITE_ROW) { - const char* dim_text = - reinterpret_cast(sqlite3_column_text(stmt, 0)); - std::string dim(dim_text ? dim_text : ""); - - ChunkBloomResult r; - r.checkpoint_idx = - static_cast(sqlite3_column_int64(stmt, 1)); - - const void* blob = sqlite3_column_blob(stmt, 2); - int blob_size = sqlite3_column_bytes(stmt, 2); - if (blob && blob_size > 0) { - r.bloom_data.resize(static_cast(blob_size)); - std::memcpy(r.bloom_data.data(), blob, - static_cast(blob_size)); - } - - r.num_entries = - static_cast(sqlite3_column_int64(stmt, 3)); - results[dim].push_back(std::move(r)); - } - - return results; -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_dimension_stats.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_dimension_stats.cpp deleted file mode 100644 index 410a3835..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_dimension_stats.cpp +++ /dev/null @@ -1,119 +0,0 @@ -#include -#include -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; - -std::vector query_chunk_dimension_stats( - const SqliteDatabase& db, int file_info_id) { - SqliteStmt stmt(db, - "SELECT checkpoint_idx, dimension, distinct_count, " - "min_value, max_value, value_type, value_counts " - "FROM chunk_dimension_stats WHERE file_info_id = ? 
" - "ORDER BY checkpoint_idx, dimension;"); - - stmt.bind_int(1, file_info_id); - - std::vector results; - while (sqlite3_step(stmt) == SQLITE_ROW) { - ChunkDimensionStatsResult r; - r.checkpoint_idx = - static_cast(sqlite3_column_int64(stmt, 0)); - - const char* dim = - reinterpret_cast(sqlite3_column_text(stmt, 1)); - r.dimension = dim ? dim : ""; - - r.distinct_count = - static_cast(sqlite3_column_int64(stmt, 2)); - - const char* min_val = - reinterpret_cast(sqlite3_column_text(stmt, 3)); - r.min_value = min_val ? min_val : ""; - - const char* max_val = - reinterpret_cast(sqlite3_column_text(stmt, 4)); - r.max_value = max_val ? max_val : ""; - - const char* vtype = - reinterpret_cast(sqlite3_column_text(stmt, 5)); - r.value_type = vtype ? vtype : "string"; - - // value_counts BLOB (compressed, may be NULL) - if (sqlite3_column_type(stmt, 6) != SQLITE_NULL) { - auto* blob = - static_cast(sqlite3_column_blob(stmt, 6)); - auto blob_len = - static_cast(sqlite3_column_bytes(stmt, 6)); - if (blob && blob_len > 0) { - r.value_counts = ChunkDimensionStats::decompress_value_counts( - blob, blob_len); - } - } - - results.push_back(std::move(r)); - } - - return results; -} - -std::vector -query_chunk_dimension_stats_for_dimension(const SqliteDatabase& db, - int file_info_id, - std::string_view dimension) { - SqliteStmt stmt(db, - "SELECT checkpoint_idx, dimension, distinct_count, " - "min_value, max_value, value_type, value_counts " - "FROM chunk_dimension_stats " - "WHERE file_info_id = ? AND dimension = ? " - "ORDER BY checkpoint_idx;"); - - stmt.bind_int(1, file_info_id); - stmt.bind_text(2, dimension); - - std::vector results; - while (sqlite3_step(stmt) == SQLITE_ROW) { - ChunkDimensionStatsResult r; - r.checkpoint_idx = - static_cast(sqlite3_column_int64(stmt, 0)); - - const char* dim = - reinterpret_cast(sqlite3_column_text(stmt, 1)); - r.dimension = dim ? 
dim : ""; - - r.distinct_count = - static_cast(sqlite3_column_int64(stmt, 2)); - - const char* min_val = - reinterpret_cast(sqlite3_column_text(stmt, 3)); - r.min_value = min_val ? min_val : ""; - - const char* max_val = - reinterpret_cast(sqlite3_column_text(stmt, 4)); - r.max_value = max_val ? max_val : ""; - - const char* vtype = - reinterpret_cast(sqlite3_column_text(stmt, 5)); - r.value_type = vtype ? vtype : "string"; - - if (sqlite3_column_type(stmt, 6) != SQLITE_NULL) { - auto* blob = - static_cast(sqlite3_column_blob(stmt, 6)); - auto blob_len = - static_cast(sqlite3_column_bytes(stmt, 6)); - if (blob && blob_len > 0) { - r.value_counts = ChunkDimensionStats::decompress_value_counts( - blob, blob_len); - } - } - - results.push_back(std::move(r)); - } - - return results; -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_statistics.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_statistics.cpp deleted file mode 100644 index 263f6684..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_chunk_statistics.cpp +++ /dev/null @@ -1,142 +0,0 @@ -#include -#include -#include - -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; - -std::vector query_chunk_statistics( - const SqliteDatabase& db, int file_info_id) { - SqliteStmt stmt( - db, - "SELECT checkpoint_idx, total_events, " - "min_timestamp_us, max_timestamp_us, " - "duration_sum_us, duration_min_us, duration_max_us, duration_count, " - "duration_m2, duration_sketch, duration_histogram, " - "name_duration_sketches, name_duration_histograms, " - "name_duration_sums, name_duration_sum_sqs, name_category " - "FROM chunk_statistics WHERE file_info_id = ? 
" - "ORDER BY checkpoint_idx;"); - - stmt.bind_int(1, file_info_id); - - std::vector results; - while (sqlite3_step(stmt) == SQLITE_ROW) { - ChunkStatisticsResult r; - r.checkpoint_idx = - static_cast(sqlite3_column_int64(stmt, 0)); - - r.stats.total_events = - static_cast(sqlite3_column_int64(stmt, 1)); - - // Timestamps (may be NULL) - if (sqlite3_column_type(stmt, 2) != SQLITE_NULL) { - r.stats.min_timestamp_us = - static_cast(sqlite3_column_int64(stmt, 2)); - } else { - r.stats.min_timestamp_us = - std::numeric_limits::max(); - } - - if (sqlite3_column_type(stmt, 3) != SQLITE_NULL) { - r.stats.max_timestamp_us = - static_cast(sqlite3_column_int64(stmt, 3)); - } else { - r.stats.max_timestamp_us = 0; - } - - r.stats.duration_sum_us = sqlite3_column_int64(stmt, 4); - - if (sqlite3_column_type(stmt, 5) != SQLITE_NULL) { - r.stats.duration_min_us = - static_cast(sqlite3_column_int64(stmt, 5)); - } else { - r.stats.duration_min_us = std::numeric_limits::max(); - } - - if (sqlite3_column_type(stmt, 6) != SQLITE_NULL) { - r.stats.duration_max_us = - static_cast(sqlite3_column_int64(stmt, 6)); - } else { - r.stats.duration_max_us = 0; - } - - r.stats.duration_count = - static_cast(sqlite3_column_int64(stmt, 7)); - r.stats.duration_m2 = sqlite3_column_double(stmt, 8); - - // duration_sketch BLOB (column 9) - if (sqlite3_column_type(stmt, 9) != SQLITE_NULL) { - auto* blob = - static_cast(sqlite3_column_blob(stmt, 9)); - auto blob_len = - static_cast(sqlite3_column_bytes(stmt, 9)); - if (blob && blob_len > 0) { - using dftracer::utils::utilities::common::statistics::DDSketch; - r.stats.duration_sketch = DDSketch::deserialize(blob, blob_len); - } - } - - // duration_histogram TEXT (column 10) - const char* dh_text = - reinterpret_cast(sqlite3_column_text(stmt, 10)); - if (dh_text) { - using dftracer::utils::utilities::common::statistics::Log2Histogram; - r.stats.duration_histogram = Log2Histogram::from_json(dh_text); - } - - // name_duration_sketches BLOB (column 11) - if 
(sqlite3_column_type(stmt, 11) != SQLITE_NULL) { - auto* blob = - static_cast(sqlite3_column_blob(stmt, 11)); - auto blob_len = - static_cast(sqlite3_column_bytes(stmt, 11)); - if (blob && blob_len > 0) { - r.stats.name_duration_sketches = - ChunkStatistics::deserialize_name_duration_sketches( - blob, blob_len); - } - } - - // name_duration_histograms TEXT (column 12) - const char* ndh_text = - reinterpret_cast(sqlite3_column_text(stmt, 12)); - if (ndh_text) { - r.stats.name_duration_histograms = - ChunkStatistics::parse_histogram_map_json(ndh_text); - } - - // name_duration_sums TEXT (column 13) - const char* nds_text = - reinterpret_cast(sqlite3_column_text(stmt, 13)); - if (nds_text) { - r.stats.name_duration_sums = - ChunkStatistics::parse_double_map_json(nds_text); - } - - // name_duration_sum_sqs TEXT (column 14) - const char* ndss_text = - reinterpret_cast(sqlite3_column_text(stmt, 14)); - if (ndss_text) { - r.stats.name_duration_sum_sqs = - ChunkStatistics::parse_double_map_json(ndss_text); - } - - // name_category TEXT (column 15) - const char* nc_text = - reinterpret_cast(sqlite3_column_text(stmt, 15)); - if (nc_text) { - r.stats.name_category = - ChunkStatistics::parse_string_map_json(nc_text); - } - - results.push_back(std::move(r)); - } - - return results; -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_event_ranges.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_event_ranges.cpp deleted file mode 100644 index b108758e..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_event_ranges.cpp +++ /dev/null @@ -1,72 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; - -std::vector query_event_ranges(const SqliteDatabase& db, - int file_info_id) { - SqliteStmt stmt(db, - "SELECT 
checkpoint_idx, cat, name, line_numbers, " - "event_count " - "FROM checkpoint_event_ranges " - "WHERE file_info_id = ? " - "ORDER BY checkpoint_idx, cat, name;"); - stmt.bind_int(1, file_info_id); - - std::vector results; - while (sqlite3_step(stmt) == SQLITE_ROW) { - EventRangeResult r; - r.checkpoint_idx = - static_cast(sqlite3_column_int64(stmt, 0)); - r.cat = reinterpret_cast(sqlite3_column_text(stmt, 1)); - r.name = reinterpret_cast(sqlite3_column_text(stmt, 2)); - - const auto* blob_data = - static_cast(sqlite3_column_blob(stmt, 3)); - int blob_size = sqlite3_column_bytes(stmt, 3); - r.line_numbers = - unpack_line_numbers(blob_data, static_cast(blob_size)); - - r.event_count = - static_cast(sqlite3_column_int64(stmt, 4)); - results.push_back(std::move(r)); - } - return results; -} - -std::vector query_event_ranges_for_checkpoint( - const SqliteDatabase& db, int file_info_id, std::uint64_t checkpoint_idx) { - SqliteStmt stmt(db, - "SELECT checkpoint_idx, cat, name, line_numbers, " - "event_count " - "FROM checkpoint_event_ranges " - "WHERE file_info_id = ? AND checkpoint_idx = ? 
" - "ORDER BY cat, name;"); - stmt.bind_int(1, file_info_id); - stmt.bind_int64(2, static_cast(checkpoint_idx)); - - std::vector results; - while (sqlite3_step(stmt) == SQLITE_ROW) { - EventRangeResult r; - r.checkpoint_idx = - static_cast(sqlite3_column_int64(stmt, 0)); - r.cat = reinterpret_cast(sqlite3_column_text(stmt, 1)); - r.name = reinterpret_cast(sqlite3_column_text(stmt, 2)); - - const auto* blob_data = - static_cast(sqlite3_column_blob(stmt, 3)); - int blob_size = sqlite3_column_bytes(stmt, 3); - r.line_numbers = - unpack_line_numbers(blob_data, static_cast(blob_size)); - - r.event_count = - static_cast(sqlite3_column_int64(stmt, 4)); - results.push_back(std::move(r)); - } - return results; -} - -} // namespace - // dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_file_bloom_filter.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_file_bloom_filter.cpp deleted file mode 100644 index bb12e59d..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_file_bloom_filter.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include -#include -#include - -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; - -std::optional query_file_bloom_filter( - const SqliteDatabase& db, int file_info_id, std::string_view dimension) { - SqliteStmt stmt(db, - "SELECT bloom_data, num_entries " - "FROM file_bloom_filters " - "WHERE file_info_id = ? 
AND dimension = ?;"); - - stmt.bind_int(1, file_info_id); - stmt.bind_text(2, dimension); - - int rc = sqlite3_step(stmt); - if (rc == SQLITE_ROW) { - FileBloomResult r; - const void* blob = sqlite3_column_blob(stmt, 0); - int blob_size = sqlite3_column_bytes(stmt, 0); - if (blob && blob_size > 0) { - r.bloom_data.resize(static_cast(blob_size)); - std::memcpy(r.bloom_data.data(), blob, - static_cast(blob_size)); - } - r.num_entries = - static_cast(sqlite3_column_int64(stmt, 1)); - return r; - } - - return std::nullopt; -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_file_bloom_filters_batch.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_file_bloom_filters_batch.cpp deleted file mode 100644 index e18c2558..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_file_bloom_filters_batch.cpp +++ /dev/null @@ -1,55 +0,0 @@ -#include -#include -#include - -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; - -std::unordered_map query_file_bloom_filters_batch( - const SqliteDatabase& db, int file_info_id, - const std::vector& dimensions) { - std::unordered_map results; - if (dimensions.empty()) return results; - - std::string sql = - "SELECT dimension, bloom_data, num_entries " - "FROM file_bloom_filters " - "WHERE file_info_id = ? AND dimension IN ("; - for (std::size_t i = 0; i < dimensions.size(); ++i) { - if (i > 0) sql += ','; - sql += '?'; - } - sql += ");"; - - SqliteStmt stmt(db, sql.c_str()); - stmt.bind_int(1, file_info_id); - for (std::size_t i = 0; i < dimensions.size(); ++i) { - stmt.bind_text(static_cast(i + 2), dimensions[i]); - } - - while (sqlite3_step(stmt) == SQLITE_ROW) { - const char* dim_text = - reinterpret_cast(sqlite3_column_text(stmt, 0)); - std::string dim(dim_text ? 
dim_text : ""); - - FileBloomResult r; - const void* blob = sqlite3_column_blob(stmt, 1); - int blob_size = sqlite3_column_bytes(stmt, 1); - if (blob && blob_size > 0) { - r.bloom_data.resize(static_cast(blob_size)); - std::memcpy(r.bloom_data.data(), blob, - static_cast(blob_size)); - } - - r.num_entries = - static_cast(sqlite3_column_int64(stmt, 2)); - results[dim] = std::move(r); - } - - return results; -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_hash_by_resolved.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_hash_by_resolved.cpp deleted file mode 100644 index d16f8088..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_hash_by_resolved.cpp +++ /dev/null @@ -1,32 +0,0 @@ -#include -#include - -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; - -std::vector query_hash_by_resolved( - const SqliteDatabase& db, std::string_view dimension, - std::string_view resolved_value) { - SqliteStmt stmt(db, - "SELECT DISTINCT hash_value FROM hash_resolutions " - "WHERE dimension = ? 
AND resolved_value = ?;"); - - stmt.bind_text(1, dimension); - stmt.bind_text(2, resolved_value); - - std::vector results; - while (sqlite3_step(stmt) == SQLITE_ROW) { - const char* text = - reinterpret_cast(sqlite3_column_text(stmt, 0)); - if (text) { - results.emplace_back(text); - } - } - - return results; -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_index_dimensions.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_index_dimensions.cpp deleted file mode 100644 index afb55d1d..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_index_dimensions.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#include -#include - -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; - -std::vector query_index_dimensions(const SqliteDatabase& db, - int file_info_id) { - SqliteStmt stmt( - db, "SELECT dimension FROM index_dimensions WHERE file_info_id = ?;"); - - stmt.bind_int(1, file_info_id); - - std::vector results; - while (sqlite3_step(stmt) == SQLITE_ROW) { - const char* text = - reinterpret_cast(sqlite3_column_text(stmt, 0)); - if (text) { - results.emplace_back(text); - } - } - - return results; -} - -bool has_index_dimension(const SqliteDatabase& db, int file_info_id, - std::string_view dimension) { - SqliteStmt stmt(db, - "SELECT 1 FROM index_dimensions " - "WHERE file_info_id = ? AND dimension = ? 
LIMIT 1;"); - - stmt.bind_int(1, file_info_id); - stmt.bind_text(2, dimension); - - return sqlite3_step(stmt) == SQLITE_ROW; -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_metadata_lines.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_metadata_lines.cpp deleted file mode 100644 index b9b8f325..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_metadata_lines.cpp +++ /dev/null @@ -1,66 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; - -std::vector query_metadata_lines(const SqliteDatabase& db, - int file_info_id) { - SqliteStmt stmt(db, - "SELECT checkpoint_idx, meta_type, line_numbers " - "FROM checkpoint_metadata_lines " - "WHERE file_info_id = ? " - "ORDER BY checkpoint_idx, meta_type;"); - stmt.bind_int(1, file_info_id); - - std::vector results; - while (sqlite3_step(stmt) == SQLITE_ROW) { - MetadataLinesResult r; - r.checkpoint_idx = - static_cast(sqlite3_column_int64(stmt, 0)); - r.meta_type = - reinterpret_cast(sqlite3_column_text(stmt, 1)); - - const auto* blob_data = - static_cast(sqlite3_column_blob(stmt, 2)); - int blob_size = sqlite3_column_bytes(stmt, 2); - r.line_numbers = - unpack_line_numbers(blob_data, static_cast(blob_size)); - - results.push_back(std::move(r)); - } - return results; -} - -std::vector query_metadata_lines_for_checkpoint( - const SqliteDatabase& db, int file_info_id, std::uint64_t checkpoint_idx) { - SqliteStmt stmt(db, - "SELECT checkpoint_idx, meta_type, line_numbers " - "FROM checkpoint_metadata_lines " - "WHERE file_info_id = ? AND checkpoint_idx = ? 
" - "ORDER BY meta_type;"); - stmt.bind_int(1, file_info_id); - stmt.bind_int64(2, static_cast(checkpoint_idx)); - - std::vector results; - while (sqlite3_step(stmt) == SQLITE_ROW) { - MetadataLinesResult r; - r.checkpoint_idx = - static_cast(sqlite3_column_int64(stmt, 0)); - r.meta_type = - reinterpret_cast(sqlite3_column_text(stmt, 1)); - - const auto* blob_data = - static_cast(sqlite3_column_blob(stmt, 2)); - int blob_size = sqlite3_column_bytes(stmt, 2); - r.line_numbers = - unpack_line_numbers(blob_data, static_cast(blob_size)); - - results.push_back(std::move(r)); - } - return results; -} - -} // namespace - // dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_provenance.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_provenance.cpp deleted file mode 100644 index aab5abfb..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_provenance.cpp +++ /dev/null @@ -1,117 +0,0 @@ -#include -#include - -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; - -std::vector query_provenance_sources(const SqliteDatabase& db, - int file_info_id) { - SqliteStmt stmt(db, - "SELECT source_idx, path, num_checkpoints, " - "event_hash " - "FROM provenance_sources " - "WHERE file_info_id = ? 
" - "ORDER BY source_idx;"); - stmt.bind_int(1, file_info_id); - - std::vector results; - while (sqlite3_step(stmt) == SQLITE_ROW) { - ProvenanceSource s; - s.source_idx = sqlite3_column_int(stmt, 0); - s.path = reinterpret_cast(sqlite3_column_text(stmt, 1)); - s.num_checkpoints = sqlite3_column_int(stmt, 2); - s.event_hash = - reinterpret_cast(sqlite3_column_text(stmt, 3)); - results.push_back(std::move(s)); - } - return results; -} - -std::vector query_provenance_segments( - const SqliteDatabase& db, int source_idx) { - SqliteStmt stmt(db, - "SELECT source_idx, source_checkpoint, " - "output_line_start, output_line_end, " - "event_count " - "FROM provenance_segments " - "WHERE source_idx = ? " - "ORDER BY source_checkpoint, " - "output_line_start;"); - stmt.bind_int(1, source_idx); - - std::vector results; - while (sqlite3_step(stmt) == SQLITE_ROW) { - ProvenanceSegment s; - s.source_idx = sqlite3_column_int(stmt, 0); - s.source_checkpoint = sqlite3_column_int(stmt, 1); - s.output_line_start = sqlite3_column_int(stmt, 2); - s.output_line_end = sqlite3_column_int(stmt, 3); - s.event_count = sqlite3_column_int(stmt, 4); - results.push_back(std::move(s)); - } - return results; -} - -std::vector query_all_provenance_segments( - const SqliteDatabase& db) { - SqliteStmt stmt(db, - "SELECT source_idx, source_checkpoint, " - "output_line_start, output_line_end, " - "event_count " - "FROM provenance_segments " - "ORDER BY output_line_start;"); - - std::vector results; - while (sqlite3_step(stmt) == SQLITE_ROW) { - ProvenanceSegment s; - s.source_idx = sqlite3_column_int(stmt, 0); - s.source_checkpoint = sqlite3_column_int(stmt, 1); - s.output_line_start = sqlite3_column_int(stmt, 2); - s.output_line_end = sqlite3_column_int(stmt, 3); - s.event_count = sqlite3_column_int(stmt, 4); - results.push_back(std::move(s)); - } - return results; -} - -std::string query_provenance_info(const SqliteDatabase& db, - std::string_view key) { - SqliteStmt stmt(db, - "SELECT value FROM 
provenance_info " - "WHERE key = ?;"); - stmt.bind_text(1, key); - - if (sqlite3_step(stmt) == SQLITE_ROW) { - return reinterpret_cast(sqlite3_column_text(stmt, 0)); - } - return ""; -} - -std::string query_provenance_group_name(const SqliteDatabase& db) { - SqliteStmt stmt(db, "SELECT name FROM provenance_group LIMIT 1;"); - - if (sqlite3_step(stmt) == SQLITE_ROW) { - return reinterpret_cast(sqlite3_column_text(stmt, 0)); - } - return ""; -} - -std::string query_provenance_group_predicate(const SqliteDatabase& db) { - SqliteStmt stmt(db, - "SELECT predicate FROM provenance_group " - "LIMIT 1;"); - - if (sqlite3_step(stmt) == SQLITE_ROW) { - if (sqlite3_column_type(stmt, 0) == SQLITE_NULL) { - return ""; - } - return reinterpret_cast(sqlite3_column_text(stmt, 0)); - } - return ""; -} - -} // namespace - // dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_resolved_by_hash.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_resolved_by_hash.cpp deleted file mode 100644 index 37a918e0..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_resolved_by_hash.cpp +++ /dev/null @@ -1,32 +0,0 @@ -#include -#include - -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; - -std::optional query_resolved_by_hash(const SqliteDatabase& db, - std::string_view dimension, - std::string_view hash_value) { - SqliteStmt stmt(db, - "SELECT resolved_value FROM hash_resolutions " - "WHERE dimension = ? AND hash_value = ? 
LIMIT 1;"); - - stmt.bind_text(1, dimension); - stmt.bind_text(2, hash_value); - - int rc = sqlite3_step(stmt); - if (rc == SQLITE_ROW) { - const char* text = - reinterpret_cast(sqlite3_column_text(stmt, 0)); - if (text) { - return std::string(text); - } - } - - return std::nullopt; -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_time_bounds.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_time_bounds.cpp deleted file mode 100644 index 3fb128e3..00000000 --- a/src/dftracer/utils/utilities/composites/dft/indexing/queries/query_time_bounds.cpp +++ /dev/null @@ -1,36 +0,0 @@ -#include -#include - -#include - -namespace dftracer::utils::utilities::composites::dft::indexing::queries { - -using dftracer::utils::sqlite::SqliteStmt; - -TimeBounds query_time_bounds(const SqliteDatabase& db, int file_info_id) { - SqliteStmt stmt(db, - "SELECT MIN(min_timestamp_us), MAX(max_timestamp_us) " - "FROM chunk_statistics WHERE file_info_id = ? 
" - "AND min_timestamp_us IS NOT NULL " - "AND max_timestamp_us IS NOT NULL;"); - - stmt.bind_int(1, file_info_id); - - TimeBounds result; - if (sqlite3_step(stmt) == SQLITE_ROW) { - if (sqlite3_column_type(stmt, 0) != SQLITE_NULL) { - result.min_timestamp_us = - static_cast(sqlite3_column_int64(stmt, 0)); - } - if (sqlite3_column_type(stmt, 1) != SQLITE_NULL) { - result.max_timestamp_us = - static_cast(sqlite3_column_int64(stmt, 1)); - } - result.valid = (result.min_timestamp_us != - std::numeric_limits::max()); - } - - return result; -} - -} // namespace dftracer::utils::utilities::composites::dft::indexing::queries diff --git a/src/dftracer/utils/utilities/composites/dft/internal/utils.cpp b/src/dftracer/utils/utilities/composites/dft/internal/utils.cpp index 6fb8586c..150a61d8 100644 --- a/src/dftracer/utils/utilities/composites/dft/internal/utils.cpp +++ b/src/dftracer/utils/utilities/composites/dft/internal/utils.cpp @@ -11,26 +11,14 @@ namespace dftracer::utils::utilities::composites::dft::internal { std::string determine_index_path(const std::string& file_path, const std::string& index_dir) { fs::path data_path(file_path); - std::string base_name = - data_path.filename().string() + constants::indexer::EXTENSION; - - if (!index_dir.empty()) { - return (fs::path(index_dir) / base_name).string(); - } - - return (data_path.parent_path() / base_name).string(); + fs::path root = + index_dir.empty() ? 
data_path.parent_path() : fs::path(index_dir); + return (root / ".dftindex").string(); } std::string determine_provenance_index_path(const std::string& data_path, const std::string& index_dir) { - fs::path path(data_path); - std::string base_name = path.filename().string() + ".pidx"; - - if (!index_dir.empty()) { - return (fs::path(index_dir) / base_name).string(); - } - - return (path.parent_path() / base_name).string(); + return determine_index_path(data_path, index_dir); } bool is_data_transfer_op(std::string_view cat, std::string_view name) { diff --git a/src/dftracer/utils/utilities/composites/dft/metadata_collector_utility.cpp b/src/dftracer/utils/utilities/composites/dft/metadata_collector_utility.cpp index c7b1b4d0..8697b8d8 100644 --- a/src/dftracer/utils/utilities/composites/dft/metadata_collector_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/metadata_collector_utility.cpp @@ -2,10 +2,12 @@ #include #include #include +#include #include #include #include #include +#include #include namespace dftracer::utils::utilities::composites::dft { @@ -28,11 +30,15 @@ MetadataCollectorUtility::process(const MetadataCollectorUtilityInput& input) { if (is_compressed) { // Compressed file - generate index path if not provided MetadataCollectorUtilityInput modified_input = input; - if (modified_input.idx_path.empty()) { - // Auto-generate index path - modified_input.idx_path = file_path + ".idx"; + if (modified_input.index_path.empty()) { + modified_input.index_path = + internal::determine_index_path(file_path, ""); + } else { + modified_input.index_path = + dftracer::utils::utilities::indexer::internal:: + normalize_index_root(modified_input.index_path); } - meta.idx_path = modified_input.idx_path; + meta.index_path = modified_input.index_path; co_return co_await process_compressed(modified_input); } else { // Plain text file @@ -50,7 +56,7 @@ MetadataCollectorUtility::process_compressed( const MetadataCollectorUtilityInput& input) { 
MetadataCollectorUtilityOutput meta; meta.file_path = input.file_path; - meta.idx_path = input.idx_path; + meta.index_path = input.index_path; try { // Detect format @@ -59,7 +65,7 @@ MetadataCollectorUtility::process_compressed( meta.compressed_size = fs::file_size(input.file_path); // Check if index exists - meta.has_index = fs::exists(input.idx_path); + meta.has_index = fs::exists(input.index_path); // Create or load indexer std::shared_ptr @@ -67,27 +73,27 @@ MetadataCollectorUtility::process_compressed( if (!meta.has_index || input.force_rebuild) { if (input.force_rebuild && meta.has_index) { DFTRACER_UTILS_LOG_DEBUG("Removing existing index: %s", - input.idx_path.c_str()); - fs::remove(input.idx_path); + input.index_path.c_str()); + fs::remove_all(input.index_path); } DFTRACER_UTILS_LOG_DEBUG("Building index for: %s", input.file_path.c_str()); indexer = dftracer::utils::utilities::indexer::internal:: - IndexerFactory::create(input.file_path, input.idx_path, + IndexerFactory::create(input.file_path, input.index_path, input.checkpoint_size, true); co_await indexer->build_async(); meta.has_index = true; } else { indexer = dftracer::utils::utilities::indexer::internal:: - IndexerFactory::create(input.file_path, input.idx_path, + IndexerFactory::create(input.file_path, input.index_path, input.checkpoint_size, false); if (indexer->need_rebuild()) { DFTRACER_UTILS_LOG_DEBUG("Index needs rebuild: %s", - input.idx_path.c_str()); + input.index_path.c_str()); meta.index_valid = false; - fs::remove(input.idx_path); + fs::remove_all(input.index_path); indexer = dftracer::utils::utilities::indexer::internal:: - IndexerFactory::create(input.file_path, input.idx_path, + IndexerFactory::create(input.file_path, input.index_path, input.checkpoint_size, true); co_await indexer->build_async(); } @@ -124,7 +130,7 @@ MetadataCollectorUtility::process_compressed( auto line_gen = StreamingLineReader::read_async( StreamingLineReaderConfig() .with_file(input.file_path) - 
.with_index(input.idx_path) + .with_index(input.index_path) .with_line_range(1, total_lines)); while (auto line_opt = co_await line_gen.next()) { const auto& line = *line_opt; @@ -173,7 +179,7 @@ MetadataCollectorUtility::process_plain( const MetadataCollectorUtilityInput& input) { MetadataCollectorUtilityOutput meta; meta.file_path = input.file_path; - meta.idx_path = ""; + meta.index_path = ""; try { // Plain file metadata diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/event_router.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/event_router.cpp index 881f54dd..aabcc54c 100644 --- a/src/dftracer/utils/utilities/composites/dft/reorganize/event_router.cpp +++ b/src/dftracer/utils/utilities/composites/dft/reorganize/event_router.cpp @@ -156,9 +156,9 @@ coro::CoroTask process_source( result.output_files.push_back(chunk.path); } - trackers[gi].flush_to_db(plan, plan.groups[gi].name, - plan.groups[gi].query, writers[gi]->chunks(), - config.output_dir); + co_await trackers[gi].flush_to_db( + plan, plan.groups[gi].name, plan.groups[gi].query, + writers[gi]->chunks(), config.output_dir); } result.success = true; @@ -187,12 +187,14 @@ coro::CoroTask route_events( futures.reserve(tasks_by_source.size()); for (const auto& [src_idx, src_tasks] : tasks_by_source) { + auto* config_ptr = &config; futures.push_back( - scope.spawn([src_idx, &config, tasks = src_tasks, permits]( + scope.spawn([src_idx, config_ptr, tasks = src_tasks, permits]( CoroScope& s) -> coro::CoroTask { co_await s.receive(permits); try { - auto r = co_await process_source(src_idx, config, tasks); + auto r = + co_await process_source(src_idx, *config_ptr, tasks); permits->try_send(true); co_return r; } catch (...) 
{ diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.cpp index 99dc4a35..31a97cce 100644 --- a/src/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.cpp +++ b/src/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.cpp @@ -1,6 +1,8 @@ #include #include +#include #include +#include #include namespace dftracer::utils::utilities::composites::dft::reorganize { @@ -13,7 +15,7 @@ void ProvenanceTracker::record(int source_file_idx, int checkpoint_idx, output_line_end, event_count}); } -void ProvenanceTracker::flush_to_db( +coro::CoroTask ProvenanceTracker::flush_to_db( const ExtractionPlan& plan, const std::string& group_name, const std::string& group_query, const std::vector& chunks, @@ -21,44 +23,58 @@ void ProvenanceTracker::flush_to_db( using indexer::ProvenanceDatabase; for (const auto& chunk : chunks) { - std::string pidx_path = chunk.path + ".pidx"; + auto provenance_path = std::make_shared( + indexer::determine_provenance_index_path(chunk.path)); + const auto* plan_ptr = &plan; + const auto* group_name_ptr = &group_name; + const auto* group_query_ptr = &group_query; + const auto* chunk_ptr = &chunk; + const auto* records_ptr = &records_; try { - ProvenanceDatabase pdb(pidx_path); - pdb.init_schema(); + co_await rocksdb::run([plan_ptr, group_name_ptr, group_query_ptr, + chunk_ptr, records_ptr, provenance_path] { + ProvenanceDatabase pdb(*provenance_path); + pdb.init_schema(); - std::uint64_t out_hash = 0; - if (fs::exists(chunk.path)) { - out_hash = - static_cast(fs::file_size(chunk.path)); - } - int fid = pdb.get_or_create_file_info(chunk.path, out_hash); + std::uint64_t out_hash = 0; + if (fs::exists(chunk_ptr->path)) { + out_hash = static_cast( + fs::file_size(chunk_ptr->path)); + } + int fid = + pdb.get_or_create_file_info(chunk_ptr->path, out_hash); - pdb.begin_transaction(); + 
indexer::internal::TransactionScope txn(pdb); + pdb.insert_info(fid, "version", "2.0"); + pdb.insert_info(fid, "tool", "dftracer_organize"); + pdb.insert_group(fid, *group_name_ptr, *group_query_ptr); - pdb.insert_info("version", "2.0"); - pdb.insert_info("tool", "dftracer_organize"); - pdb.insert_group(group_name, group_query); + for (std::size_t si = 0; si < plan_ptr->source_files.size(); + ++si) { + const auto& src = plan_ptr->source_files[si]; + pdb.insert_source(fid, static_cast(si), src.file_path, + static_cast(src.num_checkpoints)); + } - for (std::size_t si = 0; si < plan.source_files.size(); ++si) { - const auto& src = plan.source_files[si]; - pdb.insert_source(fid, static_cast(si), src.file_path, - static_cast(src.num_checkpoints)); - } + for (const auto& rec : *records_ptr) { + if (rec.output_chunk_idx != chunk_ptr->chunk_index) + continue; + pdb.insert_segment(fid, rec.source_file_idx, + rec.checkpoint_idx, + rec.output_line_start, + rec.output_line_end, rec.event_count); + } - for (const auto& rec : records_) { - if (rec.output_chunk_idx != chunk.chunk_index) continue; - pdb.insert_segment(rec.source_file_idx, rec.checkpoint_idx, - rec.output_line_start, rec.output_line_end, - rec.event_count); - } - - pdb.commit_transaction(); + txn.commit(); + }); } catch (const std::exception& e) { DFTRACER_UTILS_LOG_ERROR("Provenance write failed for %s: %s", chunk.path.c_str(), e.what()); } } + + co_return; } } // namespace dftracer::utils::utilities::composites::dft::reorganize diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/reconstruction_planner.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/reconstruction_planner.cpp index 54a39cf8..d0a4678c 100644 --- a/src/dftracer/utils/utilities/composites/dft/reorganize/reconstruction_planner.cpp +++ b/src/dftracer/utils/utilities/composites/dft/reorganize/reconstruction_planner.cpp @@ -19,21 +19,22 @@ coro::CoroTask ReconstructionPlannerUtility::process( ReconstructionPlan plan; for 
(const auto& reorg_file : input.reorganized_files) { - std::string pidx_path = internal::determine_provenance_index_path( + std::string provenance_path = internal::determine_provenance_index_path( reorg_file, input.index_dir); - if (!fs::exists(pidx_path)) { + if (!fs::exists(provenance_path)) { continue; } - ProvenanceDatabase pdb(pidx_path); - pdb.init_schema(); + ProvenanceDatabase pdb( + provenance_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); int fid = pdb.get_file_info_id(reorg_file); if (fid < 0) continue; // Check if this file has provenance - std::string tool = pdb.query_info("tool"); + std::string tool = pdb.query_info(fid, "tool"); if (tool.empty()) continue; // Read sources @@ -54,7 +55,7 @@ coro::CoroTask ReconstructionPlannerUtility::process( } // Read all segments - auto segments = pdb.query_all_segments(); + auto segments = pdb.query_all_segments(fid); for (const auto& seg : segments) { auto src_it = source_map.find(seg.source_idx); diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.cpp index 98d87f5f..b64904be 100644 --- a/src/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.cpp +++ b/src/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -12,6 +13,7 @@ #include #include +#include #include #include #include @@ -20,7 +22,6 @@ namespace dftracer::utils::utilities::composites::dft::reorganize { namespace { -using common::json::JsonValue; using common::query::Query; using dftracer::utils::utilities::indexer::IndexBuildConfig; using dftracer::utils::utilities::indexer::IndexBuilderUtility; @@ -93,7 +94,7 @@ coro::CoroTask ReorganizationPlannerUtility::process( for (std::size_t fi = 0; fi < input.source_files.size(); ++fi) { const auto& file_path = input.source_files[fi]; - // Build 
.idx if needed + // Build the shared `.dftindex` store if needed. IndexBuilderUtility idx_builder; auto idx_input = IndexBuildConfig::for_file(file_path).with_index_dir( input.index_dir); @@ -109,7 +110,7 @@ coro::CoroTask ReorganizationPlannerUtility::process( MetadataCollectorUtility metadata_collector; auto meta_input = MetadataCollectorUtilityInput::from_file(file_path).with_index( - idx_result.idx_path); + idx_result.index_path); if (input.checkpoint_size > 0) { meta_input.with_checkpoint_size(input.checkpoint_size); } @@ -119,8 +120,8 @@ coro::CoroTask ReorganizationPlannerUtility::process( file_path); } - // Determine .idx path (manifest data now lives in .idx) - std::string idx_path = + // Determine the root-local `.dftindex` store path. + std::string index_path = internal::determine_index_path(file_path, input.index_dir); // Effective checkpoint count: treat 0 as 1 @@ -129,29 +130,26 @@ coro::CoroTask ReorganizationPlannerUtility::process( SourceFileInfo sfi; sfi.file_path = file_path; - sfi.idx_path = idx_result.idx_path; - sfi.idx_path = idx_path; + sfi.index_path = index_path; sfi.num_checkpoints = eff_ckpts; sfi.uncompressed_size = meta.uncompressed_size; sfi.checkpoint_size = meta.checkpoint_size; plan.source_files.push_back(std::move(sfi)); - // Open .idx and try manifest-based planning. Fall back to - // whole-file streaming when manifest tables are absent - // (file was below index_threshold). - IndexDatabase idx_db(idx_path); + // Open the shared index store and try manifest-based planning. Fall + // back to whole-file streaming when manifest tables are absent (file + // was below index_threshold). 
+ IndexDatabase idx_db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); int file_info_id = idx_db.get_file_info_id( indexer::internal::get_logical_path(file_path)); if (file_info_id < 0) { - throw std::runtime_error("File not found in .idx: " + file_path); + throw std::runtime_error("File not found in .dftindex: " + + file_path); } - bool has_manifest = true; - try { - idx_db.query_event_ranges_for_checkpoint(file_info_id, 0); - } catch (const std::exception&) { - has_manifest = false; - } + const bool has_manifest = idx_db.has_manifest_data(file_info_id); if (has_manifest) { // Manifest-based planning: per-checkpoint extraction tasks. @@ -240,8 +238,22 @@ coro::CoroTask ReorganizationPlannerUtility::process( const auto& line = *line_opt; if (line.content.empty()) continue; + const char* begin = line.content.data(); + const char* end = begin + line.content.size(); + while (begin < end && + std::isspace(static_cast(*begin))) { + ++begin; + } + while (end > begin && + std::isspace(static_cast(*(end - 1)))) { + --end; + } + if (begin == end || *begin != '{' || *(end - 1) != '}') { + continue; + } + yyjson_doc* doc = - yyjson_read(line.content.data(), line.content.size(), + yyjson_read(begin, static_cast(end - begin), YYJSON_READ_NOFLAG); if (!doc) continue; @@ -251,43 +263,49 @@ coro::CoroTask ReorganizationPlannerUtility::process( continue; } - try { - JsonValue json(root); - std::string_view ph = json["ph"].get(); - auto line_num = - static_cast(line.line_number); - - if (ph == "M") { - meta_line_numbers.push_back(line_num); - } else { - std::string cat_str( - json["cat"].get()); - std::string name_str( - json["name"].get()); - - bool matched = false; - for (std::size_t gi = 0; gi < parsed_queries.size(); - ++gi) { - const auto& q = parsed_queries[gi]; - if (!q) continue; - common::query::ValueMap fields = { - {"cat", cat_str}, {"name", name_str}}; - if (q->evaluate(fields)) { - group_lines[plan.groups[gi].name].push_back( - line_num); 
- matched = true; - break; - } - } - if (!matched) { - group_lines[remainder_name].push_back(line_num); - } - plan.total_events++; + auto line_num = static_cast(line.line_number); + yyjson_val* ph_val = yyjson_obj_get(root, "ph"); + const bool is_metadata = + ph_val && yyjson_is_str(ph_val) && + std::string_view(yyjson_get_str(ph_val), + yyjson_get_len(ph_val)) == "M"; + + if (is_metadata) { + meta_line_numbers.push_back(line_num); + yyjson_doc_free(doc); + continue; + } + + std::string cat_str; + if (yyjson_val* cat_val = yyjson_obj_get(root, "cat"); + cat_val && yyjson_is_str(cat_val)) { + cat_str.assign(yyjson_get_str(cat_val), + yyjson_get_len(cat_val)); + } + + std::string name_str; + if (yyjson_val* name_val = yyjson_obj_get(root, "name"); + name_val && yyjson_is_str(name_val)) { + name_str.assign(yyjson_get_str(name_val), + yyjson_get_len(name_val)); + } + + bool matched = false; + for (std::size_t gi = 0; gi < parsed_queries.size(); ++gi) { + const auto& q = parsed_queries[gi]; + if (!q) continue; + common::query::ValueMap fields = {{"cat", cat_str}, + {"name", name_str}}; + if (q->evaluate(fields)) { + group_lines[plan.groups[gi].name].push_back(line_num); + matched = true; + break; } - } catch (const std::exception&) { - // Skip malformed or partial events without - // aborting the entire plan. 
} + if (!matched) { + group_lines[remainder_name].push_back(line_num); + } + plan.total_events++; yyjson_doc_free(doc); } diff --git a/src/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.cpp b/src/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.cpp index d0398692..8664442d 100644 --- a/src/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.cpp @@ -106,7 +106,7 @@ coro::CoroTask ChunkDetailScannerUtility::process( // Create reader (same pattern as chunk_indexer_utility.cpp) auto reader_input = composites::IndexedReadInput::from_file(input.file_path) .with_checkpoint_size(input.checkpoint_size) - .with_index(input.idx_path); + .with_index(input.index_path); composites::IndexedFileReaderUtility reader_utility; auto reader = co_await reader_utility.process(reader_input); diff --git a/src/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.cpp b/src/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.cpp index 73fe33ef..6f34cdd0 100644 --- a/src/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.cpp @@ -1,7 +1,6 @@ #include -#include +#include #include -#include #include #include #include @@ -21,25 +20,25 @@ coro::CoroTask StatisticsAggregatorUtility::process( TraceStatistics result; result.file_path = input.file_path; - if (!input.idx_path.empty()) { - result.idx_path = input.idx_path; + if (!input.index_path.empty()) { + result.index_path = + indexer::internal::normalize_index_root(input.index_path); } else { - result.idx_path = + result.index_path = internal::determine_index_path(input.file_path, input.index_dir); } - if (!fs::exists(result.idx_path)) { + if (!fs::exists(result.index_path)) { 
result.success = false; - result.error_message = "Index file not found: " + result.idx_path; + result.error_message = "Index store not found: " + result.index_path; co_return result; } bool needs_streaming_fallback = false; - auto do_query = [&input, &result, &needs_streaming_fallback]() -> TraceStatistics { try { - IndexDatabase idx_db(result.idx_path); + IndexDatabase idx_db(result.index_path); int fid = idx_db.get_file_info_id(get_logical_path(input.file_path)); @@ -50,10 +49,9 @@ coro::CoroTask StatisticsAggregatorUtility::process( return result; } - std::vector chunks; + std::vector chunks; try { - chunks = indexing::queries::query_chunk_statistics( - idx_db.sql_db(), fid); + chunks = idx_db.query_chunk_statistics(fid); } catch (const std::exception&) { needs_streaming_fallback = true; return result; @@ -70,8 +68,7 @@ coro::CoroTask StatisticsAggregatorUtility::process( result.merged.merge_from(chunks[i].stats); } - auto dim_stats = indexing::queries::query_chunk_dimension_stats( - idx_db.sql_db(), fid); + auto dim_stats = idx_db.query_chunk_dimension_stats(fid); for (const auto& ds : dim_stats) { if (!ds.value_counts) continue; if (ds.dimension == "cat") { @@ -94,7 +91,7 @@ coro::CoroTask StatisticsAggregatorUtility::process( return result; }; - result = co_await sqlite::run(do_query); + result = co_await rocksdb::run(do_query); if (!needs_streaming_fallback) { co_return result; diff --git a/src/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.cpp b/src/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.cpp index b8c45c0a..584c3a4c 100644 --- a/src/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.cpp +++ b/src/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.cpp @@ -57,7 +57,7 @@ std::string TraceStatistics::to_json() const { yyjson_mut_doc_set_root(doc, root); yyjson_mut_obj_add_str(doc, root, "file_path", file_path.c_str()); - yyjson_mut_obj_add_str(doc, root, "idx_path", 
idx_path.c_str()); + yyjson_mut_obj_add_str(doc, root, "index_path", index_path.c_str()); yyjson_mut_obj_add_bool(doc, root, "success", success); if (!success) { diff --git a/src/dftracer/utils/utilities/composites/dft/views/view_builder_utility.cpp b/src/dftracer/utils/utilities/composites/dft/views/view_builder_utility.cpp index 96ca0dbc..8955bc3a 100644 --- a/src/dftracer/utils/utilities/composites/dft/views/view_builder_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/views/view_builder_utility.cpp @@ -26,8 +26,8 @@ ViewBuilderInput& ViewBuilderInput::with_file_path(const std::string& path) { return *this; } -ViewBuilderInput& ViewBuilderInput::with_idx_path(const std::string& path) { - idx_path = path; +ViewBuilderInput& ViewBuilderInput::with_index_path(const std::string& path) { + index_path = path; return *this; } @@ -62,10 +62,10 @@ coro::CoroTask ViewBuilderUtility::process( std::vector candidate_checkpoints; - if (input.view.query && !input.idx_path.empty()) { - indexing::ChunkPrunerInput pruner_input{input.idx_path, input.file_path, - *input.view.query, - input.bloom_cache}; + if (input.view.query && !input.index_path.empty()) { + indexing::ChunkPrunerInput pruner_input{ + input.index_path, input.file_path, *input.view.query, + input.bloom_cache}; indexing::ChunkPrunerUtility pruner; auto pruner_output = co_await pruner.process(pruner_input); @@ -96,18 +96,16 @@ coro::CoroTask ViewBuilderUtility::process( // Chunk-level time range skip: query per-chunk time bounds from // the bloom index and remove chunks that don't overlap the query. 
- if (input.time_range && !input.idx_path.empty() && + if (input.time_range && !input.index_path.empty() && !candidate_checkpoints.empty()) { auto [t_begin, t_end] = *input.time_range; if (t_begin > 0 || t_end > 0) { try { - IndexDatabase idx_db(input.idx_path); + IndexDatabase idx_db(input.index_path); int fid = idx_db.get_file_info_id(get_logical_path(input.file_path)); if (fid >= 0) { - auto chunk_stats = - indexing::queries::query_chunk_statistics( - idx_db.sql_db(), fid); + auto chunk_stats = idx_db.query_chunk_statistics(fid); std::unordered_map> diff --git a/src/dftracer/utils/utilities/composites/dft/views/view_reader_utility.cpp b/src/dftracer/utils/utilities/composites/dft/views/view_reader_utility.cpp index b3c360f0..5759ff4e 100644 --- a/src/dftracer/utils/utilities/composites/dft/views/view_reader_utility.cpp +++ b/src/dftracer/utils/utilities/composites/dft/views/view_reader_utility.cpp @@ -23,8 +23,8 @@ ViewReaderInput& ViewReaderInput::with_file_path(const std::string& path) { return *this; } -ViewReaderInput& ViewReaderInput::with_idx_path(const std::string& path) { - idx_path = path; +ViewReaderInput& ViewReaderInput::with_index_path(const std::string& path) { + index_path = path; return *this; } @@ -117,7 +117,7 @@ coro::AsyncGenerator ViewReaderUtility::process( emitted_hashes; auto reader_input = composites::IndexedReadInput::from_file(input.file_path) - .with_index(input.idx_path); + .with_index(input.index_path); if (input.checkpoint_size > 0) { reader_input.with_checkpoint_size(input.checkpoint_size); } diff --git a/src/dftracer/utils/utilities/composites/file_merger_utility.cpp b/src/dftracer/utils/utilities/composites/file_merger_utility.cpp index 6417f334..b86c8e82 100644 --- a/src/dftracer/utils/utilities/composites/file_merger_utility.cpp +++ b/src/dftracer/utils/utilities/composites/file_merger_utility.cpp @@ -31,7 +31,7 @@ FileMergeValidatorUtility::process( (input.file_path.size() >= 3 && input.file_path.substr(input.file_path.size() 
- 3) == ".gz"); - std::string effective_idx_path = input.index_path; + std::string effective_index_path = input.index_path; if (is_compressed) { // Use IndexBuilderUtility for compressed files @@ -53,7 +53,7 @@ FileMergeValidatorUtility::process( co_return result; } // Use the actual idx path produced by the builder - effective_idx_path = index_result.idx_path; + effective_index_path = index_result.index_path; } // Step 2: Create line processor function that validates JSON @@ -79,7 +79,7 @@ FileMergeValidatorUtility::process( fileio::lines::LineReadInput read_input; read_input.file_path = input.file_path; if (is_compressed) { - read_input.idx_path = effective_idx_path; + read_input.index_path = effective_index_path; } auto validated_events = co_await processor.process(read_input); @@ -110,7 +110,7 @@ FileMergeValidatorUtility::process( if (is_compressed) { auto reader = dftracer::utils::utilities::reader::internal:: - ReaderFactory::create(input.file_path, effective_idx_path); + ReaderFactory::create(input.file_path, effective_index_path); if (reader) { result.total_lines = reader->get_num_lines(); } diff --git a/src/dftracer/utils/utilities/indexer/index_builder_utility.cpp b/src/dftracer/utils/utilities/indexer/index_builder_utility.cpp index b851efa6..97b15098 100644 --- a/src/dftracer/utils/utilities/indexer/index_builder_utility.cpp +++ b/src/dftracer/utils/utilities/indexer/index_builder_utility.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -8,6 +9,7 @@ #include #include #include +#include #include #include @@ -18,6 +20,7 @@ namespace dftracer::utils::utilities::indexer { using composites::dft::internal::determine_index_path; using internal::IndexerFactory; +namespace rocks = dftracer::utils::rocksdb; // --------------------------------------------------------------------------- // IndexBuildConfig builder methods @@ -78,9 +81,9 @@ coro::CoroTask IndexBuilderUtility::process( result.file_path = config.file_path; try { 
- std::string idx_path = + std::string index_path = determine_index_path(config.file_path, config.index_dir); - result.idx_path = idx_path; + result.index_path = index_path; // Check compressed file size against threshold (0 = always index). std::uintmax_t file_sz = 0; @@ -93,7 +96,7 @@ coro::CoroTask IndexBuilderUtility::process( auto build_start = std::chrono::steady_clock::now(); auto indexer = IndexerFactory::create( - config.file_path, idx_path, + config.file_path, index_path, static_cast(config.checkpoint_size), config.force_rebuild); @@ -113,7 +116,9 @@ coro::CoroTask IndexBuilderUtility::process( auto logical = internal::get_logical_path(config.file_path); bool bloom_ok = !config.build_bloom || [&] { try { - IndexDatabase db(idx_path); + IndexDatabase db(index_path, + dftracer::utils::rocksdb::RocksDatabase:: + OpenMode::ReadOnly); int fid = db.get_file_info_id(logical); return fid >= 0 && db.has_bloom_data(fid); } catch (...) { @@ -122,7 +127,9 @@ coro::CoroTask IndexBuilderUtility::process( }(); bool manifest_ok = !config.build_manifest || [&] { try { - IndexDatabase db(idx_path); + IndexDatabase db(index_path, + dftracer::utils::rocksdb::RocksDatabase:: + OpenMode::ReadOnly); int fid = db.get_file_info_id(logical); return fid >= 0 && db.has_manifest_data(fid); } catch (...) { @@ -202,33 +209,37 @@ coro::CoroTask IndexBuilderUtility::process( result.chunks_processed = static_cast(indexer->get_checkpoints().size()); - // Persist visitor data into the .idx database only when the file meets - // the size threshold (or threshold is disabled). + // Persist visitor data into the `.dftindex` store only when the file + // meets the size threshold (or threshold is disabled). 
if (!below_threshold && (config.build_bloom || config.build_manifest)) { - const std::string& built_idx = indexer->get_idx_path(); + const std::string& built_index_path = indexer->get_index_path(); - IndexDatabase db(built_idx); - - auto logical = internal::get_logical_path(config.file_path); - int fid = db.get_file_info_id(logical); - if (fid < 0) { - result.error_message = - "File not found in index after build: " + logical; - co_return result; - } - - db.begin_transaction(); try { - if (config.build_bloom && bloom_visitor) { - db.init_bloom_schema(); - db.delete_chunk_statistics(fid); - bloom_visitor->finalize(db, fid); - } - if (config.build_manifest && manifest_visitor) { - db.init_manifest_schema(); - manifest_visitor->finalize(db, fid); - } - db.commit_transaction(); + IndexDatabase db(built_index_path); + auto logical = internal::get_logical_path(config.file_path); + const auto hash = + internal::calculate_file_hash(config.file_path); + auto* db_ptr = &db; + auto* logical_ptr = &logical; + auto* config_ptr = &config; + auto* bloom_visitor_ptr = &bloom_visitor; + auto* manifest_visitor_ptr = &manifest_visitor; + co_await rocks::run([db_ptr, logical_ptr, hash, config_ptr, + bloom_visitor_ptr, manifest_visitor_ptr] { + int fid = + db_ptr->get_or_create_file_info(*logical_ptr, hash); + internal::TransactionScope txn(*db_ptr); + if (config_ptr->build_bloom && *bloom_visitor_ptr) { + db_ptr->init_bloom_schema(); + db_ptr->delete_chunk_statistics(fid); + (*bloom_visitor_ptr)->finalize(*db_ptr, fid); + } + if (config_ptr->build_manifest && *manifest_visitor_ptr) { + db_ptr->init_manifest_schema(); + (*manifest_visitor_ptr)->finalize(*db_ptr, fid); + } + txn.commit(); + }); } catch (const std::exception& e) { result.error_message = std::string("Failed to persist index data: ") + e.what(); diff --git a/src/dftracer/utils/utilities/indexer/index_database.cpp b/src/dftracer/utils/utilities/indexer/index_database.cpp index c74a91c3..4cc3165c 100644 --- 
a/src/dftracer/utils/utilities/indexer/index_database.cpp +++ b/src/dftracer/utils/utilities/indexer/index_database.cpp @@ -1,310 +1,773 @@ -#include +#include +#include +#include #include -#include #include #include #include +#include + +#include +#include +#include +#include +#include +#include +#include namespace dftracer::utils::utilities::indexer { namespace queries = composites::dft::indexing::queries; +namespace rocks = dftracer::utils::rocksdb; -using dftracer::utils::sqlite::SqliteStmt; using internal::IndexerError; -// --------------------------------------------------------------------------- -// Schema strings -// --------------------------------------------------------------------------- - -// Matches GzipIndexer schema (gzip/constants.cpp) so IndexDatabase -// can open .idx files created by the existing indexer. -static const char* BASE_SCHEMA = R"( - PRAGMA journal_mode=WAL; - PRAGMA busy_timeout=5000; - PRAGMA foreign_keys=ON; - - CREATE TABLE IF NOT EXISTS files ( - id INTEGER PRIMARY KEY, - logical_name TEXT UNIQUE NOT NULL, - byte_size INTEGER NOT NULL DEFAULT 0, - mtime_unix INTEGER NOT NULL DEFAULT 0, - hash INTEGER NOT NULL DEFAULT 0 - ); - - CREATE TABLE IF NOT EXISTS checkpoints ( - id INTEGER PRIMARY KEY, - file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, - checkpoint_idx INTEGER NOT NULL, - uc_offset INTEGER NOT NULL DEFAULT 0, - uc_size INTEGER NOT NULL DEFAULT 0, - c_offset INTEGER NOT NULL DEFAULT 0, - c_size INTEGER NOT NULL DEFAULT 0, - bits INTEGER NOT NULL DEFAULT 0, - dict_compressed BLOB, - num_lines INTEGER NOT NULL DEFAULT 0, - first_line_num INTEGER NOT NULL DEFAULT 0, - last_line_num INTEGER NOT NULL DEFAULT 0 - ); - - CREATE INDEX IF NOT EXISTS checkpoints_file_idx - ON checkpoints(file_id, checkpoint_idx); - CREATE INDEX IF NOT EXISTS checkpoints_file_uc_off_idx - ON checkpoints(file_id, uc_offset); - CREATE INDEX IF NOT EXISTS checkpoints_line_range_idx - ON checkpoints(file_id, first_line_num, 
last_line_num); - - CREATE TABLE IF NOT EXISTS metadata ( - file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, - checkpoint_size INTEGER NOT NULL DEFAULT 0, - total_lines INTEGER NOT NULL DEFAULT 0, - total_uc_size INTEGER NOT NULL DEFAULT 0, - PRIMARY KEY(file_id) - ); -)"; - -static const char* BLOOM_SCHEMA = R"( - CREATE TABLE IF NOT EXISTS chunk_bloom_filters ( - id INTEGER PRIMARY KEY, - file_info_id INTEGER NOT NULL, - checkpoint_idx INTEGER NOT NULL, - dimension TEXT NOT NULL, - bloom_data BLOB NOT NULL, - num_entries INTEGER NOT NULL, - UNIQUE(file_info_id, checkpoint_idx, dimension) - ); - - CREATE TABLE IF NOT EXISTS file_bloom_filters ( - id INTEGER PRIMARY KEY, - file_info_id INTEGER NOT NULL, - dimension TEXT NOT NULL, - bloom_data BLOB NOT NULL, - num_entries INTEGER NOT NULL, - UNIQUE(file_info_id, dimension) - ); - - CREATE TABLE IF NOT EXISTS chunk_statistics ( - id INTEGER PRIMARY KEY, - file_info_id INTEGER NOT NULL, - checkpoint_idx INTEGER NOT NULL, - total_events INTEGER NOT NULL DEFAULT 0, - min_timestamp_us INTEGER, - max_timestamp_us INTEGER, - duration_sum_us INTEGER NOT NULL DEFAULT 0, - duration_min_us INTEGER, - duration_max_us INTEGER, - duration_count INTEGER NOT NULL DEFAULT 0, - duration_m2 REAL NOT NULL DEFAULT 0, - duration_sketch BLOB, - duration_histogram TEXT NOT NULL DEFAULT '[]', - name_duration_sketches BLOB, - name_duration_histograms TEXT NOT NULL DEFAULT '{}', - name_duration_sums TEXT NOT NULL DEFAULT '{}', - name_duration_sum_sqs TEXT NOT NULL DEFAULT '{}', - name_category TEXT NOT NULL DEFAULT '{}', - UNIQUE(file_info_id, checkpoint_idx) - ); - - CREATE TABLE IF NOT EXISTS index_dimensions ( - id INTEGER PRIMARY KEY, - file_info_id INTEGER NOT NULL, - dimension TEXT NOT NULL, - UNIQUE(file_info_id, dimension) - ); - - CREATE TABLE IF NOT EXISTS hash_resolutions ( - id INTEGER PRIMARY KEY, - file_info_id INTEGER NOT NULL, - dimension TEXT NOT NULL, - hash_value TEXT NOT NULL, - resolved_value TEXT NOT 
NULL, - UNIQUE(file_info_id, dimension, hash_value) - ); - - CREATE TABLE IF NOT EXISTS chunk_dimension_stats ( - id INTEGER PRIMARY KEY, - file_info_id INTEGER NOT NULL, - checkpoint_idx INTEGER NOT NULL, - dimension TEXT NOT NULL, - distinct_count INTEGER NOT NULL DEFAULT 0, - value_counts BLOB, - min_value TEXT, - max_value TEXT, - value_type TEXT NOT NULL DEFAULT 'string', - UNIQUE(file_info_id, checkpoint_idx, dimension) - ); - - CREATE INDEX IF NOT EXISTS chunk_bloom_file_dim_idx - ON chunk_bloom_filters(file_info_id, dimension); - CREATE INDEX IF NOT EXISTS chunk_stats_file_idx - ON chunk_statistics(file_info_id, checkpoint_idx); - CREATE INDEX IF NOT EXISTS chunk_dim_stats_file_dim_idx - ON chunk_dimension_stats(file_info_id, dimension); - CREATE INDEX IF NOT EXISTS hash_res_dim_val_idx - ON hash_resolutions(dimension, resolved_value); -)"; - -static const char* MANIFEST_SCHEMA = R"( - CREATE TABLE IF NOT EXISTS checkpoint_event_ranges ( - checkpoint_idx INTEGER NOT NULL, - file_info_id INTEGER NOT NULL, - cat TEXT NOT NULL, - name TEXT NOT NULL, - line_numbers BLOB NOT NULL, - event_count INTEGER NOT NULL DEFAULT 0, - PRIMARY KEY (file_info_id, checkpoint_idx, cat, name) - ); - - CREATE TABLE IF NOT EXISTS checkpoint_metadata_lines ( - checkpoint_idx INTEGER NOT NULL, - file_info_id INTEGER NOT NULL, - meta_type TEXT NOT NULL, - line_numbers BLOB NOT NULL, - PRIMARY KEY (file_info_id, checkpoint_idx, meta_type) - ); - - CREATE INDEX IF NOT EXISTS idx_event_ranges_checkpoint - ON checkpoint_event_ranges(file_info_id, checkpoint_idx); - CREATE INDEX IF NOT EXISTS idx_metadata_checkpoint - ON checkpoint_metadata_lines(file_info_id, checkpoint_idx); -)"; - -// --------------------------------------------------------------------------- -// Constructor / destructor -// --------------------------------------------------------------------------- - -IndexDatabase::IndexDatabase(const std::string& idx_path) : db_(idx_path) {} - -// 
--------------------------------------------------------------------------- -// Schema initialisation -// --------------------------------------------------------------------------- - -static void exec_schema(sqlite3* db, const char* sql, const char* label) { - char* err_msg = nullptr; - int rc = sqlite3_exec(db, sql, nullptr, nullptr, &err_msg); - if (rc != SQLITE_OK) { - std::string error = err_msg ? std::string(err_msg) : "unknown error"; - if (err_msg) sqlite3_free(err_msg); - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - std::string(label) + ": " + error); +namespace { + +constexpr std::uint32_t kSchemaVersion = 1; + +[[noreturn]] void throw_db_error(std::string_view message, + const ::rocksdb::Status& status) { + throw IndexerError(IndexerError::Type::DATABASE_ERROR, + std::string(message) + ": " + status.ToString()); +} + +void append_u8(std::string& out, std::uint8_t value) { + out.push_back(static_cast(value)); +} + +void append_i64(std::string& out, std::int64_t value) { + rocks::KeyCodec::append_be64(out, static_cast(value)); +} + +void append_u64(std::string& out, std::uint64_t value) { + rocks::KeyCodec::append_be64(out, value); +} + +void append_double(std::string& out, double value) { + static_assert(sizeof(double) == sizeof(std::uint64_t)); + std::uint64_t bits = 0; + std::memcpy(&bits, &value, sizeof(bits)); + append_u64(out, bits); +} + +void append_string(std::string& out, std::string_view value) { + rocks::KeyCodec::append_be32(out, static_cast(value.size())); + out.append(value.data(), value.size()); +} + +void append_blob(std::string& out, std::span blob) { + rocks::KeyCodec::append_be32(out, static_cast(blob.size())); + out.append(reinterpret_cast(blob.data()), blob.size()); +} + +class Cursor { + public: + explicit Cursor(std::string_view data) : data_(data) {} + + std::uint8_t u8() { return static_cast(take(1)[0]); } + + std::uint32_t u32() { return rocks::KeyCodec::decode_be32(take(4)); } + + std::uint64_t u64() { return 
rocks::KeyCodec::decode_be64(take(8)); } + + std::int64_t i64() { return static_cast(u64()); } + + double f64() { + std::uint64_t bits = u64(); + double value = 0.0; + std::memcpy(&value, &bits, sizeof(value)); + return value; + } + + std::string str() { + auto len = static_cast(u32()); + auto bytes = take(len); + return std::string(bytes.data(), bytes.size()); + } + + std::vector blob() { + auto len = static_cast(u32()); + auto bytes = take(len); + return std::vector(bytes.begin(), bytes.end()); + } + + private: + std::string_view take(std::size_t len) { + if (offset_ + len > data_.size()) { + throw std::runtime_error("Corrupt RocksDB payload"); + } + auto chunk = data_.substr(offset_, len); + offset_ += len; + return chunk; } + + std::string_view data_; + std::size_t offset_ = 0; +}; + +std::string file_lookup_key(std::string_view logical_name) { + return std::string("f|") + std::string(logical_name); } -void IndexDatabase::init_base_schema() { - exec_schema(db_.get(), BASE_SCHEMA, "init_base_schema"); +std::string file_reverse_key(int file_id) { + std::string key("r|"); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + return key; } -void IndexDatabase::init_bloom_schema() { - exec_schema(db_.get(), BLOOM_SCHEMA, "init_bloom_schema"); +std::string next_file_id_key() { return "_next_file_id"; } +std::string schema_version_key() { return "_schema_version"; } + +std::string encode_file_record(int file_id, std::uint64_t file_hash) { + std::string value; + rocks::KeyCodec::append_be32(value, static_cast(file_id)); + append_u64(value, 0); + append_u64(value, 0); + append_u64(value, file_hash); + return value; } -void IndexDatabase::init_manifest_schema() { - exec_schema(db_.get(), MANIFEST_SCHEMA, "init_manifest_schema"); +int decode_file_id(std::string_view record) { + if (record.size() < 4) { + throw std::runtime_error("Corrupt file record"); + } + return static_cast(rocks::KeyCodec::decode_be32(record.substr(0, 4))); +} + +std::uint64_t 
decode_file_hash(std::string_view record) { + if (record.size() < 28) { + throw std::runtime_error("Corrupt file record"); + } + return rocks::KeyCodec::decode_be64(record.substr(20, 8)); +} + +std::string prefix_for_file(int file_id) { + return rocks::KeyCodec::encode_be32(static_cast(file_id)); +} + +std::string make_hash_owner_key(int file_id, std::string_view dimension, + std::string_view hash_value) { + std::string key("o|"); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + key.push_back('\0'); + key.append(dimension); + key.push_back('\0'); + key.append(hash_value); + return key; +} + +std::string make_hash_forward_key(std::string_view dimension, + std::string_view hash_value) { + std::string key("h|"); + key.append(dimension); + key.push_back('\0'); + key.append(hash_value); + return key; +} + +std::string make_hash_reverse_key(std::string_view dimension, + std::string_view resolved_value, + std::string_view hash_value) { + std::string key("H|"); + key.append(dimension); + key.push_back('\0'); + key.append(resolved_value); + key.push_back('\0'); + key.append(hash_value); + return key; +} + +std::string make_dimension_key(int file_id, std::string_view dimension) { + std::string key("d|"); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + key.append(dimension); + return key; +} + +std::string chunk_bloom_key(int file_id, std::string_view dimension, + std::uint64_t checkpoint_idx) { + std::string key = prefix_for_file(file_id); + key.append(dimension); + key.push_back('\0'); + append_u64(key, checkpoint_idx); + return key; +} + +std::string file_bloom_key(int file_id, std::string_view dimension) { + std::string key = prefix_for_file(file_id); + key.append(dimension); + return key; +} + +std::string chunk_stats_key(int file_id, std::uint64_t checkpoint_idx) { + std::string key = prefix_for_file(file_id); + append_u64(key, checkpoint_idx); + return key; +} + +std::string checkpoint_key(int file_id, std::uint64_t uc_offset, + std::uint64_t 
checkpoint_idx) { + std::string key = prefix_for_file(file_id); + append_u64(key, uc_offset); + append_u64(key, checkpoint_idx); + return key; +} + +std::string chunk_dim_stats_key(int file_id, std::uint64_t checkpoint_idx, + std::string_view dimension) { + std::string key = prefix_for_file(file_id); + append_u64(key, checkpoint_idx); + key.append(dimension); + return key; +} + +std::string manifest_event_key(int file_id, std::uint64_t checkpoint_idx, + std::string_view cat, std::string_view name) { + std::string key("E|"); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + append_u64(key, checkpoint_idx); + key.append(cat); + key.push_back('\0'); + key.append(name); + return key; +} + +std::string manifest_metadata_key(int file_id, std::uint64_t checkpoint_idx, + std::string_view meta_type) { + std::string key("M|"); + rocks::KeyCodec::append_be32(key, static_cast(file_id)); + append_u64(key, checkpoint_idx); + key.append(meta_type); + return key; +} + +std::string metadata_key(int file_id) { return prefix_for_file(file_id); } + +std::string tar_archive_key(int file_id) { return prefix_for_file(file_id); } + +std::string tar_file_key(int file_id, std::uint64_t uncompressed_offset, + std::string_view file_name) { + std::string key = prefix_for_file(file_id); + append_u64(key, uncompressed_offset); + key.push_back('\0'); + key.append(file_name); + return key; +} + +std::string encode_bloom_value(std::span blob, + std::uint64_t num_entries) { + std::string value; + append_u64(value, num_entries); + value.append(reinterpret_cast(blob.data()), blob.size()); + return value; +} + +IndexDatabase::ChunkBloomResult decode_chunk_bloom(std::string_view key, + std::string_view value, + std::size_t prefix_size) { + IndexDatabase::ChunkBloomResult result; + auto checkpoint_pos = key.find('\0', prefix_size); + if (checkpoint_pos == std::string_view::npos || + checkpoint_pos + 1 + 8 > key.size()) { + throw std::runtime_error("Corrupt chunk bloom key"); + } + 
result.checkpoint_idx = + rocks::KeyCodec::decode_be64(key.substr(checkpoint_pos + 1, 8)); + if (value.size() < 8) { + throw std::runtime_error("Corrupt chunk bloom value"); + } + result.num_entries = rocks::KeyCodec::decode_be64(value.substr(0, 8)); + result.bloom_data.assign(value.begin() + 8, value.end()); + return result; +} + +IndexDatabase::FileBloomResult decode_file_bloom(std::string_view value) { + if (value.size() < 8) { + throw std::runtime_error("Corrupt file bloom value"); + } + IndexDatabase::FileBloomResult result; + result.num_entries = rocks::KeyCodec::decode_be64(value.substr(0, 8)); + result.bloom_data.assign(value.begin() + 8, value.end()); + return result; +} + +std::string encode_chunk_statistics_value( + const IndexDatabase::ChunkStatistics& stats) { + std::string value; + append_u64(value, stats.total_events); + append_u64(value, stats.min_timestamp_us); + append_u64(value, stats.max_timestamp_us); + append_i64(value, stats.duration_sum_us); + append_u64(value, stats.duration_min_us); + append_u64(value, stats.duration_max_us); + append_u64(value, stats.duration_count); + append_double(value, stats.duration_m2); + + auto duration_sketch = stats.duration_sketch.serialize(); + append_blob(value, duration_sketch); + + auto duration_histogram = stats.duration_histogram.to_json(); + append_string(value, duration_histogram); + + auto name_sketches = stats.serialize_name_duration_sketches(); + append_blob(value, name_sketches); + append_string(value, stats.name_duration_histograms_json()); + append_string(value, stats.name_duration_sums_json()); + append_string(value, stats.name_duration_sum_sqs_json()); + append_string(value, stats.name_category_json()); + return value; +} + +IndexDatabase::ChunkStatistics decode_chunk_statistics_value( + std::string_view value) { + Cursor cursor(value); + IndexDatabase::ChunkStatistics stats; + stats.total_events = cursor.u64(); + stats.min_timestamp_us = cursor.u64(); + stats.max_timestamp_us = cursor.u64(); + 
stats.duration_sum_us = cursor.i64(); + stats.duration_min_us = cursor.u64(); + stats.duration_max_us = cursor.u64(); + stats.duration_count = cursor.u64(); + stats.duration_m2 = cursor.f64(); + + auto duration_sketch = cursor.blob(); + if (!duration_sketch.empty()) { + stats.duration_sketch = common::statistics::DDSketch::deserialize( + duration_sketch.data(), duration_sketch.size()); + } + + auto duration_histogram = cursor.str(); + if (!duration_histogram.empty()) { + stats.duration_histogram = + common::statistics::Log2Histogram::from_json(duration_histogram); + } + + auto name_sketches = cursor.blob(); + if (!name_sketches.empty()) { + stats.name_duration_sketches = + IndexDatabase::ChunkStatistics::deserialize_name_duration_sketches( + name_sketches.data(), name_sketches.size()); + } + + stats.name_duration_histograms = + IndexDatabase::ChunkStatistics::parse_histogram_map_json(cursor.str()); + stats.name_duration_sums = + IndexDatabase::ChunkStatistics::parse_double_map_json(cursor.str()); + stats.name_duration_sum_sqs = + IndexDatabase::ChunkStatistics::parse_double_map_json(cursor.str()); + stats.name_category = + IndexDatabase::ChunkStatistics::parse_string_map_json(cursor.str()); + return stats; +} + +std::string encode_checkpoint_value( + const IndexDatabase::IndexerCheckpoint& checkpoint) { + std::string value; + append_u64(value, checkpoint.uc_size); + append_u64(value, checkpoint.c_offset); + append_u64(value, checkpoint.c_size); + append_i64(value, checkpoint.bits); + append_blob(value, checkpoint.dict_compressed); + append_u64(value, checkpoint.num_lines); + append_u64(value, checkpoint.first_line_num); + append_u64(value, checkpoint.last_line_num); + return value; +} + +IndexDatabase::IndexerCheckpoint decode_checkpoint(std::string_view key, + std::string_view value) { + if (key.size() < 20) { + throw std::runtime_error("Corrupt checkpoint key"); + } + + IndexDatabase::IndexerCheckpoint checkpoint; + checkpoint.uc_offset = 
rocks::KeyCodec::decode_be64(key.substr(4, 8)); + checkpoint.checkpoint_idx = rocks::KeyCodec::decode_be64(key.substr(12, 8)); + + Cursor cursor(value); + checkpoint.uc_size = cursor.u64(); + checkpoint.c_offset = cursor.u64(); + checkpoint.c_size = cursor.u64(); + checkpoint.bits = static_cast(cursor.i64()); + checkpoint.dict_compressed = cursor.blob(); + checkpoint.num_lines = cursor.u64(); + checkpoint.first_line_num = cursor.u64(); + checkpoint.last_line_num = cursor.u64(); + return checkpoint; +} + +std::string encode_chunk_dimension_stats_value( + const IndexDatabase::ChunkDimensionStats& stats, + std::size_t value_counts_cap) { + std::string value; + append_u64(value, stats.distinct_count); + append_string(value, stats.min_value); + append_string(value, stats.max_value); + append_string(value, stats.value_type); + auto compressed = stats.compress_value_counts(value_counts_cap); + append_u8(value, compressed.has_value() ? 1 : 0); + if (compressed) { + append_blob(value, *compressed); + } + return value; +} + +IndexDatabase::ChunkDimensionStatsResult decode_chunk_dimension_stats_value( + std::string_view key, std::string_view value) { + IndexDatabase::ChunkDimensionStatsResult result; + if (key.size() < 12) { + throw std::runtime_error("Corrupt chunk dimension stats key"); + } + result.checkpoint_idx = rocks::KeyCodec::decode_be64(key.substr(4, 8)); + result.dimension = std::string(key.substr(12)); + + Cursor cursor(value); + result.distinct_count = cursor.u64(); + result.min_value = cursor.str(); + result.max_value = cursor.str(); + result.value_type = cursor.str(); + if (cursor.u8() != 0) { + auto compressed = cursor.blob(); + result.value_counts = + IndexDatabase::ChunkDimensionStats::decompress_value_counts( + compressed.data(), compressed.size()); + } + return result; +} + +std::string encode_event_range_value(std::span lines) { + std::vector vec(lines.begin(), lines.end()); + auto blob = queries::pack_line_numbers(vec); + std::string value; + 
append_u64(value, vec.size()); + append_blob(value, blob); + return value; +} + +std::vector decode_line_numbers(Cursor& cursor) { + auto blob = cursor.blob(); + return queries::unpack_line_numbers(blob.data(), blob.size()); +} + +std::string encode_metadata_value(std::span lines) { + std::vector vec(lines.begin(), lines.end()); + auto blob = queries::pack_line_numbers(vec); + std::string value; + append_blob(value, blob); + return value; +} + +std::string encode_metadata_record(std::uint64_t checkpoint_size, + std::uint64_t total_lines, + std::uint64_t total_uc_size) { + std::string value; + append_u64(value, checkpoint_size); + append_u64(value, total_lines); + append_u64(value, total_uc_size); + return value; +} + +std::string encode_tar_archive_value(std::string_view archive_name, + std::uint64_t checkpoint_size, + std::uint64_t total_lines, + std::uint64_t total_uc_size, + std::uint64_t total_files) { + std::string value; + append_string(value, archive_name); + append_u64(value, checkpoint_size); + append_u64(value, total_lines); + append_u64(value, total_uc_size); + append_u64(value, total_files); + return value; +} + +IndexDatabase::TarArchiveMetadata decode_tar_archive_value( + std::string_view value) { + Cursor cursor(value); + IndexDatabase::TarArchiveMetadata metadata; + metadata.archive_name = cursor.str(); + metadata.checkpoint_size = cursor.u64(); + metadata.total_lines = cursor.u64(); + metadata.total_uc_size = cursor.u64(); + metadata.total_files = cursor.u64(); + return metadata; +} + +std::string encode_tar_file_value(const IndexDatabase::TarFileRecord& record) { + std::string value; + append_u64(value, record.file_size); + append_u64(value, record.file_mtime); + append_u8(value, static_cast(record.typeflag)); + append_u64(value, record.data_offset); + return value; +} + +IndexDatabase::TarFileRecord decode_tar_file(std::string_view key, + std::string_view value) { + if (key.size() < 13) { + throw std::runtime_error("Corrupt tar file key"); + } + 
+ const auto name_pos = key.find('\0', 12); + if (name_pos == std::string_view::npos) { + throw std::runtime_error("Corrupt tar file key"); + } + + Cursor cursor(value); + IndexDatabase::TarFileRecord record; + record.uncompressed_offset = rocks::KeyCodec::decode_be64(key.substr(4, 8)); + record.file_name = std::string(key.substr(name_pos + 1)); + record.file_size = cursor.u64(); + record.file_mtime = cursor.u64(); + record.typeflag = static_cast(cursor.u8()); + record.data_offset = cursor.u64(); + return record; +} + +std::array decode_metadata_record(std::string_view value) { + Cursor cursor(value); + return {cursor.u64(), cursor.u64(), cursor.u64()}; +} + +std::string iterator_value(::rocksdb::Iterator& it) { + const auto slice = it.value(); + return std::string(slice.data(), slice.size()); +} + +std::string iterator_key(::rocksdb::Iterator& it) { + const auto slice = it.key(); + return std::string(slice.data(), slice.size()); +} + +template +void scan_prefix(const rocks::RocksDatabase& db, std::string_view column_family, + std::string_view prefix, Fn&& fn) { + internal::scan_prefix_iterator( + "Failed to scan RocksDB prefix", prefix, + [&] { return db.new_iterator(column_family); }, std::forward(fn)); +} + +} // namespace + +IndexDatabase::IndexDatabase(const std::string& index_path, + rocks::RocksDatabase::OpenMode open_mode) + : db_path_(internal::normalize_index_root(index_path)), + open_mode_(open_mode), + db_(rocks::RocksDBManager::instance().get_or_open(db_path_, open_mode_)) { + if (open_mode_ == rocks::RocksDatabase::OpenMode::ReadWrite) { + init_base_schema(); + } } -// --------------------------------------------------------------------------- -// Query helpers -// --------------------------------------------------------------------------- +void IndexDatabase::init_base_schema() { + std::string value; + auto status = db_->get(schema_version_key(), &value); + if (status.IsNotFound()) { + status = db_->put(schema_version_key(), + 
rocks::KeyCodec::encode_be32(kSchemaVersion)); + if (!status.ok()) { + throw_db_error("Failed to initialize schema version", status); + } + } else if (!status.ok()) { + throw_db_error("Failed to read schema version", status); + } +} -// Returns true if the named table exists in the database. -static bool table_exists(sqlite3* db, const char* table_name) { - SqliteStmt stmt(db, - "SELECT 1 FROM sqlite_master " - "WHERE type='table' AND name=?;"); - stmt.bind_text(1, table_name); - return sqlite3_step(stmt) == SQLITE_ROW; +void IndexDatabase::init_bloom_schema() { + // RocksDB column families are provisioned at DB open; bloom-specific + // schema initialization is intentionally a no-op. +} + +void IndexDatabase::init_manifest_schema() { + // RocksDB column families are provisioned at DB open; manifest-specific + // schema initialization is intentionally a no-op. } bool IndexDatabase::has_bloom_data(int file_id) const { - if (!table_exists(db_.get(), "chunk_bloom_filters")) return false; - SqliteStmt stmt(db_.get(), - "SELECT 1 FROM chunk_bloom_filters " - "WHERE file_info_id=? LIMIT 1;"); - stmt.bind_int(1, file_id); - return sqlite3_step(stmt) == SQLITE_ROW; + bool found = false; + auto prefix = prefix_for_file(file_id); + scan_prefix(*db_, "chunk_bloom", prefix, + [&found](::rocksdb::Iterator&) { found = true; }); + return found; } bool IndexDatabase::has_manifest_data(int file_id) const { - if (!table_exists(db_.get(), "checkpoint_event_ranges")) return false; - SqliteStmt stmt(db_.get(), - "SELECT 1 FROM checkpoint_event_ranges " - "WHERE file_info_id=? 
LIMIT 1;"); - stmt.bind_int(1, file_id); - return sqlite3_step(stmt) == SQLITE_ROW; + bool found = false; + std::string prefix("E|"); + rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); + scan_prefix(*db_, "manifest", prefix, + [&found](::rocksdb::Iterator&) { found = true; }); + return found; } int IndexDatabase::get_or_create_file_info(std::string_view path, std::uint64_t file_hash) { - { - SqliteStmt stmt(db_.get(), - "SELECT id, hash FROM files WHERE logical_name=?;"); - stmt.bind_text(1, path); - if (sqlite3_step(stmt) == SQLITE_ROW) { - int id = sqlite3_column_int(stmt, 0); - auto stored = - static_cast(sqlite3_column_int64(stmt, 1)); - if (stored == file_hash) return id; - SqliteStmt del(db_.get(), "DELETE FROM files WHERE id=?;"); - del.bind_int(1, id); - sqlite3_step(del); + const auto logical_name = std::string(path); + const auto lookup = file_lookup_key(logical_name); + std::string existing; + auto status = db_->get(lookup, &existing); + if (status.ok()) { + const auto file_id = decode_file_id(existing); + if (decode_file_hash(existing) == file_hash) { + return file_id; } + delete_file_data(file_id); + auto registry = encode_file_record(file_id, file_hash); + if (txn_batch_) { + status = db_->put(*txn_batch_, "default", lookup, registry); + if (!status.ok()) { + throw_db_error("Failed to update file registry", status); + } + status = db_->put(*txn_batch_, "default", file_reverse_key(file_id), + logical_name); + if (!status.ok()) { + throw_db_error("Failed to update reverse file registry", + status); + } + } else { + status = db_->put(lookup, registry); + if (!status.ok()) { + throw_db_error("Failed to update file registry", status); + } + status = db_->put(file_reverse_key(file_id), logical_name); + if (!status.ok()) { + throw_db_error("Failed to update reverse file registry", + status); + } + } + return file_id; + } + if (!status.IsNotFound()) { + throw_db_error("Failed to query file registry", status); } - SqliteStmt stmt( - db_.get(), - 
"INSERT INTO files(logical_name, byte_size, mtime_unix, hash)" - " VALUES(?, 0, 0, ?);"); - stmt.bind_text(1, path); - stmt.bind_int64(2, static_cast(file_hash)); - if (sqlite3_step(stmt) != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert into files: " + - std::string(sqlite3_errmsg(db_.get()))); + std::uint32_t next_id = 1; + std::string next_value; + status = db_->get(next_file_id_key(), &next_value); + if (status.ok()) { + next_id = rocks::KeyCodec::decode_be32(next_value); + } else if (!status.IsNotFound()) { + throw_db_error("Failed to read next file id", status); } - return static_cast(sqlite3_last_insert_rowid(db_.get())); + + const auto file_id = static_cast(next_id); + const auto new_registry = encode_file_record(file_id, file_hash); + const auto next_registry = rocks::KeyCodec::encode_be32(next_id + 1); + + if (txn_batch_) { + status = db_->put(*txn_batch_, "default", lookup, new_registry); + if (!status.ok()) { + throw_db_error("Failed to insert file registry", status); + } + status = db_->put(*txn_batch_, "default", file_reverse_key(file_id), + logical_name); + if (!status.ok()) { + throw_db_error("Failed to insert reverse file registry", status); + } + status = + db_->put(*txn_batch_, "default", next_file_id_key(), next_registry); + if (!status.ok()) { + throw_db_error("Failed to update next file id", status); + } + } else { + status = db_->put(lookup, new_registry); + if (!status.ok()) { + throw_db_error("Failed to insert file registry", status); + } + status = db_->put(file_reverse_key(file_id), logical_name); + if (!status.ok()) { + throw_db_error("Failed to insert reverse file registry", status); + } + status = db_->put(next_file_id_key(), next_registry); + if (!status.ok()) { + throw_db_error("Failed to update next file id", status); + } + } + + return file_id; } int IndexDatabase::get_file_info_id(std::string_view path) const { - SqliteStmt stmt(db_.get(), "SELECT id FROM files WHERE logical_name=?;"); 
- stmt.bind_text(1, path); - if (sqlite3_step(stmt) == SQLITE_ROW) { - return sqlite3_column_int(stmt, 0); + std::string value; + auto status = db_->get(file_lookup_key(path), &value); + if (status.IsNotFound()) { + return -1; + } + if (!status.ok()) { + throw_db_error("Failed to look up file info id", status); + } + return decode_file_id(value); +} + +std::optional IndexDatabase::get_file_hash( + std::string_view path) const { + std::string value; + auto status = db_->get(file_lookup_key(path), &value); + if (status.IsNotFound()) { + return std::nullopt; } - return -1; + if (!status.ok()) { + throw_db_error("Failed to look up file hash", status); + } + return decode_file_hash(value); +} + +int IndexDatabase::find_file(std::string_view file_path) const { + return get_file_info_id(internal::get_logical_path(file_path)); } void IndexDatabase::begin_transaction() { - exec_schema(db_.get(), "BEGIN TRANSACTION;", "begin_transaction"); + txn_batch_ = + std::make_unique(db_->begin_batch()); } void IndexDatabase::commit_transaction() { - exec_schema(db_.get(), "COMMIT;", "commit_transaction"); + if (!txn_batch_) { + return; + } + auto status = db_->commit_batch(*txn_batch_); + txn_batch_.reset(); + if (!status.ok()) { + throw_db_error("Failed to commit RocksDB batch", status); + } } -// --------------------------------------------------------------------------- -// Bloom insert operations -// --------------------------------------------------------------------------- +void IndexDatabase::rollback_transaction() noexcept { txn_batch_.reset(); } void IndexDatabase::insert_chunk_bloom_filter( int file_id, std::uint64_t checkpoint_idx, std::string_view dimension, std::span blob_data, std::uint64_t num_entries) { - queries::insert_chunk_bloom_filter( - db_, file_id, checkpoint_idx, dimension, blob_data.data(), - static_cast(blob_data.size()), num_entries); + const auto key = chunk_bloom_key(file_id, dimension, checkpoint_idx); + const auto value = encode_bloom_value(blob_data, 
num_entries); + auto status = txn_batch_ ? db_->put(*txn_batch_, "chunk_bloom", key, value) + : db_->put(key, value, "chunk_bloom"); + if (!status.ok()) { + throw_db_error("Failed to insert chunk bloom filter", status); + } } void IndexDatabase::insert_chunk_bloom_filter( int file_id, std::uint64_t checkpoint_idx, std::string_view dimension, const void* blob_data, int blob_size, std::uint64_t num_entries) { - queries::insert_chunk_bloom_filter(db_, file_id, checkpoint_idx, dimension, - blob_data, blob_size, num_entries); + auto* bytes = static_cast(blob_data); + insert_chunk_bloom_filter(file_id, checkpoint_idx, dimension, + std::span( + bytes, static_cast(blob_size)), + num_entries); } void IndexDatabase::insert_file_bloom_filter( int file_id, std::string_view dimension, std::span blob_data, std::uint64_t num_entries) { - queries::insert_file_bloom_filter(db_, file_id, dimension, blob_data.data(), - static_cast(blob_data.size()), - num_entries); + const auto key = file_bloom_key(file_id, dimension); + const auto value = encode_bloom_value(blob_data, num_entries); + auto status = txn_batch_ ? 
db_->put(*txn_batch_, "file_bloom", key, value) + : db_->put(key, value, "file_bloom"); + if (!status.ok()) { + throw_db_error("Failed to insert file bloom filter", status); + } } void IndexDatabase::insert_file_bloom_filter(int file_id, @@ -312,254 +775,734 @@ void IndexDatabase::insert_file_bloom_filter(int file_id, const void* blob_data, int blob_size, std::uint64_t num_entries) { - queries::insert_file_bloom_filter(db_, file_id, dimension, blob_data, - blob_size, num_entries); + auto* bytes = static_cast(blob_data); + insert_file_bloom_filter(file_id, dimension, + std::span( + bytes, static_cast(blob_size)), + num_entries); } void IndexDatabase::insert_chunk_statistics(int file_id, std::uint64_t checkpoint_idx, const ChunkStatistics& stats) { - queries::insert_chunk_statistics(db_, file_id, checkpoint_idx, stats); + const auto key = chunk_stats_key(file_id, checkpoint_idx); + const auto value = encode_chunk_statistics_value(stats); + auto status = txn_batch_ ? db_->put(*txn_batch_, "chunk_stats", key, value) + : db_->put(key, value, "chunk_stats"); + if (!status.ok()) { + throw_db_error("Failed to insert chunk statistics", status); + } +} + +void IndexDatabase::insert_checkpoint(int file_id, + const IndexerCheckpoint& checkpoint) { + const auto key = checkpoint_key(file_id, checkpoint.uc_offset, + checkpoint.checkpoint_idx); + const auto value = encode_checkpoint_value(checkpoint); + auto status = txn_batch_ ? db_->put(*txn_batch_, "checkpoints", key, value) + : db_->put(key, value, "checkpoints"); + if (!status.ok()) { + throw_db_error("Failed to insert checkpoint", status); + } } void IndexDatabase::insert_index_dimension(int file_id, std::string_view dimension) { - queries::insert_index_dimension(db_, file_id, dimension); + const auto key = make_dimension_key(file_id, dimension); + auto status = txn_batch_ ? 
db_->put(*txn_batch_, "dimensions", key, "") + : db_->put(key, "", "dimensions"); + if (!status.ok()) { + throw_db_error("Failed to insert index dimension", status); + } +} + +void IndexDatabase::insert_hash_resolution(int file_id, + std::string_view dimension, + std::string_view hash_value, + std::string_view resolved_value) { + const auto owner = make_hash_owner_key(file_id, dimension, hash_value); + const auto forward = make_hash_forward_key(dimension, hash_value); + const auto reverse = + make_hash_reverse_key(dimension, resolved_value, hash_value); + if (txn_batch_) { + db_->put(*txn_batch_, "dimensions", owner, std::string(resolved_value)); + db_->put(*txn_batch_, "dimensions", forward, + std::string(resolved_value)); + db_->put(*txn_batch_, "dimensions", reverse, ""); + return; + } + auto status = db_->put(owner, resolved_value, "dimensions"); + if (!status.ok()) throw_db_error("Failed to insert hash owner", status); + status = db_->put(forward, resolved_value, "dimensions"); + if (!status.ok()) + throw_db_error("Failed to insert hash resolution", status); + status = db_->put(reverse, "", "dimensions"); + if (!status.ok()) { + throw_db_error("Failed to insert reverse hash resolution", status); + } } void IndexDatabase::insert_chunk_dimension_stats( int file_id, std::uint64_t checkpoint_idx, const ChunkDimensionStats& stats, std::size_t value_counts_cap) { - queries::insert_chunk_dimension_stats(db_, file_id, checkpoint_idx, stats, - value_counts_cap); + const auto key = + chunk_dim_stats_key(file_id, checkpoint_idx, stats.dimension); + const auto value = + encode_chunk_dimension_stats_value(stats, value_counts_cap); + auto status = txn_batch_ + ? 
db_->put(*txn_batch_, "chunk_dim_stats", key, value) + : db_->put(key, value, "chunk_dim_stats"); + if (!status.ok()) { + throw_db_error("Failed to insert chunk dimension stats", status); + } } -void IndexDatabase::insert_hash_resolution(int file_id, - std::string_view dimension, - std::string_view hash_value, - std::string_view resolved_value) { - queries::insert_hash_resolution(db_, file_id, dimension, hash_value, - resolved_value); +void IndexDatabase::insert_tar_archive_metadata(int file_id, + std::string_view archive_name, + std::uint64_t checkpoint_size, + std::uint64_t total_lines, + std::uint64_t total_uc_size, + std::uint64_t total_files) { + const auto key = tar_archive_key(file_id); + const auto value = encode_tar_archive_value( + archive_name, checkpoint_size, total_lines, total_uc_size, total_files); + auto status = txn_batch_ ? db_->put(*txn_batch_, "archives", key, value) + : db_->put(key, value, "archives"); + if (!status.ok()) { + throw_db_error("Failed to insert tar archive metadata", status); + } } -// --------------------------------------------------------------------------- -// Bloom query operations -// --------------------------------------------------------------------------- +void IndexDatabase::insert_tar_file(int file_id, const TarFileRecord& record) { + const auto key = + tar_file_key(file_id, record.uncompressed_offset, record.file_name); + const auto value = encode_tar_file_value(record); + auto status = txn_batch_ ? 
db_->put(*txn_batch_, "tar_files", key, value) + : db_->put(key, value, "tar_files"); + if (!status.ok()) { + throw_db_error("Failed to insert tar file metadata", status); + } +} std::vector IndexDatabase::query_chunk_bloom_filters(int file_id, std::string_view dimension) const { - return queries::query_chunk_bloom_filters(db_, file_id, dimension); + std::vector results; + std::string prefix = prefix_for_file(file_id); + prefix.append(dimension); + prefix.push_back('\0'); + scan_prefix(*db_, "chunk_bloom", prefix, [&](::rocksdb::Iterator& it) { + results.push_back(decode_chunk_bloom( + iterator_key(it), iterator_value(it), prefix.size() - 1)); + }); + return results; } std::unordered_map> IndexDatabase::query_chunk_bloom_filters_batch( int file_id, const std::vector& dimensions) const { - return queries::query_chunk_bloom_filters_batch(db_, file_id, dimensions); + std::unordered_map> results; + for (const auto& dimension : dimensions) { + results.emplace(dimension, + query_chunk_bloom_filters(file_id, dimension)); + } + return results; } std::optional IndexDatabase::query_file_bloom_filter(int file_id, std::string_view dimension) const { - return queries::query_file_bloom_filter(db_, file_id, dimension); + std::string value; + auto status = + db_->get(file_bloom_key(file_id, dimension), &value, "file_bloom"); + if (status.IsNotFound()) { + return std::nullopt; + } + if (!status.ok()) { + throw_db_error("Failed to query file bloom filter", status); + } + return decode_file_bloom(value); } std::unordered_map IndexDatabase::query_file_bloom_filters_batch( int file_id, const std::vector& dimensions) const { - return queries::query_file_bloom_filters_batch(db_, file_id, dimensions); + std::unordered_map results; + for (const auto& dimension : dimensions) { + auto bloom = query_file_bloom_filter(file_id, dimension); + if (bloom) { + results.emplace(dimension, std::move(*bloom)); + } + } + return results; } std::vector IndexDatabase::query_index_dimensions( int file_id) 
const { - return queries::query_index_dimensions(db_, file_id); + std::vector dimensions; + std::string prefix("d|"); + rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); + scan_prefix(*db_, "dimensions", prefix, [&](::rocksdb::Iterator& it) { + auto key = iterator_key(it); + dimensions.push_back(key.substr(prefix.size())); + }); + return dimensions; } bool IndexDatabase::has_index_dimension(int file_id, std::string_view dimension) const { - return queries::has_index_dimension(db_, file_id, dimension); + std::string value; + return db_ + ->get(make_dimension_key(file_id, dimension), &value, "dimensions") + .ok(); } std::vector IndexDatabase::query_chunk_statistics(int file_id) const { - return queries::query_chunk_statistics(db_, file_id); + std::vector results; + const auto prefix = prefix_for_file(file_id); + scan_prefix(*db_, "chunk_stats", prefix, [&](::rocksdb::Iterator& it) { + ChunkStatisticsResult result; + auto key = iterator_key(it); + result.checkpoint_idx = + rocks::KeyCodec::decode_be64(std::string_view(key).substr(4, 8)); + result.stats = decode_chunk_statistics_value(iterator_value(it)); + results.push_back(std::move(result)); + }); + std::sort(results.begin(), results.end(), + [](const auto& lhs, const auto& rhs) { + return lhs.checkpoint_idx < rhs.checkpoint_idx; + }); + return results; +} + +bool IndexDatabase::find_checkpoint(int file_id, std::size_t target_offset, + IndexerCheckpoint& checkpoint) const { + if (target_offset == 0 || file_id < 0) { + return false; + } + + bool found = false; + const auto prefix = prefix_for_file(file_id); + scan_prefix(*db_, "checkpoints", prefix, [&](::rocksdb::Iterator& it) { + auto decoded = decode_checkpoint(iterator_key(it), iterator_value(it)); + if (decoded.uc_offset <= target_offset && + (!found || decoded.uc_offset >= checkpoint.uc_offset)) { + checkpoint = std::move(decoded); + found = true; + } + }); + return found; +} + +std::vector IndexDatabase::query_checkpoints( + int file_id) const { + 
std::vector checkpoints; + const auto prefix = prefix_for_file(file_id); + scan_prefix(*db_, "checkpoints", prefix, [&](::rocksdb::Iterator& it) { + checkpoints.push_back( + decode_checkpoint(iterator_key(it), iterator_value(it))); + }); + std::sort(checkpoints.begin(), checkpoints.end(), + [](const auto& lhs, const auto& rhs) { + return std::tie(lhs.uc_offset, lhs.checkpoint_idx) < + std::tie(rhs.uc_offset, rhs.checkpoint_idx); + }); + return checkpoints; +} + +std::optional +IndexDatabase::query_tar_archive_metadata(int file_id) const { + std::string value; + auto status = db_->get(tar_archive_key(file_id), &value, "archives"); + if (status.IsNotFound()) { + return std::nullopt; + } + if (!status.ok()) { + throw_db_error("Failed to read tar archive metadata", status); + } + return decode_tar_archive_value(value); +} + +std::vector IndexDatabase::query_tar_files( + int file_id) const { + std::vector files; + const auto prefix = prefix_for_file(file_id); + scan_prefix(*db_, "tar_files", prefix, [&](::rocksdb::Iterator& it) { + files.push_back(decode_tar_file(iterator_key(it), iterator_value(it))); + }); + std::sort(files.begin(), files.end(), [](const auto& lhs, const auto& rhs) { + return std::tie(lhs.uncompressed_offset, lhs.file_name) < + std::tie(rhs.uncompressed_offset, rhs.file_name); + }); + return files; +} + +bool IndexDatabase::find_tar_file(int file_id, std::string_view file_name, + TarFileRecord& record) const { + for (auto& entry : query_tar_files(file_id)) { + if (entry.file_name == file_name) { + record = std::move(entry); + return true; + } + } + return false; +} + +std::vector +IndexDatabase::query_tar_files_in_range(int file_id, std::uint64_t start_offset, + std::uint64_t end_offset) const { + std::vector files; + for (auto& entry : query_tar_files(file_id)) { + const auto entry_end = entry.uncompressed_offset + entry.file_size; + if (entry.uncompressed_offset < end_offset && + entry_end > start_offset) { + files.push_back(std::move(entry)); + } + 
} + return files; +} + +std::vector +IndexDatabase::query_checkpoints_for_line_range(int file_id, + std::uint64_t start_line, + std::uint64_t end_line) const { + std::vector checkpoints; + for (auto& checkpoint : query_checkpoints(file_id)) { + if ((checkpoint.first_line_num <= end_line && + checkpoint.last_line_num >= start_line) || + (checkpoint.first_line_num <= start_line && + checkpoint.last_line_num >= end_line)) { + checkpoints.push_back(std::move(checkpoint)); + } + } + return checkpoints; } IndexDatabase::TimeBounds IndexDatabase::query_time_bounds(int file_id) const { - return queries::query_time_bounds(db_, file_id); + TimeBounds bounds; + for (const auto& row : query_chunk_statistics(file_id)) { + const auto min_ts = row.stats.min_timestamp_us; + const auto max_ts = row.stats.max_timestamp_us; + if (min_ts == std::numeric_limits::max() || + max_ts == 0) { + continue; + } + bounds.valid = true; + bounds.min_timestamp_us = std::min(bounds.min_timestamp_us, min_ts); + bounds.max_timestamp_us = std::max(bounds.max_timestamp_us, max_ts); + } + return bounds; } std::vector IndexDatabase::query_chunk_dimension_stats(int file_id) const { - return queries::query_chunk_dimension_stats(db_, file_id); + std::vector results; + const auto prefix = prefix_for_file(file_id); + scan_prefix(*db_, "chunk_dim_stats", prefix, [&](::rocksdb::Iterator& it) { + results.push_back(decode_chunk_dimension_stats_value( + iterator_key(it), iterator_value(it))); + }); + std::sort(results.begin(), results.end(), + [](const auto& lhs, const auto& rhs) { + return std::tie(lhs.checkpoint_idx, lhs.dimension) < + std::tie(rhs.checkpoint_idx, rhs.dimension); + }); + return results; } std::vector IndexDatabase::query_chunk_dimension_stats_for_dimension( int file_id, std::string_view dimension) const { - return queries::query_chunk_dimension_stats_for_dimension(db_, file_id, - dimension); + std::vector results; + const auto prefix = prefix_for_file(file_id); + scan_prefix(*db_, 
"chunk_dim_stats", prefix, [&](::rocksdb::Iterator& it) { + auto decoded = decode_chunk_dimension_stats_value(iterator_key(it), + iterator_value(it)); + if (decoded.dimension == dimension) { + results.push_back(std::move(decoded)); + } + }); + std::sort(results.begin(), results.end(), + [](const auto& lhs, const auto& rhs) { + return lhs.checkpoint_idx < rhs.checkpoint_idx; + }); + return results; } std::optional IndexDatabase::query_resolved_by_hash( std::string_view dimension, std::string_view hash_value) const { - return queries::query_resolved_by_hash(db_, dimension, hash_value); + std::string value; + auto status = db_->get(make_hash_forward_key(dimension, hash_value), &value, + "dimensions"); + if (status.IsNotFound()) { + return std::nullopt; + } + if (!status.ok()) { + throw_db_error("Failed to query resolved hash", status); + } + return value; } std::vector IndexDatabase::query_hash_by_resolved( std::string_view dimension, std::string_view resolved_value) const { - return queries::query_hash_by_resolved(db_, dimension, resolved_value); + std::vector hashes; + auto prefix = make_hash_reverse_key(dimension, resolved_value, ""); + scan_prefix(*db_, "dimensions", prefix, [&](::rocksdb::Iterator& it) { + auto key = iterator_key(it); + hashes.push_back(key.substr(prefix.size())); + }); + return hashes; } -// --------------------------------------------------------------------------- -// Bloom delete operations -// --------------------------------------------------------------------------- - void IndexDatabase::delete_chunk_bloom_filters(int file_id, std::string_view dimension) { - queries::delete_chunk_bloom_filters(db_, file_id, dimension); + std::vector keys; + std::string prefix = prefix_for_file(file_id); + prefix.append(dimension); + prefix.push_back('\0'); + scan_prefix(*db_, "chunk_bloom", prefix, [&](::rocksdb::Iterator& it) { + keys.push_back(iterator_key(it)); + }); + for (const auto& key : keys) { + auto status = txn_batch_ ? 
db_->del(*txn_batch_, "chunk_bloom", key) + : db_->del(key, "chunk_bloom"); + if (!status.ok()) + throw_db_error("Failed to delete chunk bloom", status); + } } void IndexDatabase::delete_file_bloom_filter(int file_id, std::string_view dimension) { - queries::delete_file_bloom_filter(db_, file_id, dimension); + auto status = + txn_batch_ ? db_->del(*txn_batch_, "file_bloom", + file_bloom_key(file_id, dimension)) + : db_->del(file_bloom_key(file_id, dimension), "file_bloom"); + if (!status.ok() && !status.IsNotFound()) { + throw_db_error("Failed to delete file bloom", status); + } } void IndexDatabase::delete_chunk_statistics(int file_id) { - queries::delete_chunk_statistics(db_, file_id); + std::vector keys; + scan_prefix( + *db_, "chunk_stats", prefix_for_file(file_id), + [&](::rocksdb::Iterator& it) { keys.push_back(iterator_key(it)); }); + for (const auto& key : keys) { + auto status = txn_batch_ ? db_->del(*txn_batch_, "chunk_stats", key) + : db_->del(key, "chunk_stats"); + if (!status.ok()) { + throw_db_error("Failed to delete chunk statistics", status); + } + } } void IndexDatabase::delete_chunk_dimension_stats(int file_id) { - queries::delete_chunk_dimension_stats(db_, file_id); + std::vector keys; + scan_prefix( + *db_, "chunk_dim_stats", prefix_for_file(file_id), + [&](::rocksdb::Iterator& it) { keys.push_back(iterator_key(it)); }); + for (const auto& key : keys) { + auto status = txn_batch_ ? 
db_->del(*txn_batch_, "chunk_dim_stats", key) + : db_->del(key, "chunk_dim_stats"); + if (!status.ok()) { + throw_db_error("Failed to delete chunk dimension stats", status); + } + } } void IndexDatabase::delete_hash_resolutions(int file_id) { - queries::delete_hash_resolutions(db_, file_id); + std::vector> owned; + std::string prefix("o|"); + rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); + prefix.push_back('\0'); + scan_prefix(*db_, "dimensions", prefix, [&](::rocksdb::Iterator& it) { + owned.emplace_back(iterator_key(it), iterator_value(it)); + }); + for (const auto& [owner_key, resolved] : owned) { + if (owner_key.size() <= prefix.size()) { + DFTRACER_UTILS_LOG_WARN( + "Skipping malformed owner key for file_id=%d", file_id); + continue; + } + const std::string_view payload(owner_key.data() + prefix.size(), + owner_key.size() - prefix.size()); + auto split = payload.find('\0'); + if (split == std::string_view::npos) { + DFTRACER_UTILS_LOG_WARN( + "Skipping malformed owner key payload for file_id=%d", file_id); + continue; + } + auto dimension = payload.substr(0, split); + auto hash_value = payload.substr(split + 1); + auto forward = make_hash_forward_key(dimension, hash_value); + auto reverse = make_hash_reverse_key(dimension, resolved, hash_value); + const auto del_one = [&](std::string_view key) { + auto status = txn_batch_ ? 
db_->del(*txn_batch_, "dimensions", key) + : db_->del(key, "dimensions"); + if (!status.ok() && !status.IsNotFound()) { + throw_db_error("Failed to delete hash resolution", status); + } + }; + del_one(owner_key); + del_one(forward); + del_one(reverse); + } } -// --------------------------------------------------------------------------- -// Manifest insert operations -// --------------------------------------------------------------------------- - void IndexDatabase::insert_event_range( int file_id, std::uint64_t checkpoint_idx, std::string_view cat, std::string_view name, std::span line_numbers) { - queries::insert_event_range(db_, file_id, checkpoint_idx, cat, name, - line_numbers); + const auto key = manifest_event_key(file_id, checkpoint_idx, cat, name); + const auto value = encode_event_range_value(line_numbers); + auto status = txn_batch_ ? db_->put(*txn_batch_, "manifest", key, value) + : db_->put(key, value, "manifest"); + if (!status.ok()) { + throw_db_error("Failed to insert event range", status); + } } void IndexDatabase::insert_event_range( int file_id, std::uint64_t checkpoint_idx, std::string_view cat, std::string_view name, const std::vector& line_numbers) { - queries::insert_event_range(db_, file_id, checkpoint_idx, cat, name, - line_numbers); + insert_event_range(file_id, checkpoint_idx, cat, name, + std::span(line_numbers)); } void IndexDatabase::insert_metadata_lines( int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type, std::span line_numbers) { - queries::insert_metadata_lines(db_, file_id, checkpoint_idx, meta_type, - line_numbers); + const auto key = manifest_metadata_key(file_id, checkpoint_idx, meta_type); + const auto value = encode_metadata_value(line_numbers); + auto status = txn_batch_ ? 
db_->put(*txn_batch_, "manifest", key, value) + : db_->put(key, value, "manifest"); + if (!status.ok()) { + throw_db_error("Failed to insert metadata lines", status); + } } void IndexDatabase::insert_metadata_lines( int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type, const std::vector& line_numbers) { - queries::insert_metadata_lines(db_, file_id, checkpoint_idx, meta_type, - line_numbers); + insert_metadata_lines(file_id, checkpoint_idx, meta_type, + std::span(line_numbers)); } -// --------------------------------------------------------------------------- -// Manifest query operations -// --------------------------------------------------------------------------- - std::vector IndexDatabase::query_event_ranges( int file_id) const { - return queries::query_event_ranges(db_, file_id); + std::vector results; + std::string prefix("E|"); + rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); + scan_prefix(*db_, "manifest", prefix, [&](::rocksdb::Iterator& it) { + auto key = iterator_key(it); + auto payload = std::string_view(key).substr(2 + 4 + 8); + auto split = payload.find('\0'); + if (split == std::string_view::npos) { + throw std::runtime_error("Corrupt manifest event key"); + } + EventRangeResult result; + result.checkpoint_idx = + rocks::KeyCodec::decode_be64(std::string_view(key).substr(6, 8)); + result.cat = std::string(payload.substr(0, split)); + result.name = std::string(payload.substr(split + 1)); + auto value = iterator_value(it); + Cursor cursor(value); + result.event_count = cursor.u64(); + result.line_numbers = decode_line_numbers(cursor); + results.push_back(std::move(result)); + }); + std::sort(results.begin(), results.end(), + [](const auto& lhs, const auto& rhs) { + return std::tie(lhs.checkpoint_idx, lhs.cat, lhs.name) < + std::tie(rhs.checkpoint_idx, rhs.cat, rhs.name); + }); + return results; } std::vector IndexDatabase::query_event_ranges_for_checkpoint( int file_id, std::uint64_t checkpoint_idx) const { - return 
queries::query_event_ranges_for_checkpoint(db_, file_id, - checkpoint_idx); + std::vector results; + for (auto& range : query_event_ranges(file_id)) { + if (range.checkpoint_idx == checkpoint_idx) { + results.push_back(std::move(range)); + } + } + return results; } std::vector IndexDatabase::query_metadata_lines(int file_id) const { - return queries::query_metadata_lines(db_, file_id); + std::vector results; + std::string prefix("M|"); + rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); + scan_prefix(*db_, "manifest", prefix, [&](::rocksdb::Iterator& it) { + auto key = iterator_key(it); + MetadataLinesResult result; + result.checkpoint_idx = + rocks::KeyCodec::decode_be64(std::string_view(key).substr(6, 8)); + result.meta_type = key.substr(14); + auto value = iterator_value(it); + Cursor cursor(value); + result.line_numbers = decode_line_numbers(cursor); + results.push_back(std::move(result)); + }); + std::sort(results.begin(), results.end(), + [](const auto& lhs, const auto& rhs) { + return std::tie(lhs.checkpoint_idx, lhs.meta_type) < + std::tie(rhs.checkpoint_idx, rhs.meta_type); + }); + return results; } std::vector IndexDatabase::query_metadata_lines_for_checkpoint( int file_id, std::uint64_t checkpoint_idx) const { - return queries::query_metadata_lines_for_checkpoint(db_, file_id, - checkpoint_idx); + std::vector results; + for (auto& lines : query_metadata_lines(file_id)) { + if (lines.checkpoint_idx == checkpoint_idx) { + results.push_back(std::move(lines)); + } + } + return results; } -// --------------------------------------------------------------------------- -// Manifest delete operations -// --------------------------------------------------------------------------- - void IndexDatabase::delete_event_ranges(int file_id) { - queries::delete_event_ranges(db_, file_id); + std::vector keys; + std::string prefix("E|"); + rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); + scan_prefix(*db_, "manifest", prefix, [&](::rocksdb::Iterator& 
it) { + keys.push_back(iterator_key(it)); + }); + for (const auto& key : keys) { + auto status = txn_batch_ ? db_->del(*txn_batch_, "manifest", key) + : db_->del(key, "manifest"); + if (!status.ok()) { + throw_db_error("Failed to delete manifest event ranges", status); + } + } } void IndexDatabase::delete_metadata_lines(int file_id) { - queries::delete_metadata_lines(db_, file_id); + std::vector keys; + std::string prefix("M|"); + rocks::KeyCodec::append_be32(prefix, static_cast(file_id)); + scan_prefix(*db_, "manifest", prefix, [&](::rocksdb::Iterator& it) { + keys.push_back(iterator_key(it)); + }); + for (const auto& key : keys) { + auto status = txn_batch_ ? db_->del(*txn_batch_, "manifest", key) + : db_->del(key, "manifest"); + if (!status.ok()) { + throw_db_error("Failed to delete metadata lines", status); + } + } } std::uint64_t IndexDatabase::get_total_events(int file_id) const { - // Exact count from chunk_statistics (populated when bloom was built) - try { - SqliteStmt stmt(db_, - "SELECT SUM(total_events) FROM chunk_statistics " - "WHERE file_info_id = ?;"); - stmt.bind_int(1, file_id); - if (sqlite3_step(stmt) == SQLITE_ROW && - sqlite3_column_type(stmt, 0) != SQLITE_NULL) { - auto val = sqlite3_column_int64(stmt, 0); - if (val > 0) return static_cast(val); - } - } catch (...) { - // Table may not exist if bloom was never built + std::uint64_t total = 0; + for (const auto& row : query_chunk_statistics(file_id)) { + total += row.stats.total_events; } - // Fallback: num_lines (approximate, might include array delimiters) - return get_num_lines(file_id); + return total > 0 ? 
total : get_num_lines(file_id); } -int IndexDatabase::find_file(std::string_view file_path) const { - auto logical = internal::get_logical_path(file_path); - return get_file_info_id(logical); +void IndexDatabase::insert_file_metadata(int file_id, + std::uint64_t checkpoint_size, + std::uint64_t total_lines, + std::uint64_t total_uc_size) { + const auto key = metadata_key(file_id); + const auto value = + encode_metadata_record(checkpoint_size, total_lines, total_uc_size); + auto status = txn_batch_ ? db_->put(*txn_batch_, "metadata", key, value) + : db_->put(key, value, "metadata"); + if (!status.ok()) { + throw_db_error("Failed to insert metadata", status); + } +} + +std::uint64_t IndexDatabase::get_checkpoint_size(int file_id) const { + std::string value; + auto status = db_->get(metadata_key(file_id), &value, "metadata"); + if (status.IsNotFound()) { + return 0; + } + if (!status.ok()) { + throw_db_error("Failed to read metadata", status); + } + return decode_metadata_record(value)[0]; } std::uint64_t IndexDatabase::get_num_lines(int file_id) const { - SqliteStmt stmt(db_, "SELECT total_lines FROM metadata WHERE file_id = ?;"); - stmt.bind_int(1, file_id); - if (sqlite3_step(stmt) == SQLITE_ROW) { - return static_cast(sqlite3_column_int64(stmt, 0)); + std::string value; + auto status = db_->get(metadata_key(file_id), &value, "metadata"); + if (status.IsNotFound()) { + return 0; + } + if (!status.ok()) { + throw_db_error("Failed to read metadata", status); } - return 0; + return decode_metadata_record(value)[1]; } std::uint64_t IndexDatabase::get_max_bytes(int file_id) const { - // Primary: metadata table has the authoritative total uncompressed size - SqliteStmt stmt(db_, - "SELECT total_uc_size FROM metadata WHERE file_id = ?;"); - stmt.bind_int(1, file_id); - if (sqlite3_step(stmt) == SQLITE_ROW) { - auto val = sqlite3_column_int64(stmt, 0); - if (val > 0) return static_cast(val); - } - // Fallback: sum from checkpoints - SqliteStmt stmt2( - db_, - "SELECT 
MAX(uc_offset + uc_size) FROM checkpoints WHERE file_id = ?;"); - stmt2.bind_int(1, file_id); - if (sqlite3_step(stmt2) == SQLITE_ROW) { - return static_cast(sqlite3_column_int64(stmt2, 0)); - } - return 0; + std::string value; + auto status = db_->get(metadata_key(file_id), &value, "metadata"); + if (status.IsNotFound()) { + return 0; + } + if (!status.ok()) { + throw_db_error("Failed to read metadata", status); + } + return decode_metadata_record(value)[2]; +} + +void IndexDatabase::delete_file_data(int file_id) { + auto delete_default_key = [&](std::string_view key) { + auto del_status = + txn_batch_ ? db_->del(*txn_batch_, "default", key) : db_->del(key); + if (!del_status.ok() && !del_status.IsNotFound()) { + throw_db_error("Failed to delete file registry entry", del_status); + } + }; + + const auto logical_name_key = file_reverse_key(file_id); + std::string logical_name; + auto status = db_->get(logical_name_key, &logical_name); + if (status.ok()) { + delete_default_key(file_lookup_key(logical_name)); + delete_default_key(logical_name_key); + } else if (!status.IsNotFound()) { + throw_db_error("Failed to read reverse file registry", status); + } + + auto delete_prefix = [&](std::string_view cf, std::string_view prefix) { + std::vector keys; + scan_prefix(*db_, cf, prefix, [&](::rocksdb::Iterator& it) { + keys.push_back(iterator_key(it)); + }); + for (const auto& key : keys) { + auto del_status = + txn_batch_ ? 
db_->del(*txn_batch_, cf, key) : db_->del(key, cf); + if (!del_status.ok() && !del_status.IsNotFound()) { + throw_db_error("Failed to delete file-scoped RocksDB data", + del_status); + } + } + }; + + delete_prefix("checkpoints", prefix_for_file(file_id)); + delete_prefix("metadata", prefix_for_file(file_id)); + delete_prefix("archives", prefix_for_file(file_id)); + delete_prefix("tar_files", prefix_for_file(file_id)); + delete_prefix("chunk_bloom", prefix_for_file(file_id)); + delete_prefix("file_bloom", prefix_for_file(file_id)); + delete_prefix("chunk_stats", prefix_for_file(file_id)); + delete_prefix("chunk_dim_stats", prefix_for_file(file_id)); + delete_prefix("dimensions", std::string("d|") + prefix_for_file(file_id)); + delete_prefix("manifest", std::string("E|") + prefix_for_file(file_id)); + delete_prefix("manifest", std::string("M|") + prefix_for_file(file_id)); + delete_hash_resolutions(file_id); } } // namespace dftracer::utils::utilities::indexer diff --git a/src/dftracer/utils/utilities/indexer/internal/checkpoint_size.h b/src/dftracer/utils/utilities/indexer/internal/checkpoint_size.h index de7991e7..81643312 100644 --- a/src/dftracer/utils/utilities/indexer/internal/checkpoint_size.h +++ b/src/dftracer/utils/utilities/indexer/internal/checkpoint_size.h @@ -12,7 +12,7 @@ namespace dftracer::utils::utilities::indexer::internal { std::size_t determine_checkpoint_size( std::size_t user_checkpoint_size, const std::string& path, // Tunables: - std::size_t max_chk = (512u << 20), std::size_t max_parts = 100000000, + std::size_t max_parts = 100000000, std::size_t max_chk = (512u << 20), // default: std::size_t window = constants::indexer::ZLIB_WINDOW_SIZE); diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.cpp b/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.cpp index 7de88c5f..68f102b5 100644 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.cpp +++ 
b/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.cpp @@ -3,43 +3,58 @@ #include #include #include -#include -#include +#include +#include #include #include #include #include #include #include -#include #include +#include #include #include #include +#include #include namespace dftracer::utils::utilities::indexer::internal::gzip { -using dftracer::utils::sqlite::SqliteStmt; - -// Import the SQL_SCHEMA from constants -extern const char *const &SQL_SCHEMA; +using dftracer::utils::utilities::indexer::IndexDatabase; +namespace rocks = dftracer::utils::rocksdb; + +namespace { + +void finalize_checkpoints(std::vector& checkpoints, + std::uint64_t total_uc_size, + std::uint64_t total_lines, + std::uint64_t tail_line_count) { + for (std::size_t i = 0; i < checkpoints.size(); ++i) { + auto& checkpoint = checkpoints[i]; + const std::uint64_t next_uc_offset = (i + 1 < checkpoints.size()) + ? checkpoints[i + 1].uc_offset + : total_uc_size; + const std::uint64_t next_c_offset = (i + 1 < checkpoints.size()) + ? 
checkpoints[i + 1].c_offset + : checkpoint.c_offset; + checkpoint.uc_size = next_uc_offset - checkpoint.uc_offset; + checkpoint.c_size = next_c_offset - checkpoint.c_offset; + } -static void init_schema(const SqliteDatabase &db) { - DFTRACER_UTILS_LOG_DEBUG("%s", "Initializing GZIP indexer schema"); - int rc = sqlite3_exec(db.get(), SQL_SCHEMA, NULL, NULL, NULL); - if (rc != SQLITE_OK) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to initialize schema: " + - std::string(sqlite3_errmsg(db.get()))); + if (tail_line_count > 0 && total_lines > 0 && !checkpoints.empty()) { + auto& last = checkpoints.back(); + last.last_line_num = total_lines; + last.num_lines += tail_line_count; } } static dftracer::utils::coro::CoroTask process_chunks( - int fd, const SqliteDatabase &db, int file_id, std::uint64_t ckpt_size, - std::uint64_t &total_lines, std::uint64_t &total_uc_size, - std::uint64_t &tail_line_count, const Indexer::VisitorList &visitors) { + int fd, std::uint64_t ckpt_size, std::uint64_t& total_lines, + std::uint64_t& total_uc_size, std::uint64_t& tail_line_count, + std::vector& checkpoints, + const Indexer::VisitorList& visitors) { GzipInflater inflater; off_t offset = 0; if (!(co_await inflater.initialize(fd))) { @@ -50,9 +65,8 @@ static dftracer::utils::coro::CoroTask process_chunks( std::uint64_t current_uc_offset = 0; std::uint64_t next_ckpt_offset = ckpt_size; std::uint64_t line_count_in_chunk = 0; - std::uint64_t first_line_in_chunk = total_lines + 1; // 1-based + std::uint64_t first_line_in_chunk = total_lines + 1; - // Partial-line accumulator for visitor dispatch. 
std::string line_buf; const bool has_visitors = !visitors.empty(); @@ -60,13 +74,13 @@ static dftracer::utils::coro::CoroTask process_chunks( GzipInflaterResult result; if (!(co_await inflater.read(fd, offset, result))) { if (result.bytes_read == 0) { - break; // EOF + break; } - co_return false; // Error + co_return false; } if (result.bytes_read == 0) { - break; // EOF + break; } current_uc_offset += result.bytes_read; @@ -74,65 +88,59 @@ static dftracer::utils::coro::CoroTask process_chunks( line_count_in_chunk += result.lines_found; if (has_visitors) { - const auto *data = inflater.out_buffer; + const auto* data = inflater.out_buffer; const std::size_t n = result.bytes_read; std::size_t seg_start = 0; for (std::size_t i = 0; i < n; ++i) { if (data[i] == '\n') { line_buf.append( - reinterpret_cast(data + seg_start), + reinterpret_cast(data + seg_start), i - seg_start); std::string_view line_sv(line_buf); - for (auto &v : visitors) { - v.get().on_line(line_sv, checkpoint_idx); + for (auto& visitor : visitors) { + visitor.get().on_line(line_sv, checkpoint_idx); } line_buf.clear(); seg_start = i + 1; } } - // Accumulate any trailing bytes that don't end with '\n'. if (seg_start < n) { - line_buf.append( - reinterpret_cast(data + seg_start), - n - seg_start); + line_buf.append(reinterpret_cast(data + seg_start), + n - seg_start); } } - // Create checkpoint when we cross a boundary and are at a deflate - // block boundary (read() now stops at block boundaries via Z_BLOCK). 
if (current_uc_offset >= next_ckpt_offset && result.at_block_boundary) { - std::size_t chunk_start_uc = current_uc_offset; - std::size_t chunk_start_c = inflater.get_total_input_consumed(); + const std::size_t chunk_start_uc = current_uc_offset; + const std::size_t chunk_start_c = + inflater.get_total_input_consumed(); GzipCheckpointer checkpointer(inflater, chunk_start_uc); if (checkpointer.create(chunk_start_c)) { std::vector compressed_dict; if (checkpointer.compress(compressed_dict)) { - InsertCheckpointData checkpoint_data = { - checkpoint_idx++, - chunk_start_uc, - 0, // uc_size - will be updated later - 0, // c_size - will be updated later - chunk_start_c, - checkpointer.bits, - compressed_dict.data(), - compressed_dict.size(), - line_count_in_chunk, - first_line_in_chunk, - total_lines}; // 1-based: last line = total_lines - co_await dftracer::utils::sqlite::run([&] { - insert_checkpoint_record(db, file_id, checkpoint_data); - }); + IndexerCheckpoint checkpoint{ + .checkpoint_idx = checkpoint_idx++, + .uc_offset = chunk_start_uc, + .uc_size = 0, + .c_offset = chunk_start_c, + .c_size = 0, + .bits = checkpointer.bits, + .dict_compressed = std::move(compressed_dict), + .num_lines = line_count_in_chunk, + .first_line_num = first_line_in_chunk, + .last_line_num = total_lines, + }; + checkpoints.push_back(std::move(checkpoint)); if (has_visitors) { - for (auto &v : visitors) { - v.get().on_checkpoint(checkpoint_idx - 1); + for (auto& visitor : visitors) { + visitor.get().on_checkpoint(checkpoint_idx - 1); } } - // Reset chunk counters for next chunk line_count_in_chunk = 0; - first_line_in_chunk = total_lines + 1; // 1-based + first_line_in_chunk = total_lines + 1; next_ckpt_offset = current_uc_offset + ckpt_size; } } @@ -144,103 +152,74 @@ static dftracer::utils::coro::CoroTask process_chunks( co_return true; } -// After all checkpoints are inserted, compute uc_size / c_size for each -// and extend the last checkpoint's line range to cover the tail data. 
-static void finalize_checkpoints(const SqliteDatabase &db, int file_id, - std::uint64_t total_uc_size, - std::uint64_t total_lines, - std::uint64_t tail_line_count) { - // 1. Set uc_size = distance to next checkpoint (or total_uc_size for last). - { - SqliteStmt stmt( - db, - "UPDATE checkpoints SET " - "uc_size = COALESCE(" - " (SELECT c2.uc_offset FROM checkpoints c2 " - " WHERE c2.file_id = checkpoints.file_id " - " AND c2.checkpoint_idx = checkpoints.checkpoint_idx + 1), ?" - ") - uc_offset, " - "c_size = COALESCE(" - " (SELECT c2.c_offset FROM checkpoints c2 " - " WHERE c2.file_id = checkpoints.file_id " - " AND c2.checkpoint_idx = checkpoints.checkpoint_idx + 1), " - " c_offset" - ") - c_offset " - "WHERE file_id = ?"); - stmt.bind_int64(1, static_cast(total_uc_size)); - stmt.bind_int(2, file_id); - sqlite3_step(stmt.get()); - } - - // 2. Extend the last checkpoint's line range to cover the tail data - // (lines after the last block boundary that didn't trigger a new - // checkpoint). - if (tail_line_count > 0 && total_lines > 0) { - SqliteStmt stmt( - db, - "UPDATE checkpoints SET last_line_num = ?, " - "num_lines = num_lines + ? " - "WHERE file_id = ? 
AND checkpoint_idx = " - "(SELECT MAX(checkpoint_idx) FROM checkpoints WHERE file_id = ?)"); - stmt.bind_int64(1, static_cast(total_lines)); - stmt.bind_int64(2, static_cast(tail_line_count)); - stmt.bind_int(3, file_id); - stmt.bind_int(4, file_id); - sqlite3_step(stmt.get()); - } -} - static dftracer::utils::coro::CoroTask build_index( - const SqliteDatabase &db, int file_id, const std::string &gz_path, - std::uint64_t ckpt_size, const Indexer::VisitorList &visitors) { + IndexDatabase& db, int file_id, const std::string& gz_path, + std::uint64_t ckpt_size, const Indexer::VisitorList& visitors) { int fd = ::open(gz_path.c_str(), O_RDONLY); if (fd < 0) { co_return false; } if (!visitors.empty()) { - std::uint64_t compressed_bytes = file_size_bytes(gz_path); - std::size_t estimated = static_cast( + const std::uint64_t compressed_bytes = file_size_bytes(gz_path); + const std::size_t estimated = static_cast( compressed_bytes / (ckpt_size > 0 ? ckpt_size : 1)); - for (auto &v : visitors) { - v.get().begin(estimated); + for (auto& visitor : visitors) { + visitor.get().begin(estimated); } } std::uint64_t total_lines = 0; std::uint64_t total_uc_size = 0; std::uint64_t tail_line_count = 0; + std::vector checkpoints; - bool success = - co_await process_chunks(fd, db, file_id, ckpt_size, total_lines, - total_uc_size, tail_line_count, visitors); + const bool success = + co_await process_chunks(fd, ckpt_size, total_lines, total_uc_size, + tail_line_count, checkpoints, visitors); ::close(fd); - if (success) { - co_await dftracer::utils::sqlite::run([&] { - finalize_checkpoints(db, file_id, total_uc_size, total_lines, - tail_line_count); - insert_file_metadata_record(db, file_id, ckpt_size, total_lines, - total_uc_size); - }); + if (!success) { + co_return false; } - co_return success; + finalize_checkpoints(checkpoints, total_uc_size, total_lines, + tail_line_count); + + auto* db_ptr = &db; + auto* checkpoints_ptr = &checkpoints; + co_await rocks::run([db_ptr, file_id, 
ckpt_size, total_lines, total_uc_size, + checkpoints_ptr] { + internal::TransactionScope txn(*db_ptr); + for (const auto& checkpoint : *checkpoints_ptr) { + db_ptr->insert_checkpoint(file_id, checkpoint); + } + db_ptr->insert_file_metadata(file_id, ckpt_size, total_lines, + total_uc_size); + txn.commit(); + }); + + co_return true; } -GzipIndexer::GzipIndexer(const std::string &gz_path_, - const std::string &idx_path_, std::uint64_t ckpt_size_, +} // namespace + +GzipIndexer::GzipIndexer(const std::string& gz_path_, + const std::string& idx_path_, std::uint64_t ckpt_size_, bool force_rebuild_) : gz_path(gz_path_), gz_path_logical_path(get_logical_path(gz_path_)), - idx_path(idx_path_), + index_path(normalize_index_root(idx_path_)), ckpt_size(ckpt_size_), force_rebuild(force_rebuild_), cached_is_valid(false), cached_file_id(-1), cached_max_bytes(0), + cached_max_bytes_ready(false), cached_num_lines(0), - cached_checkpoint_size(0) { + cached_num_lines_ready(false), + cached_checkpoint_size(0), + cached_checkpoint_size_ready(false) { if (gz_path.empty()) { throw IndexerError(IndexerError::Type::INVALID_ARGUMENT, "gz_path must not be empty"); @@ -264,78 +243,65 @@ GzipIndexer::~GzipIndexer() { close(); } -GzipIndexer::GzipIndexer(GzipIndexer &&other) noexcept +GzipIndexer::GzipIndexer(GzipIndexer&& other) noexcept : gz_path(std::move(other.gz_path)), gz_path_logical_path(std::move(other.gz_path_logical_path)), - idx_path(std::move(other.idx_path)), + index_path(std::move(other.index_path)), ckpt_size(other.ckpt_size), force_rebuild(other.force_rebuild), - db(std::move(other.db)), visitors_(std::move(other.visitors_)), cached_is_valid(other.cached_is_valid.load()), cached_file_id(other.cached_file_id.load()), cached_max_bytes(other.cached_max_bytes.load()), + cached_max_bytes_ready(other.cached_max_bytes_ready.load()), cached_num_lines(other.cached_num_lines.load()), + cached_num_lines_ready(other.cached_num_lines_ready.load()), 
cached_checkpoint_size(other.cached_checkpoint_size.load()), + cached_checkpoint_size_ready(other.cached_checkpoint_size_ready.load()), cached_checkpoints(std::move(other.cached_checkpoints)) {} -GzipIndexer &GzipIndexer::operator=(GzipIndexer &&other) noexcept { +GzipIndexer& GzipIndexer::operator=(GzipIndexer&& other) noexcept { if (this != &other) { gz_path = std::move(other.gz_path); gz_path_logical_path = std::move(other.gz_path_logical_path); - idx_path = std::move(other.idx_path); + index_path = std::move(other.index_path); ckpt_size = other.ckpt_size; force_rebuild = other.force_rebuild; - db = std::move(other.db); visitors_ = std::move(other.visitors_); cached_is_valid.store(other.cached_is_valid.load()); cached_file_id.store(other.cached_file_id.load()); cached_max_bytes.store(other.cached_max_bytes.load()); + cached_max_bytes_ready.store(other.cached_max_bytes_ready.load()); cached_num_lines.store(other.cached_num_lines.load()); + cached_num_lines_ready.store(other.cached_num_lines_ready.load()); cached_checkpoint_size.store(other.cached_checkpoint_size.load()); + cached_checkpoint_size_ready.store( + other.cached_checkpoint_size_ready.load()); std::lock_guard lock(cached_checkpoints_mutex); cached_checkpoints = std::move(other.cached_checkpoints); } return *this; } -void GzipIndexer::open() { - if (!db.open(idx_path)) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to open database at " + idx_path); - } -} +void GzipIndexer::open() {} -void GzipIndexer::close() { - DFTRACER_UTILS_LOG_DEBUG("Closing GZIP indexer database for %s", - gz_path.c_str()); - db.close(); -} +void GzipIndexer::close() {} dftracer::utils::coro::CoroTask GzipIndexer::build_async() const { if (!force_rebuild && !need_rebuild()) { co_return; } - co_await dftracer::utils::sqlite::run([&] { - init_schema(db); - - int fid = find_file_id(gz_path_logical_path); - if (fid != -1) { - delete_file_record(db, fid); - } - }); - - std::time_t mtime = 
get_file_modification_time(gz_path); - auto hash = calculate_file_hash(gz_path); - std::uint64_t bytes = file_size_bytes(gz_path); - std::uint64_t final_ckpt_size = + IndexDatabase db(index_path); + const std::time_t mtime = get_file_modification_time(gz_path); + const auto hash = calculate_file_hash(gz_path); + const std::uint64_t bytes = file_size_bytes(gz_path); + const std::uint64_t final_ckpt_size = determine_checkpoint_size(ckpt_size, gz_path); - - int file_id = co_await dftracer::utils::sqlite::run([&] { - int fid; - insert_file_record(db, gz_path_logical_path, bytes, mtime, hash, fid); - return fid; + const std::string logical = gz_path_logical_path; + const auto* logical_ptr = &logical; + const int file_id = co_await rocks::run([db_ptr = &db, logical_ptr, hash] { + return db_ptr->get_or_create_file_info(*logical_ptr, hash); }); if (!(co_await build_index(db, file_id, gz_path, final_ckpt_size, @@ -344,100 +310,157 @@ dftracer::utils::coro::CoroTask GzipIndexer::build_async() const { "Failed to build index for " + gz_path); } + (void)mtime; + (void)bytes; + struct CacheSnapshot { + std::uint64_t num_lines = 0; + std::uint64_t max_bytes = 0; + std::vector checkpoints; + }; + auto snapshot = co_await rocks::run([db_ptr = &db, file_id] { + CacheSnapshot cache; + cache.num_lines = db_ptr->get_num_lines(file_id); + cache.max_bytes = db_ptr->get_max_bytes(file_id); + cache.checkpoints = db_ptr->query_checkpoints(file_id); + return cache; + }); + cached_is_valid = true; cached_file_id = file_id; + cached_checkpoint_size = final_ckpt_size; + cached_checkpoint_size_ready = true; + cached_num_lines = snapshot.num_lines; + cached_num_lines_ready = true; + cached_max_bytes = snapshot.max_bytes; + cached_max_bytes_ready = true; + std::lock_guard lock(cached_checkpoints_mutex); + cached_checkpoints = std::move(snapshot.checkpoints); co_return; } bool GzipIndexer::is_valid() const { return cached_is_valid; } -bool GzipIndexer::exists() const { return fs::exists(idx_path); 
} +bool GzipIndexer::exists() const { + return fs::exists(index_path) && fs::is_directory(index_path); +} bool GzipIndexer::need_rebuild() const { - if (is_valid()) return false; - if (!exists()) return true; - - // Only query schema if database exists - matches original behavior - if (!query_schema_validity(db)) return true; - - std::uint64_t stored_hash; - std::time_t stored_mtime; - if (!query_stored_file_info(db, gz_path_logical_path, stored_hash, - stored_mtime)) { + if (is_valid()) { + return false; + } + if (!exists()) { return true; } - // Fast path: if mtime matches, the file hasn't changed. - std::time_t current_mtime = get_file_modification_time(gz_path); - if (stored_mtime == current_mtime) return false; + try { + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + const auto stored_hash = db.get_file_hash(gz_path_logical_path); + const int file_id = db.get_file_info_id(gz_path_logical_path); + if (!stored_hash || file_id < 0) { + return true; + } - // mtime differs, verify with sampled fingerprint (size + head/tail) - // to handle edge cases like file replacement within the same second. - std::uint64_t current_hash = calculate_file_hash(gz_path); - return stored_hash != current_hash; + const auto current_hash = calculate_file_hash(gz_path); + const auto current_ckpt_size = db.get_checkpoint_size(file_id); + return current_hash != *stored_hash || current_ckpt_size == 0; + } catch (...) 
{ + return true; + } } -const std::string &GzipIndexer::get_idx_path() const { return idx_path; } +const std::string& GzipIndexer::get_index_path() const { return index_path; } -const std::string &GzipIndexer::get_archive_path() const { return gz_path; } +const std::string& GzipIndexer::get_archive_path() const { return gz_path; } -const std::string &GzipIndexer::get_gz_path() const { return gz_path; } +const std::string& GzipIndexer::get_gz_path() const { return gz_path; } std::uint64_t GzipIndexer::get_max_bytes() const { - auto val = cached_max_bytes.load(std::memory_order_relaxed); - if (val == 0) { - val = query_max_bytes(db, gz_path_logical_path); - cached_max_bytes.store(val, std::memory_order_relaxed); + if (!cached_max_bytes_ready.load(std::memory_order_acquire)) { + const int file_id = get_file_id(); + if (file_id != -1) { + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + auto val = db.get_max_bytes(file_id); + cached_max_bytes.store(val, std::memory_order_relaxed); + cached_max_bytes_ready.store(true, std::memory_order_release); + } } - return val; + return cached_max_bytes.load(std::memory_order_relaxed); } std::uint64_t GzipIndexer::get_checkpoint_size() const { - auto val = cached_checkpoint_size.load(std::memory_order_relaxed); - if (val == 0) { - int file_id = get_file_id(); + if (!cached_checkpoint_size_ready.load(std::memory_order_acquire)) { + const int file_id = get_file_id(); if (file_id != -1) { - val = query_checkpoint_size(db, file_id); + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + auto val = db.get_checkpoint_size(file_id); cached_checkpoint_size.store(val, std::memory_order_relaxed); + cached_checkpoint_size_ready.store(true, std::memory_order_release); } } - return val; + return cached_checkpoint_size.load(std::memory_order_relaxed); } std::uint64_t GzipIndexer::get_num_lines() const { - auto val = 
cached_num_lines.load(std::memory_order_relaxed); - if (val == 0) { - val = query_num_lines(db, gz_path_logical_path); - cached_num_lines.store(val, std::memory_order_relaxed); + if (!cached_num_lines_ready.load(std::memory_order_acquire)) { + const int file_id = get_file_id(); + if (file_id != -1) { + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + auto val = db.get_num_lines(file_id); + cached_num_lines.store(val, std::memory_order_relaxed); + cached_num_lines_ready.store(true, std::memory_order_release); + } } - return val; + return cached_num_lines.load(std::memory_order_relaxed); } int GzipIndexer::get_file_id() const { auto val = cached_file_id.load(std::memory_order_relaxed); if (val == -1) { - val = query_file_id(db, gz_path_logical_path); + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + val = db.get_file_info_id(gz_path_logical_path); cached_file_id.store(val, std::memory_order_relaxed); } return val; } -int GzipIndexer::find_file_id(const std::string &path) const { - return query_file_id(db, get_logical_path(path)); +int GzipIndexer::find_file_id(const std::string& path) const { + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + return db.get_file_info_id(get_logical_path(path)); } bool GzipIndexer::find_checkpoint(std::size_t target_offset, - IndexerCheckpoint &checkpoint) const { - int file_id = get_file_id(); - if (file_id == -1) return false; - return query_checkpoint(db, target_offset, file_id, checkpoint); + IndexerCheckpoint& checkpoint) const { + const int file_id = get_file_id(); + if (file_id == -1) { + return false; + } + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + return db.find_checkpoint(file_id, target_offset, checkpoint); } std::vector GzipIndexer::get_checkpoints() const { + std::lock_guard lock(cached_checkpoints_mutex); if 
(cached_checkpoints.empty()) { - int file_id = get_file_id(); + const int file_id = get_file_id(); if (file_id != -1) { - cached_checkpoints = query_checkpoints(db, file_id); + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + cached_checkpoints = db.query_checkpoints(file_id); } } return cached_checkpoints; @@ -445,9 +468,14 @@ std::vector GzipIndexer::get_checkpoints() const { std::vector GzipIndexer::get_checkpoints_for_line_range( std::uint64_t start_line, std::uint64_t end_line) const { - int file_id = get_file_id(); - if (file_id == -1) return {}; - return query_checkpoints_for_line_range(db, file_id, start_line, end_line); + const int file_id = get_file_id(); + if (file_id == -1) { + return {}; + } + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + return db.query_checkpoints_for_line_range(file_id, start_line, end_line); } } // namespace dftracer::utils::utilities::indexer::internal::gzip diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h b/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h index 1988b552..94c0f04b 100644 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h +++ b/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include @@ -18,14 +18,12 @@ namespace dftracer::utils::utilities::indexer::internal::gzip { -using dftracer::utils::sqlite::SqliteDatabase; - class GzipIndexer : public Indexer { public: static constexpr std::uint64_t DEFAULT_CHECKPOINT_SIZE = constants::indexer::DEFAULT_CHECKPOINT_SIZE; - GzipIndexer(const std::string &gz_path, const std::string &idx_path, + GzipIndexer(const std::string &gz_path, const std::string &index_path, std::uint64_t checkpoint_size = DEFAULT_CHECKPOINT_SIZE, bool force = false); ~GzipIndexer(); @@ -43,7 +41,7 @@ class GzipIndexer : public Indexer { } // 
Metadata - BaseIndexer interface implementation - const std::string &get_idx_path() const override; + const std::string &get_index_path() const override; const std::string &get_archive_path() const override; const std::string &get_gz_path() const; std::uint64_t get_checkpoint_size() const override; @@ -69,18 +67,20 @@ class GzipIndexer : public Indexer { private: std::string gz_path; std::string gz_path_logical_path; - std::string idx_path; + std::string index_path; std::uint64_t ckpt_size; bool force_rebuild; - SqliteDatabase db; VisitorList visitors_; // Cached values (atomic for thread-safe lazy initialization) mutable std::atomic cached_is_valid{false}; mutable std::atomic cached_file_id{-1}; mutable std::atomic cached_max_bytes{0}; + mutable std::atomic cached_max_bytes_ready{false}; mutable std::atomic cached_num_lines{0}; + mutable std::atomic cached_num_lines_ready{false}; mutable std::atomic cached_checkpoint_size{0}; + mutable std::atomic cached_checkpoint_size_ready{false}; mutable std::vector cached_checkpoints; mutable std::mutex cached_checkpoints_mutex; diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/delete_file_record.cpp b/src/dftracer/utils/utilities/indexer/internal/gzip/queries/delete_file_record.cpp deleted file mode 100644 index 984ae506..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/delete_file_record.cpp +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::gzip { - -bool delete_file_record(const SqliteDatabase &db, int file_id) { - const char *cleanup_queries[] = { - "DELETE FROM checkpoints WHERE file_id = ?;", - "DELETE FROM metadata WHERE file_id = ?;"}; - - for (const char *query : cleanup_queries) { - try { - SqliteStmt stmt(db, query); - stmt.bind_int(1, file_id); - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - DFTRACER_UTILS_LOG_ERROR( - "Failed to execute cleanup statement '%s' for file_id %d: " 
- "%d - " - "%s", - query, file_id, result, sqlite3_errmsg(db.get())); - return false; - } - } catch (const IndexerError &e) { - DFTRACER_UTILS_LOG_ERROR( - "Failed to prepare cleanup statement '%s': %s", query, - e.what()); - return false; - } - } - DFTRACER_UTILS_LOG_DEBUG( - "Successfully cleaned up existing data for file_id %d", file_id); - return true; -} - -} // namespace dftracer::utils::utilities::indexer::internal::gzip diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/insert_checkpoint_record.cpp b/src/dftracer/utils/utilities/indexer/internal/gzip/queries/insert_checkpoint_record.cpp deleted file mode 100644 index dfbc6d02..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/insert_checkpoint_record.cpp +++ /dev/null @@ -1,36 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::gzip { - -void insert_checkpoint_record(const SqliteDatabase &db, int file_id, - const InsertCheckpointData &data) { - SqliteStmt stmt( - db, - "INSERT INTO checkpoints(file_id, checkpoint_idx, uc_offset, " - "uc_size, c_offset, c_size, bits, dict_compressed, num_lines, " - "first_line_num, last_line_num) " - "VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"); - - stmt.bind_int(1, file_id); - stmt.bind_int64(2, static_cast(data.idx)); - stmt.bind_int64(3, static_cast(data.uc_offset)); - stmt.bind_int64(4, static_cast(data.uc_size)); - stmt.bind_int64(5, static_cast(data.c_offset)); - stmt.bind_int64(6, static_cast(data.c_size)); - stmt.bind_int(7, data.bits); - stmt.bind_blob(8, data.compressed_dict, - static_cast(data.compressed_dict_size)); - stmt.bind_int64(9, static_cast(data.num_lines)); - stmt.bind_int64(10, static_cast(data.first_line_num)); - stmt.bind_int64(11, static_cast(data.last_line_num)); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert checkpoint: " + - std::string(sqlite3_errmsg(db.get()))); - } -} 
- -} // namespace dftracer::utils::utilities::indexer::internal::gzip diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/insert_file_metadata_record.cpp b/src/dftracer/utils/utilities/indexer/internal/gzip/queries/insert_file_metadata_record.cpp deleted file mode 100644 index 7b212d58..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/insert_file_metadata_record.cpp +++ /dev/null @@ -1,33 +0,0 @@ -#include -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::gzip { - -void insert_file_metadata_record(const SqliteDatabase &db, int file_id, - std::size_t ckpt_size, - std::uint64_t total_lines, - std::uint64_t total_uc_size) { - SqliteStmt stmt(db, - "INSERT INTO metadata(file_id, checkpoint_size, " - "total_lines, total_uc_size) VALUES(?, ?, ?, ?);"); - - stmt.bind_int(1, file_id); - stmt.bind_int64(2, static_cast(ckpt_size)); - stmt.bind_int64(3, static_cast(total_lines)); - stmt.bind_int64(4, static_cast(total_uc_size)); - - int result = sqlite3_step(stmt); - if (result != SQLITE_DONE) { - throw IndexerError( - IndexerError::Type::DATABASE_ERROR, - "Insert failed: " + std::string(sqlite3_errmsg(db.get()))); - } - DFTRACER_UTILS_LOG_DEBUG( - "Successfully inserted metadata for file_id %d: " - "checkpoint_size=%zu, " - "total_lines=%llu, total_uc_size=%llu", - file_id, ckpt_size, total_lines, total_uc_size); -} - -} // namespace dftracer::utils::utilities::indexer::internal::gzip diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/insert_file_record.cpp b/src/dftracer/utils/utilities/indexer/internal/gzip/queries/insert_file_record.cpp deleted file mode 100644 index e9890b0a..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/insert_file_record.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::gzip { - -void insert_file_record(const SqliteDatabase &db, - const std::string 
&gz_path_logical_path, - std::size_t bytes, std::time_t file_mtime, - std::uint64_t file_hash, int &db_file_id) { - SqliteStmt stmt(db, - "INSERT INTO files(logical_name, byte_size, " - "mtime_unix, hash) " - "VALUES(?, ?, ?, ?) " - "ON CONFLICT(logical_name) DO UPDATE SET " - "byte_size=excluded.byte_size, " - "mtime_unix=excluded.mtime_unix, " - "hash=excluded.hash " - "RETURNING id;"); - - stmt.bind_text(1, gz_path_logical_path); - stmt.bind_int64(2, static_cast(bytes)); - stmt.bind_int64(3, static_cast(file_mtime)); - stmt.bind_int64(4, static_cast(file_hash)); - - int rc = sqlite3_step(stmt); - if (rc != SQLITE_ROW && rc != SQLITE_DONE) { - throw IndexerError( - IndexerError::Type::DATABASE_ERROR, - "Insert failed: " + std::string(sqlite3_errmsg(db.get()))); - } - - db_file_id = sqlite3_column_int(stmt, 0); -} - -} // namespace dftracer::utils::utilities::indexer::internal::gzip diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/queries.h b/src/dftracer/utils/utilities/indexer/internal/gzip/queries/queries.h deleted file mode 100644 index 04f721eb..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/queries.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_GZIP_QUERIES_H -#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_GZIP_QUERIES_H - -#include -#include -#include -#include - -#include -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::gzip { - -using dftracer::utils::sqlite::SqliteDatabase; -using dftracer::utils::sqlite::SqliteStmt; - -void insert_file_record(const SqliteDatabase &db, - const std::string &gz_path_logical_path, - std::size_t bytes, std::time_t file_mtime, - std::uint64_t file_hash, int &db_file_id); -void insert_file_metadata_record(const SqliteDatabase &db, int file_id, - std::size_t ckpt_size, - std::uint64_t total_lines, - std::uint64_t total_uc_size); -bool query_stored_file_info(const SqliteDatabase &db, - const std::string 
&gz_path, - std::uint64_t &stored_hash, - std::time_t &stored_mtime); - -struct InsertCheckpointData { - std::uint64_t idx; - std::uint64_t uc_offset; - std::uint64_t uc_size; - std::uint64_t c_size; - std::uint64_t c_offset; - int bits; - const void *compressed_dict; - std::size_t compressed_dict_size; - std::uint64_t num_lines; - std::uint64_t first_line_num; - std::uint64_t last_line_num; -}; -void insert_checkpoint_record(const SqliteDatabase &db, int file_id, - const InsertCheckpointData &data); - -bool query_schema_validity(const SqliteDatabase &db); -bool delete_file_record(const SqliteDatabase &db, int file_id); -std::uint64_t query_max_bytes(const SqliteDatabase &db, - const std::string &gz_path_logical_path); -std::uint64_t query_num_lines(const SqliteDatabase &db, - const std::string &gz_path_logical_path); -int query_file_id(const SqliteDatabase &db, - const std::string &gz_path_logical_path); -bool query_checkpoint(const SqliteDatabase &db, std::size_t target_offset, - int file_id, IndexerCheckpoint &checkpoint); -std::vector query_checkpoints(const SqliteDatabase &db, - int file_id); -std::vector query_checkpoints_for_line_range( - const SqliteDatabase &db, int file_id, std::uint64_t start_line, - std::uint64_t end_line); -std::uint64_t query_checkpoint_size(const SqliteDatabase &db, int file_id); - -} // namespace dftracer::utils::utilities::indexer::internal::gzip - -#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_GZIP_QUERIES_H diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_checkpoint.cpp b/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_checkpoint.cpp deleted file mode 100644 index 8c5678f3..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_checkpoint.cpp +++ /dev/null @@ -1,76 +0,0 @@ -#include -#include -#include - -#include - -namespace dftracer::utils::utilities::indexer::internal::gzip { - -bool query_checkpoint(const SqliteDatabase& db, std::size_t 
target_offset, - int file_id, IndexerCheckpoint& checkpoint) { - DFTRACER_UTILS_LOG_DEBUG( - "query_checkpoint called: target_offset=%zu, file_id=%d", target_offset, - file_id); - - // For target offset 0, always decompress from beginning of file (no - // checkpoint) - if (target_offset == 0) { - DFTRACER_UTILS_LOG_DEBUG( - "%s", "query_checkpoint: target_offset is 0, returning false"); - return false; - } - - if (file_id == -1) { - DFTRACER_UTILS_LOG_DEBUG( - "%s", "query_checkpoint: file_id is -1, returning false"); - return false; - } - - SqliteStmt stmt( - db, - "SELECT checkpoint_idx, uc_offset, uc_size, c_offset, c_size, bits, " - "dict_compressed, num_lines " - "FROM checkpoints WHERE file_id = ? AND uc_offset <= ? " - "ORDER BY uc_offset DESC LIMIT 1"); - bool found = false; - - stmt.bind_int(1, file_id); - stmt.bind_int64(2, static_cast(target_offset)); - - if (sqlite3_step(stmt) == SQLITE_ROW) { - checkpoint.checkpoint_idx = - static_cast(sqlite3_column_int64(stmt, 0)); - checkpoint.uc_offset = - static_cast(sqlite3_column_int64(stmt, 1)); - checkpoint.uc_size = - static_cast(sqlite3_column_int64(stmt, 2)); - checkpoint.c_offset = - static_cast(sqlite3_column_int64(stmt, 3)); - checkpoint.c_size = - static_cast(sqlite3_column_int64(stmt, 4)); - checkpoint.bits = sqlite3_column_int(stmt, 5); - std::size_t dict_size = - static_cast(sqlite3_column_bytes(stmt, 6)); - checkpoint.dict_compressed.resize(dict_size); - std::memcpy(checkpoint.dict_compressed.data(), - sqlite3_column_blob(stmt, 6), dict_size); - checkpoint.num_lines = - static_cast(sqlite3_column_int64(stmt, 7)); - found = true; - - DFTRACER_UTILS_LOG_DEBUG( - "query_checkpoint: found checkpoint idx=%llu, uc_offset=%llu, " - "c_offset=%llu, bits=%d", - checkpoint.checkpoint_idx, checkpoint.uc_offset, - checkpoint.c_offset, checkpoint.bits); - } else { - DFTRACER_UTILS_LOG_DEBUG( - "query_checkpoint: no checkpoint found for target_offset=%zu, " - "file_id=%d", - target_offset, file_id); - } - - 
return found; -} - -} // namespace dftracer::utils::utilities::indexer::internal::gzip diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_checkpoint_size.cpp b/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_checkpoint_size.cpp deleted file mode 100644 index 4a205c61..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_checkpoint_size.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::gzip { - -std::uint64_t query_checkpoint_size(const SqliteDatabase &db, int file_id) { - SqliteStmt stmt(db, - "SELECT checkpoint_size FROM metadata WHERE file_id = ?"); - stmt.bind_int(1, file_id); - std::uint64_t ckpt_size = 0; - - if (sqlite3_step(stmt) == SQLITE_ROW) { - ckpt_size = static_cast(sqlite3_column_int64(stmt, 0)); - } - - return ckpt_size; -} - -} // namespace dftracer::utils::utilities::indexer::internal::gzip diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_checkpoints.cpp b/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_checkpoints.cpp deleted file mode 100644 index 1e25a74b..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_checkpoints.cpp +++ /dev/null @@ -1,105 +0,0 @@ -#include -#include - -#include - -namespace dftracer::utils::utilities::indexer::internal::gzip { - -std::vector query_checkpoints(const SqliteDatabase &db, - int file_id) { - std::vector checkpoints; - - SqliteStmt stmt( - db, - "SELECT checkpoint_idx, uc_offset, uc_size, c_offset, c_size, bits, " - "dict_compressed, num_lines, first_line_num, last_line_num " - "FROM checkpoints WHERE file_id = ? 
ORDER BY uc_offset"); - - stmt.bind_int(1, file_id); - - while (sqlite3_step(stmt) == SQLITE_ROW) { - IndexerCheckpoint checkpoint; - checkpoint.checkpoint_idx = - static_cast(sqlite3_column_int64(stmt, 0)); - checkpoint.uc_offset = - static_cast(sqlite3_column_int64(stmt, 1)); - checkpoint.uc_size = - static_cast(sqlite3_column_int64(stmt, 2)); - checkpoint.c_offset = - static_cast(sqlite3_column_int64(stmt, 3)); - checkpoint.c_size = - static_cast(sqlite3_column_int64(stmt, 4)); - checkpoint.bits = sqlite3_column_int(stmt, 5); - - std::size_t dict_size = - static_cast(sqlite3_column_bytes(stmt, 6)); - checkpoint.dict_compressed.resize(dict_size); - std::memcpy(checkpoint.dict_compressed.data(), - sqlite3_column_blob(stmt, 6), dict_size); - - checkpoint.num_lines = - static_cast(sqlite3_column_int64(stmt, 7)); - checkpoint.first_line_num = - static_cast(sqlite3_column_int64(stmt, 8)); - checkpoint.last_line_num = - static_cast(sqlite3_column_int64(stmt, 9)); - - checkpoints.push_back(std::move(checkpoint)); - } - - return checkpoints; -} - -std::vector query_checkpoints_for_line_range( - const SqliteDatabase &db, int file_id, std::uint64_t start_line, - std::uint64_t end_line) { - std::vector checkpoints; - - SqliteStmt stmt( - db, - "SELECT checkpoint_idx, uc_offset, uc_size, c_offset, c_size, bits, " - "dict_compressed, num_lines, first_line_num, last_line_num " - "FROM checkpoints WHERE file_id = ? AND " - "(first_line_num <= ? AND last_line_num >= ?) OR " - "(first_line_num <= ? AND last_line_num >= ?) 
" - "ORDER BY uc_offset"); - - stmt.bind_int(1, file_id); - stmt.bind_int64(2, static_cast(end_line)); - stmt.bind_int64(3, static_cast(start_line)); - stmt.bind_int64(4, static_cast(start_line)); - stmt.bind_int64(5, static_cast(end_line)); - - while (sqlite3_step(stmt) == SQLITE_ROW) { - IndexerCheckpoint checkpoint; - checkpoint.checkpoint_idx = - static_cast(sqlite3_column_int64(stmt, 0)); - checkpoint.uc_offset = - static_cast(sqlite3_column_int64(stmt, 1)); - checkpoint.uc_size = - static_cast(sqlite3_column_int64(stmt, 2)); - checkpoint.c_offset = - static_cast(sqlite3_column_int64(stmt, 3)); - checkpoint.c_size = - static_cast(sqlite3_column_int64(stmt, 4)); - checkpoint.bits = sqlite3_column_int(stmt, 5); - - size_t dict_size = static_cast(sqlite3_column_bytes(stmt, 6)); - checkpoint.dict_compressed.resize(dict_size); - std::memcpy(checkpoint.dict_compressed.data(), - sqlite3_column_blob(stmt, 6), dict_size); - - checkpoint.num_lines = - static_cast(sqlite3_column_int64(stmt, 7)); - checkpoint.first_line_num = - static_cast(sqlite3_column_int64(stmt, 8)); - checkpoint.last_line_num = - static_cast(sqlite3_column_int64(stmt, 9)); - - checkpoints.push_back(std::move(checkpoint)); - } - - return checkpoints; -} - -} // namespace dftracer::utils::utilities::indexer::internal::gzip diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_file_id.cpp b/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_file_id.cpp deleted file mode 100644 index 1f443a9f..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_file_id.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::gzip { - -int query_file_id(const SqliteDatabase &db, - const std::string &gz_path_logical_path) { - SqliteStmt stmt(db, "SELECT id FROM files WHERE logical_name = ? 
LIMIT 1"); - int file_id = -1; - - stmt.bind_text(1, gz_path_logical_path); - if (sqlite3_step(stmt) == SQLITE_ROW) { - file_id = sqlite3_column_int(stmt, 0); - } - - return file_id; -} - -} // namespace dftracer::utils::utilities::indexer::internal::gzip diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_max_bytes.cpp b/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_max_bytes.cpp deleted file mode 100644 index 2cd0f8c1..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_max_bytes.cpp +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::gzip { - -std::uint64_t query_max_bytes(const SqliteDatabase &db, - const std::string &gz_path_logical_path) { - // Primary: metadata table has the authoritative total uncompressed size - SqliteStmt metadata_stmt( - db, - "SELECT total_uc_size FROM metadata WHERE file_id = " - "(SELECT id FROM files WHERE logical_name = ? LIMIT 1)"); - metadata_stmt.bind_text(1, gz_path_logical_path); - if (sqlite3_step(metadata_stmt) == SQLITE_ROW) { - std::uint64_t total = - static_cast(sqlite3_column_int64(metadata_stmt, 0)); - if (total > 0) { - return total; - } - } - - // Fallback: derive from checkpoints if metadata is missing - SqliteStmt stmt( - db, - "SELECT MAX(uc_offset + uc_size) FROM checkpoints WHERE file_id = " - "(SELECT id FROM files WHERE logical_name = ? 
LIMIT 1)"); - std::uint64_t max_bytes = 0; - stmt.bind_text(1, gz_path_logical_path); - if (sqlite3_step(stmt) == SQLITE_ROW) { - max_bytes = static_cast(sqlite3_column_int64(stmt, 0)); - } - - return max_bytes; -} - -} // namespace dftracer::utils::utilities::indexer::internal::gzip diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_num_lines.cpp b/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_num_lines.cpp deleted file mode 100644 index 614949a6..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_num_lines.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::gzip { - -std::uint64_t query_num_lines(const SqliteDatabase &db, - const std::string &gz_path_logical_path) { - SqliteStmt stmt(db, - "SELECT total_lines FROM metadata WHERE file_id = " - "(SELECT id FROM files WHERE logical_name = ? LIMIT 1)"); - std::uint64_t total_lines = 0; - - stmt.bind_text(1, gz_path_logical_path); - if (sqlite3_step(stmt) == SQLITE_ROW) { - total_lines = static_cast(sqlite3_column_int64(stmt, 0)); - } - - return total_lines; -} - -} // namespace dftracer::utils::utilities::indexer::internal::gzip diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_schema_validity.cpp b/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_schema_validity.cpp deleted file mode 100644 index 43b679f3..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_schema_validity.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::gzip { - -bool query_schema_validity(const SqliteDatabase &db) { - SqliteStmt stmt(db, - "SELECT name FROM sqlite_master WHERE type='table' AND " - "name IN ('checkpoints', 'metadata', 'files')"); - int table_count = 0; - - while (sqlite3_step(stmt) == SQLITE_ROW) { - table_count++; - } - - return table_count >= 3; -} - -} // 
namespace dftracer::utils::utilities::indexer::internal::gzip diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_stored_file_info.cpp b/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_stored_file_info.cpp deleted file mode 100644 index f891f9bb..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/gzip/queries/query_stored_file_info.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::gzip { - -bool query_stored_file_info(const SqliteDatabase &db, - const std::string &gz_path, - std::uint64_t &stored_hash, time_t &stored_mtime) { - SqliteStmt stmt(db, - "SELECT hash, mtime_unix FROM files WHERE " - "logical_name = ? LIMIT 1"); - - stmt.bind_text(1, gz_path); - - if (sqlite3_step(stmt) == SQLITE_ROW) { - std::uint64_t hash = - static_cast(sqlite3_column_int64(stmt, 0)); - if (hash == 0) { - return false; - } - stored_hash = hash; - stored_mtime = static_cast(sqlite3_column_int64(stmt, 1)); - return true; - } - - return false; -} - -} // namespace dftracer::utils::utilities::indexer::internal::gzip diff --git a/src/dftracer/utils/utilities/indexer/internal/helpers.cpp b/src/dftracer/utils/utilities/indexer/internal/helpers.cpp index 87789f53..e16da3ee 100644 --- a/src/dftracer/utils/utilities/indexer/internal/helpers.cpp +++ b/src/dftracer/utils/utilities/indexer/internal/helpers.cpp @@ -23,6 +23,21 @@ std::string get_logical_path(std::string_view path) { return fs_path.filename().string(); } +std::string normalize_index_root(std::string_view path) { + fs::path input{std::string(path)}; + if (input.filename() == ".dftindex") { + return input.string(); + } + if (input.parent_path().filename() == ".dftindex") { + return input.parent_path().string(); + } + if (input.extension() == ".idx" || input.extension() == ".pidx" || + input.has_extension()) { + return (input.parent_path() / ".dftindex").string(); + } + return (input / ".dftindex").string(); +} + time_t 
get_file_modification_time(const std::string &file_path) { #if defined(DFTRACER_UTILS_USE_STD_FS) // Use std::filesystem when available and working @@ -118,12 +133,10 @@ std::uint64_t file_size_bytes(const std::string &path) { ::close(fd); if (pos < 0) return 0; return static_cast(pos); - if (pos < 0) return 0; - return static_cast(pos); } -bool index_exists_and_valid(const std::string &idx_path) { - return fs::exists(idx_path) && fs::is_regular_file(idx_path); +bool index_exists_and_valid(const std::string &index_path) { + return fs::exists(index_path) && fs::is_directory(index_path); } } // namespace dftracer::utils::utilities::indexer::internal diff --git a/src/dftracer/utils/utilities/indexer/internal/helpers.h b/src/dftracer/utils/utilities/indexer/internal/helpers.h index 024072f3..05504215 100644 --- a/src/dftracer/utils/utilities/indexer/internal/helpers.h +++ b/src/dftracer/utils/utilities/indexer/internal/helpers.h @@ -9,10 +9,11 @@ namespace dftracer::utils::utilities::indexer::internal { std::string get_logical_path(std::string_view path); +std::string normalize_index_root(std::string_view path); time_t get_file_modification_time(const std::string &file_path); std::uint64_t calculate_file_hash(const std::string &file_path); std::uint64_t file_size_bytes(const std::string &path); -bool index_exists_and_valid(const std::string &idx_path); +bool index_exists_and_valid(const std::string &index_path); } // namespace dftracer::utils::utilities::indexer::internal diff --git a/src/dftracer/utils/utilities/indexer/internal/indexer_c.cpp b/src/dftracer/utils/utilities/indexer/internal/indexer_c.cpp index 6b32f2d8..637ab38e 100644 --- a/src/dftracer/utils/utilities/indexer/internal/indexer_c.cpp +++ b/src/dftracer/utils/utilities/indexer/internal/indexer_c.cpp @@ -18,10 +18,10 @@ static std::shared_ptr *cast_indexer(dft_indexer_handle_t indexer) { } dft_indexer_handle_t dft_indexer_create(const char *gz_path, - const char *idx_path, + const char *index_path, 
uint64_t checkpoint_size, int force_rebuild) { - if (!gz_path || !idx_path || checkpoint_size == 0) { + if (!gz_path || !index_path || checkpoint_size == 0) { DFTRACER_UTILS_LOG_ERROR("%s", "Invalid parameters for indexer creation"); return nullptr; @@ -29,7 +29,7 @@ dft_indexer_handle_t dft_indexer_create(const char *gz_path, try { auto indexer = IndexerFactory::create( - gz_path, idx_path, checkpoint_size, force_rebuild != 0); + gz_path, index_path, checkpoint_size, force_rebuild != 0); if (indexer) { return static_cast( new std::shared_ptr(indexer)); diff --git a/src/dftracer/utils/utilities/indexer/internal/indexer_factory.cpp b/src/dftracer/utils/utilities/indexer/internal/indexer_factory.cpp index 5ffc9c67..ab558a18 100644 --- a/src/dftracer/utils/utilities/indexer/internal/indexer_factory.cpp +++ b/src/dftracer/utils/utilities/indexer/internal/indexer_factory.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -7,12 +8,13 @@ namespace dftracer::utils::utilities::indexer::internal { std::shared_ptr IndexerFactory::create(const std::string &archive_path, - const std::string &idx_path, + const std::string &index_path, std::uint64_t checkpoint_size, bool force) { ArchiveFormat format = FormatDetector::detect(archive_path); - std::string final_idx_path = - idx_path.empty() ? generate_index_path(archive_path, format) : idx_path; + std::string final_idx_path = index_path.empty() + ? 
generate_index_path(archive_path, format) + : index_path; switch (format) { case ArchiveFormat::GZIP: @@ -38,25 +40,23 @@ ArchiveFormat IndexerFactory::detect_format(const std::string &archive_path) { std::string IndexerFactory::generate_index_path(const std::string &archive_path, ArchiveFormat format) { - // Auto-detect format if not specified if (format == ArchiveFormat::UNKNOWN) { format = FormatDetector::detect(archive_path); } switch (format) { case ArchiveFormat::GZIP: - return archive_path + ".idx"; - case ArchiveFormat::TAR_GZ: - return archive_path + ".idx.tar"; + return composites::dft::internal::determine_index_path(archive_path, + ""); case ArchiveFormat::UNKNOWN: default: - // Fallback to generic .idx extension DFTRACER_UTILS_LOG_WARN( - "Unknown format for %s, using generic .idx extension", + "Unknown format for %s, using root-local .dftindex", archive_path.c_str()); - return archive_path + ".idx"; + return composites::dft::internal::determine_index_path(archive_path, + ""); } } diff --git a/src/dftracer/utils/utilities/indexer/internal/sqlite/database.h b/src/dftracer/utils/utilities/indexer/internal/sqlite/database.h deleted file mode 100644 index 9fff3dfa..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/sqlite/database.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_SQLITE_DATABASE_H -#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_SQLITE_DATABASE_H - -// Forwarding header: SqliteDatabase has moved to core/sqlite/. -// This header re-exports it into the indexer::internal namespace -// for backward compatibility. 
-#include - -namespace dftracer::utils::utilities::indexer::internal { -using dftracer::utils::sqlite::SqliteDatabase; -} // namespace dftracer::utils::utilities::indexer::internal - -#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_SQLITE_DATABASE_H diff --git a/src/dftracer/utils/utilities/indexer/internal/sqlite/statement.h b/src/dftracer/utils/utilities/indexer/internal/sqlite/statement.h deleted file mode 100644 index 6e831833..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/sqlite/statement.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_SQLITE_STATEMENT_H -#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_SQLITE_STATEMENT_H - -// Forwarding header: SqliteStmt has moved to core/sqlite/. -// This header re-exports it into the indexer::internal namespace -// for backward compatibility. -#include - -namespace dftracer::utils::utilities::indexer::internal { -using dftracer::utils::sqlite::SqliteStmt; -} // namespace dftracer::utils::utilities::indexer::internal - -#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_SQLITE_STATEMENT_H diff --git a/src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_archive_metadata_record.cpp b/src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_archive_metadata_record.cpp deleted file mode 100644 index 64f8e3d6..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_archive_metadata_record.cpp +++ /dev/null @@ -1,32 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::tar { - -void insert_archive_metadata_record(const SqliteDatabase &db, int archive_id, - std::size_t ckpt_size, - std::uint64_t total_lines, - std::uint64_t total_uc_size) { - SqliteStmt stmt(db, - "INSERT INTO metadata(archive_id, checkpoint_size, " - "total_lines, total_uc_size) " - "VALUES(?, ?, ?, ?) 
" - "ON CONFLICT(archive_id) DO UPDATE SET " - "checkpoint_size=excluded.checkpoint_size, " - "total_lines=excluded.total_lines, " - "total_uc_size=excluded.total_uc_size;"); - - stmt.bind_int(1, archive_id); - stmt.bind_int64(2, static_cast(ckpt_size)); - stmt.bind_int64(3, static_cast(total_lines)); - stmt.bind_int64(4, static_cast(total_uc_size)); - - int rc = sqlite3_step(stmt); - if (rc != SQLITE_ROW && rc != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Insert archive metadata failed: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -} // namespace dftracer::utils::utilities::indexer::internal::tar \ No newline at end of file diff --git a/src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_archive_record.cpp b/src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_archive_record.cpp deleted file mode 100644 index 0917fcbe..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_archive_record.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::tar { - -void insert_archive_record(const SqliteDatabase &db, int file_id, - const std::string &archive_name, - std::uint64_t uncompressed_size, - std::uint64_t total_files, int &archive_id) { - SqliteStmt stmt(db, - "INSERT INTO tar_archives(file_id, archive_name, " - "uncompressed_size, total_files) " - "VALUES(?, ?, ?, ?) 
" - "ON CONFLICT(file_id) DO UPDATE SET " - "archive_name=excluded.archive_name, " - "uncompressed_size=excluded.uncompressed_size, " - "total_files=excluded.total_files " - "RETURNING id;"); - - stmt.bind_int(1, file_id); - stmt.bind_text(2, archive_name); - stmt.bind_int64(3, static_cast(uncompressed_size)); - stmt.bind_int64(4, static_cast(total_files)); - - int rc = sqlite3_step(stmt); - if (rc != SQLITE_ROW && rc != SQLITE_DONE) { - throw IndexerError( - IndexerError::Type::DATABASE_ERROR, - "Insert archive failed: " + std::string(sqlite3_errmsg(db.get()))); - } - - archive_id = sqlite3_column_int(stmt, 0); -} - -} // namespace dftracer::utils::utilities::indexer::internal::tar \ No newline at end of file diff --git a/src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_file_record.cpp b/src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_file_record.cpp deleted file mode 100644 index 5795f667..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_file_record.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::tar { - -void insert_file_record(const SqliteDatabase &db, - const std::string &tar_gz_path_logical_path, - std::size_t bytes, std::time_t file_mtime, - std::uint64_t file_hash, int &db_file_id) { - SqliteStmt stmt(db, - "INSERT INTO files(logical_name, byte_size, " - "mtime_unix, hash) " - "VALUES(?, ?, ?, ?) 
" - "ON CONFLICT(logical_name) DO UPDATE SET " - "byte_size=excluded.byte_size, " - "mtime_unix=excluded.mtime_unix, " - "hash=excluded.hash " - "RETURNING id;"); - - stmt.bind_text(1, tar_gz_path_logical_path); - stmt.bind_int64(2, static_cast(bytes)); - stmt.bind_int64(3, static_cast(file_mtime)); - stmt.bind_int64(4, static_cast(file_hash)); - - int rc = sqlite3_step(stmt); - if (rc != SQLITE_ROW && rc != SQLITE_DONE) { - throw IndexerError( - IndexerError::Type::DATABASE_ERROR, - "Insert failed: " + std::string(sqlite3_errmsg(db.get()))); - } - - db_file_id = sqlite3_column_int(stmt, 0); -} - -} // namespace dftracer::utils::utilities::indexer::internal::tar diff --git a/src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_tar_checkpoint_record.cpp b/src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_tar_checkpoint_record.cpp deleted file mode 100644 index 9185fea0..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_tar_checkpoint_record.cpp +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::tar { - -void insert_tar_checkpoint_record(const SqliteDatabase &db, int archive_id, - const InsertTarCheckpointData &data) { - SqliteStmt stmt( - db, - "INSERT INTO tar_gzip_checkpoints(archive_id, checkpoint_idx, " - "uc_offset, uc_size, c_offset, c_size, bits, dict_compressed, " - "num_lines, first_line_num, last_line_num, tar_files_count) " - "VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"); - - stmt.bind_int(1, archive_id); - stmt.bind_int64(2, static_cast(data.idx)); - stmt.bind_int64(3, static_cast(data.uc_offset)); - stmt.bind_int64(4, static_cast(data.uc_size)); - stmt.bind_int64(5, static_cast(data.c_offset)); - stmt.bind_int64(6, static_cast(data.c_size)); - stmt.bind_int(7, data.bits); - stmt.bind_blob(8, data.compressed_dict, - static_cast(data.compressed_dict_size)); - stmt.bind_int64(9, static_cast(data.num_lines)); - stmt.bind_int64(10, 
static_cast(data.first_line_num)); - stmt.bind_int64(11, static_cast(data.last_line_num)); - stmt.bind_int64(12, static_cast(data.tar_files_count)); - - int rc = sqlite3_step(stmt); - if (rc != SQLITE_ROW && rc != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Insert TAR checkpoint failed: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -} // namespace dftracer::utils::utilities::indexer::internal::tar diff --git a/src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_tar_file_record.cpp b/src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_tar_file_record.cpp deleted file mode 100644 index c5d2f2a5..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/tar/queries/insert_tar_file_record.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::tar { - -void insert_tar_file_record(const SqliteDatabase &db, int archive_id, - const InsertTarFileData &data) { - SqliteStmt stmt(db, - "INSERT INTO tar_files(archive_id, file_name, file_size, " - "file_mtime, typeflag, data_offset, uncompressed_offset) " - "VALUES(?, ?, ?, ?, ?, ?, ?);"); - - stmt.bind_int(1, archive_id); - stmt.bind_text(2, data.file_name); - stmt.bind_int64(3, static_cast(data.file_size)); - stmt.bind_int64(4, static_cast(data.file_mtime)); - stmt.bind_text(5, std::string(1, data.typeflag)); - stmt.bind_int64(6, static_cast(data.data_offset)); - stmt.bind_int64(7, static_cast(data.uncompressed_offset)); - - int rc = sqlite3_step(stmt); - if (rc != SQLITE_ROW && rc != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Insert TAR file record failed: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -} // namespace dftracer::utils::utilities::indexer::internal::tar \ No newline at end of file diff --git a/src/dftracer/utils/utilities/indexer/internal/tar/queries/queries.h b/src/dftracer/utils/utilities/indexer/internal/tar/queries/queries.h deleted file 
mode 100644 index 395729e0..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/tar/queries/queries.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_TAR_QUERIES_H -#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_TAR_QUERIES_H - -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::tar { - -using dftracer::utils::sqlite::SqliteDatabase; -using dftracer::utils::sqlite::SqliteStmt; - -// File and archive management -void insert_file_record(const SqliteDatabase &db, - const std::string &tar_gz_path_logical_path, - std::size_t bytes, std::time_t file_mtime, - std::uint64_t file_sha256, int &db_file_id); - -void insert_archive_record(const SqliteDatabase &db, int file_id, - const std::string &archive_name, - std::uint64_t uncompressed_size, - std::uint64_t total_files, int &archive_id); - -void insert_archive_metadata_record(const SqliteDatabase &db, int archive_id, - std::size_t ckpt_size, - std::uint64_t total_lines, - std::uint64_t total_uc_size); - -bool query_stored_file_info(const SqliteDatabase &db, - const std::string &tar_gz_path, - std::uint64_t &stored_hash, - std::time_t &stored_mtime); - -// TAR file entries -struct InsertTarFileData { - std::string file_name; - std::uint64_t file_size; - std::uint64_t file_mtime; - char typeflag; - std::uint64_t data_offset; - std::uint64_t uncompressed_offset; -}; - -void insert_tar_file_record(const SqliteDatabase &db, int archive_id, - const InsertTarFileData &data); - -std::vector query_tar_files(const SqliteDatabase &db, - int archive_id); - -bool query_tar_file(const SqliteDatabase &db, int archive_id, - const std::string &file_name, - TarIndexer::TarFileInfo &file_info); - -std::vector query_tar_files_in_range( - const SqliteDatabase &db, int archive_id, std::uint64_t start_offset, - std::uint64_t end_offset); - -// GZIP checkpoints for TAR archives -struct InsertTarCheckpointData { - 
std::uint64_t idx; - std::uint64_t uc_offset; - std::uint64_t uc_size; - std::uint64_t c_size; - std::uint64_t c_offset; - int bits; - const void *compressed_dict; - std::size_t compressed_dict_size; - std::uint64_t num_lines; - std::uint64_t first_line_num; - std::uint64_t last_line_num; - std::uint64_t tar_files_count; -}; - -void insert_tar_checkpoint_record(const SqliteDatabase &db, int archive_id, - const InsertTarCheckpointData &data); - -// Database queries -bool query_schema_validity(const SqliteDatabase &db); -bool delete_archive_record(const SqliteDatabase &db, int archive_id); -std::uint64_t query_max_bytes(const SqliteDatabase &db, - const std::string &tar_gz_path_logical_path); -std::uint64_t query_num_lines(const SqliteDatabase &db, - const std::string &tar_gz_path_logical_path); -std::uint64_t query_num_files(const SqliteDatabase &db, - const std::string &tar_gz_path_logical_path); -std::string query_archive_name(const SqliteDatabase &db, - const std::string &tar_gz_path_logical_path); -int query_archive_id(const SqliteDatabase &db, - const std::string &tar_gz_path_logical_path); - -bool query_tar_checkpoint(const SqliteDatabase &db, std::size_t target_offset, - int archive_id, IndexerCheckpoint &checkpoint); -std::vector query_tar_checkpoints(const SqliteDatabase &db, - int archive_id); -std::vector query_tar_checkpoints_for_line_range( - const SqliteDatabase &db, int archive_id, std::uint64_t start_line, - std::uint64_t end_line); -std::uint64_t query_checkpoint_size(const SqliteDatabase &db, int archive_id); - -} // namespace dftracer::utils::utilities::indexer::internal::tar - -#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_TAR_QUERIES_H diff --git a/src/dftracer/utils/utilities/indexer/internal/tar/queries/query_archive_id.cpp b/src/dftracer/utils/utilities/indexer/internal/tar/queries/query_archive_id.cpp deleted file mode 100644 index 2ea994ba..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/tar/queries/query_archive_id.cpp 
+++ /dev/null @@ -1,28 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::tar { - -int query_archive_id(const SqliteDatabase &db, - const std::string &tar_gz_path_logical_path) { - SqliteStmt stmt(db, - "SELECT ta.id " - "FROM tar_archives ta " - "JOIN files f ON ta.file_id = f.id " - "WHERE f.logical_name = ?;"); - - stmt.bind_text(1, tar_gz_path_logical_path); - - int rc = sqlite3_step(stmt); - if (rc == SQLITE_ROW) { - return sqlite3_column_int(stmt, 0); - } else if (rc == SQLITE_DONE) { - return -1; // Not found - } else { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Query archive ID failed: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -} // namespace dftracer::utils::utilities::indexer::internal::tar \ No newline at end of file diff --git a/src/dftracer/utils/utilities/indexer/internal/tar/queries/query_metadata.cpp b/src/dftracer/utils/utilities/indexer/internal/tar/queries/query_metadata.cpp deleted file mode 100644 index 413f3dda..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/tar/queries/query_metadata.cpp +++ /dev/null @@ -1,143 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::tar { - -std::uint64_t query_max_bytes(const SqliteDatabase &db, - const std::string &tar_gz_path_logical_path) { - SqliteStmt stmt(db, - "SELECT m.total_uc_size " - "FROM metadata m " - "JOIN tar_archives ta ON m.archive_id = ta.id " - "JOIN files f ON ta.file_id = f.id " - "WHERE f.logical_name = ?;"); - - stmt.bind_text(1, tar_gz_path_logical_path); - - if (sqlite3_step(stmt) == SQLITE_ROW) { - return static_cast(sqlite3_column_int64(stmt, 0)); - } - - return 0; -} - -std::uint64_t query_num_lines(const SqliteDatabase &db, - const std::string &tar_gz_path_logical_path) { - SqliteStmt stmt(db, - "SELECT m.total_lines " - "FROM metadata m " - "JOIN tar_archives ta ON m.archive_id = ta.id " - "JOIN files f ON ta.file_id = f.id " - "WHERE f.logical_name = ?;"); - - 
stmt.bind_text(1, tar_gz_path_logical_path); - - if (sqlite3_step(stmt) == SQLITE_ROW) { - return static_cast(sqlite3_column_int64(stmt, 0)); - } - - return 0; -} - -std::uint64_t query_num_files(const SqliteDatabase &db, - const std::string &tar_gz_path_logical_path) { - SqliteStmt stmt(db, - "SELECT ta.total_files " - "FROM tar_archives ta " - "JOIN files f ON ta.file_id = f.id " - "WHERE f.logical_name = ?;"); - - stmt.bind_text(1, tar_gz_path_logical_path); - - if (sqlite3_step(stmt) == SQLITE_ROW) { - return static_cast(sqlite3_column_int64(stmt, 0)); - } - - return 0; -} - -std::string query_archive_name(const SqliteDatabase &db, - const std::string &tar_gz_path_logical_path) { - SqliteStmt stmt(db, - "SELECT ta.archive_name " - "FROM tar_archives ta " - "JOIN files f ON ta.file_id = f.id " - "WHERE f.logical_name = ?;"); - - stmt.bind_text(1, tar_gz_path_logical_path); - - if (sqlite3_step(stmt) == SQLITE_ROW) { - const char *name = - reinterpret_cast(sqlite3_column_text(stmt, 0)); - return name ? 
std::string(name) : ""; - } - - return ""; -} - -std::uint64_t query_checkpoint_size(const SqliteDatabase &db, int archive_id) { - SqliteStmt stmt(db, - "SELECT checkpoint_size " - "FROM metadata " - "WHERE archive_id = ?;"); - - stmt.bind_int(1, archive_id); - - if (sqlite3_step(stmt) == SQLITE_ROW) { - return static_cast(sqlite3_column_int64(stmt, 0)); - } - - return 0; -} - -bool query_stored_file_info(const SqliteDatabase &db, - const std::string &tar_gz_path, - std::uint64_t &stored_hash, - std::time_t &stored_mtime) { - SqliteStmt stmt(db, - "SELECT hash, mtime_unix " - "FROM files " - "WHERE logical_name = ?;"); - - stmt.bind_text(1, tar_gz_path); - - if (sqlite3_step(stmt) == SQLITE_ROW) { - std::uint64_t hash = - static_cast(sqlite3_column_int64(stmt, 0)); - if (hash == 0) { - return false; // No valid hash stored - } - stored_hash = hash; - stored_mtime = static_cast(sqlite3_column_int64(stmt, 1)); - return true; - } - - return false; -} - -bool query_schema_validity(const SqliteDatabase &db) { - try { - SqliteStmt stmt(db, - "SELECT COUNT(*) FROM sqlite_master WHERE type='table' " - "AND name IN ('files', 'tar_archives', 'tar_files', " - "'tar_gzip_checkpoints', 'metadata');"); - - if (sqlite3_step(stmt) == SQLITE_ROW) { - int table_count = sqlite3_column_int(stmt, 0); - return table_count == 5; // Should have all 5 tables - } - } catch (...) 
{ - return false; - } - return false; -} - -bool delete_archive_record(const SqliteDatabase &db, int archive_id) { - SqliteStmt stmt(db, "DELETE FROM tar_archives WHERE id = ?;"); - stmt.bind_int(1, archive_id); - - int rc = sqlite3_step(stmt); - return rc == SQLITE_DONE; -} - -} // namespace dftracer::utils::utilities::indexer::internal::tar diff --git a/src/dftracer/utils/utilities/indexer/internal/tar/queries/query_tar_checkpoints.cpp b/src/dftracer/utils/utilities/indexer/internal/tar/queries/query_tar_checkpoints.cpp deleted file mode 100644 index 62a47d31..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/tar/queries/query_tar_checkpoints.cpp +++ /dev/null @@ -1,165 +0,0 @@ -#include -#include -#include - -#include - -namespace dftracer::utils::utilities::indexer::internal::tar { - -bool query_tar_checkpoint(const SqliteDatabase &db, std::size_t target_offset, - int archive_id, IndexerCheckpoint &checkpoint) { - SqliteStmt stmt( - db, - "SELECT checkpoint_idx, uc_offset, uc_size, c_offset, c_size, " - "bits, dict_compressed, num_lines, first_line_num, last_line_num " - "FROM tar_gzip_checkpoints " - "WHERE archive_id = ? AND uc_offset <= ? 
" - "ORDER BY uc_offset DESC " - "LIMIT 1;"); - - stmt.bind_int(1, archive_id); - stmt.bind_int64(2, static_cast(target_offset)); - - if (sqlite3_step(stmt) == SQLITE_ROW) { - checkpoint.checkpoint_idx = - static_cast(sqlite3_column_int64(stmt, 0)); - checkpoint.uc_offset = - static_cast(sqlite3_column_int64(stmt, 1)); - checkpoint.uc_size = - static_cast(sqlite3_column_int64(stmt, 2)); - checkpoint.c_offset = - static_cast(sqlite3_column_int64(stmt, 3)); - checkpoint.c_size = - static_cast(sqlite3_column_int64(stmt, 4)); - checkpoint.bits = sqlite3_column_int(stmt, 5); - - // Copy compressed dictionary - const void *dict_data = sqlite3_column_blob(stmt, 6); - int dict_size = sqlite3_column_bytes(stmt, 6); - if (dict_data && dict_size > 0) { - checkpoint.dict_compressed.resize(dict_size); - std::memcpy(checkpoint.dict_compressed.data(), dict_data, - dict_size); - } - - checkpoint.num_lines = - static_cast(sqlite3_column_int64(stmt, 7)); - checkpoint.first_line_num = - static_cast(sqlite3_column_int64(stmt, 8)); - checkpoint.last_line_num = - static_cast(sqlite3_column_int64(stmt, 9)); - - return true; - } - - return false; -} - -std::vector query_tar_checkpoints(const SqliteDatabase &db, - int archive_id) { - std::vector checkpoints; - - SqliteStmt stmt( - db, - "SELECT checkpoint_idx, uc_offset, uc_size, c_offset, c_size, " - "bits, dict_compressed, num_lines, first_line_num, last_line_num " - "FROM tar_gzip_checkpoints " - "WHERE archive_id = ? 
" - "ORDER BY checkpoint_idx;"); - - stmt.bind_int(1, archive_id); - - while (sqlite3_step(stmt) == SQLITE_ROW) { - IndexerCheckpoint checkpoint; - checkpoint.checkpoint_idx = - static_cast(sqlite3_column_int64(stmt, 0)); - checkpoint.uc_offset = - static_cast(sqlite3_column_int64(stmt, 1)); - checkpoint.uc_size = - static_cast(sqlite3_column_int64(stmt, 2)); - checkpoint.c_offset = - static_cast(sqlite3_column_int64(stmt, 3)); - checkpoint.c_size = - static_cast(sqlite3_column_int64(stmt, 4)); - checkpoint.bits = sqlite3_column_int(stmt, 5); - - // Copy compressed dictionary - const void *dict_data = sqlite3_column_blob(stmt, 6); - int dict_size = sqlite3_column_bytes(stmt, 6); - if (dict_data && dict_size > 0) { - checkpoint.dict_compressed.resize(dict_size); - std::memcpy(checkpoint.dict_compressed.data(), dict_data, - dict_size); - } - - checkpoint.num_lines = - static_cast(sqlite3_column_int64(stmt, 7)); - checkpoint.first_line_num = - static_cast(sqlite3_column_int64(stmt, 8)); - checkpoint.last_line_num = - static_cast(sqlite3_column_int64(stmt, 9)); - - checkpoints.push_back(checkpoint); - } - - return checkpoints; -} - -std::vector query_tar_checkpoints_for_line_range( - const SqliteDatabase &db, int archive_id, std::uint64_t start_line, - std::uint64_t end_line) { - std::vector checkpoints; - - SqliteStmt stmt( - db, - "SELECT checkpoint_idx, uc_offset, uc_size, c_offset, c_size, " - "bits, dict_compressed, num_lines, first_line_num, last_line_num " - "FROM tar_gzip_checkpoints " - "WHERE archive_id = ? AND " - "((first_line_num <= ? AND last_line_num >= ?) OR " - " (first_line_num <= ? 
AND last_line_num >= ?)) " - "ORDER BY checkpoint_idx;"); - - stmt.bind_int(1, archive_id); - stmt.bind_int64(2, static_cast(start_line)); - stmt.bind_int64(3, static_cast(start_line)); - stmt.bind_int64(4, static_cast(end_line)); - stmt.bind_int64(5, static_cast(end_line)); - - while (sqlite3_step(stmt) == SQLITE_ROW) { - IndexerCheckpoint checkpoint; - checkpoint.checkpoint_idx = - static_cast(sqlite3_column_int64(stmt, 0)); - checkpoint.uc_offset = - static_cast(sqlite3_column_int64(stmt, 1)); - checkpoint.uc_size = - static_cast(sqlite3_column_int64(stmt, 2)); - checkpoint.c_offset = - static_cast(sqlite3_column_int64(stmt, 3)); - checkpoint.c_size = - static_cast(sqlite3_column_int64(stmt, 4)); - checkpoint.bits = sqlite3_column_int(stmt, 5); - - // Copy compressed dictionary - const void *dict_data = sqlite3_column_blob(stmt, 6); - int dict_size = sqlite3_column_bytes(stmt, 6); - if (dict_data && dict_size > 0) { - checkpoint.dict_compressed.resize(dict_size); - std::memcpy(checkpoint.dict_compressed.data(), dict_data, - dict_size); - } - - checkpoint.num_lines = - static_cast(sqlite3_column_int64(stmt, 7)); - checkpoint.first_line_num = - static_cast(sqlite3_column_int64(stmt, 8)); - checkpoint.last_line_num = - static_cast(sqlite3_column_int64(stmt, 9)); - - checkpoints.push_back(checkpoint); - } - - return checkpoints; -} - -} // namespace dftracer::utils::utilities::indexer::internal::tar \ No newline at end of file diff --git a/src/dftracer/utils/utilities/indexer/internal/tar/queries/query_tar_files.cpp b/src/dftracer/utils/utilities/indexer/internal/tar/queries/query_tar_files.cpp deleted file mode 100644 index 98098908..00000000 --- a/src/dftracer/utils/utilities/indexer/internal/tar/queries/query_tar_files.cpp +++ /dev/null @@ -1,119 +0,0 @@ -#include -#include - -namespace dftracer::utils::utilities::indexer::internal::tar { - -std::vector query_tar_files(const SqliteDatabase &db, - int archive_id) { - std::vector files; - - SqliteStmt stmt(db, - 
"SELECT file_name, file_size, file_mtime, typeflag, " - "data_offset, uncompressed_offset " - "FROM tar_files " - "WHERE archive_id = ? " - "ORDER BY uncompressed_offset;"); - - stmt.bind_int(1, archive_id); - - while (sqlite3_step(stmt) == SQLITE_ROW) { - TarIndexer::TarFileInfo file_info; - file_info.file_name = - reinterpret_cast(sqlite3_column_text(stmt, 0)); - file_info.file_size = - static_cast(sqlite3_column_int64(stmt, 1)); - file_info.file_mtime = - static_cast(sqlite3_column_int64(stmt, 2)); - - const char *typeflag_str = - reinterpret_cast(sqlite3_column_text(stmt, 3)); - file_info.typeflag = typeflag_str ? typeflag_str[0] : '0'; - - file_info.data_offset = - static_cast(sqlite3_column_int64(stmt, 4)); - file_info.uncompressed_offset = - static_cast(sqlite3_column_int64(stmt, 5)); - - files.push_back(file_info); - } - - return files; -} - -bool query_tar_file(const SqliteDatabase &db, int archive_id, - const std::string &file_name, - TarIndexer::TarFileInfo &file_info) { - SqliteStmt stmt(db, - "SELECT file_name, file_size, file_mtime, typeflag, " - "data_offset, uncompressed_offset " - "FROM tar_files " - "WHERE archive_id = ? AND file_name = ?;"); - - stmt.bind_int(1, archive_id); - stmt.bind_text(2, file_name); - - if (sqlite3_step(stmt) == SQLITE_ROW) { - file_info.file_name = - reinterpret_cast(sqlite3_column_text(stmt, 0)); - file_info.file_size = - static_cast(sqlite3_column_int64(stmt, 1)); - file_info.file_mtime = - static_cast(sqlite3_column_int64(stmt, 2)); - - const char *typeflag_str = - reinterpret_cast(sqlite3_column_text(stmt, 3)); - file_info.typeflag = typeflag_str ? 
typeflag_str[0] : '0'; - - file_info.data_offset = - static_cast(sqlite3_column_int64(stmt, 4)); - file_info.uncompressed_offset = - static_cast(sqlite3_column_int64(stmt, 5)); - - return true; - } - - return false; -} - -std::vector query_tar_files_in_range( - const SqliteDatabase &db, int archive_id, std::uint64_t start_offset, - std::uint64_t end_offset) { - std::vector files; - - SqliteStmt stmt(db, - "SELECT file_name, file_size, file_mtime, typeflag, " - "data_offset, uncompressed_offset " - "FROM tar_files " - "WHERE archive_id = ? AND uncompressed_offset >= ? AND " - "uncompressed_offset < ? " - "ORDER BY uncompressed_offset;"); - - stmt.bind_int(1, archive_id); - stmt.bind_int64(2, static_cast(start_offset)); - stmt.bind_int64(3, static_cast(end_offset)); - - while (sqlite3_step(stmt) == SQLITE_ROW) { - TarIndexer::TarFileInfo file_info; - file_info.file_name = - reinterpret_cast(sqlite3_column_text(stmt, 0)); - file_info.file_size = - static_cast(sqlite3_column_int64(stmt, 1)); - file_info.file_mtime = - static_cast(sqlite3_column_int64(stmt, 2)); - - const char *typeflag_str = - reinterpret_cast(sqlite3_column_text(stmt, 3)); - file_info.typeflag = typeflag_str ? 
typeflag_str[0] : '0'; - - file_info.data_offset = - static_cast(sqlite3_column_int64(stmt, 4)); - file_info.uncompressed_offset = - static_cast(sqlite3_column_int64(stmt, 5)); - - files.push_back(file_info); - } - - return files; -} - -} // namespace dftracer::utils::utilities::indexer::internal::tar \ No newline at end of file diff --git a/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.cpp b/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.cpp index 40b14cc3..157553ed 100644 --- a/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.cpp +++ b/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.cpp @@ -1,122 +1,193 @@ #include #include #include -#include -#include +#include +#include #include #include #include -#include #include #include +#include +#include +#include -#include -#include -#include +#include #include namespace dftracer::utils::utilities::indexer::internal::tar { -// Import the SQL_SCHEMA from constants -extern const char *const &SQL_SCHEMA; +using dftracer::utils::utilities::indexer::IndexDatabase; +namespace rocks = dftracer::utils::rocksdb; -// Forward declare helper functions -static dftracer::utils::coro::CoroTask build_tar_index( - const SqliteDatabase &db, int archive_id, const std::string &tar_gz_path, - std::uint64_t ckpt_size); -static void init_tar_schema(const SqliteDatabase &db); +namespace { -TarIndexer::TarIndexer(const std::string &tar_gz_file_path, - const std::string &index_path, +std::string normalize_idx_path(const std::string& path) { + fs::path input(path); + if (input.filename() == ".dftindex") { + return input.string(); + } + if (input.parent_path().filename() == ".dftindex") { + return input.parent_path().string(); + } + if (input.has_extension()) { + return (input.parent_path() / ".dftindex").string(); + } + return (input / ".dftindex").string(); +} + +dftracer::utils::coro::CoroTask build_tar_index( + IndexDatabase& db, int file_id, const std::string& tar_gz_path, + 
std::uint64_t ckpt_size) { + int fd = ::open(tar_gz_path.c_str(), O_RDONLY); + if (fd < 0) { + co_return false; + } + + GzipInflater inflater; + off_t offset = 0; + if (!(co_await inflater.initialize(fd))) { + ::close(fd); + co_return false; + } + + std::uint64_t total_lines = 0; + std::uint64_t total_uc_size = 0; + std::uint64_t current_uc_offset = 0; + + TarParser parser; + std::vector accumulated_data; + accumulated_data.reserve(1024 * 1024); + + while (true) { + GzipInflaterResult result; + if (!(co_await inflater.read(fd, offset, result))) { + if (result.bytes_read == 0) { + break; + } + ::close(fd); + co_return false; + } + + if (result.bytes_read == 0) { + break; + } + + accumulated_data.insert(accumulated_data.end(), inflater.out_buffer, + inflater.out_buffer + result.bytes_read); + current_uc_offset += result.bytes_read; + total_lines += result.lines_found; + } + + std::vector tar_entries; + if (!parser.parse_headers(accumulated_data.data(), accumulated_data.size(), + 0, tar_entries)) { + DFTRACER_UTILS_LOG_DEBUG("%s", "Failed to parse TAR headers"); + } + + total_uc_size = current_uc_offset; + + auto* db_ptr = &db; + auto* tar_entries_ptr = &tar_entries; + const std::string archive_name = fs::path(tar_gz_path).filename().string(); + const auto* archive_name_ptr = &archive_name; + co_await rocks::run([db_ptr, file_id, ckpt_size, total_lines, total_uc_size, + tar_entries_ptr, archive_name_ptr] { + internal::TransactionScope txn(*db_ptr); + std::uint64_t regular_files = 0; + for (const auto& entry : *tar_entries_ptr) { + if (!entry.is_regular_file()) { + continue; + } + + ++regular_files; + db_ptr->insert_tar_file( + file_id, IndexDatabase::TarFileRecord{ + .file_name = entry.name, + .file_size = entry.size, + .file_mtime = entry.mtime, + .typeflag = entry.typeflag, + .data_offset = entry.data_offset, + .uncompressed_offset = entry.uncompressed_offset, + }); + } + + db_ptr->insert_file_metadata(file_id, ckpt_size, total_lines, + total_uc_size); + 
db_ptr->insert_tar_archive_metadata(file_id, *archive_name_ptr, + ckpt_size, total_lines, + total_uc_size, regular_files); + txn.commit(); + }); + + ::close(fd); + co_return true; +} + +} // namespace + +TarIndexer::TarIndexer(const std::string& tar_gz_file_path, + const std::string& index_path_value, std::uint64_t checkpoint_size, bool rebuild_force) : tar_gz_path(tar_gz_file_path), - idx_path(index_path), + tar_gz_path_logical_path(get_logical_path(tar_gz_file_path)), + index_path(normalize_idx_path(index_path_value)), ckpt_size(checkpoint_size), - force_rebuild(rebuild_force), - cached_is_valid(false), - cached_archive_id(-1), - cached_max_bytes(0), - cached_num_lines(0), - cached_num_files(0), - cached_checkpoint_size(0) { + force_rebuild(rebuild_force) { open(); } TarIndexer::~TarIndexer() { - try { - DFTRACER_UTILS_LOG_DEBUG("Destroying TarIndexer for %s", - tar_gz_path.c_str()); - if (db.is_open()) { - close(); - } - DFTRACER_UTILS_LOG_DEBUG("TarIndexer destruction completed for %s", - tar_gz_path.c_str()); - } catch (const std::exception &e) { - DFTRACER_UTILS_LOG_ERROR("Error during TarIndexer destruction: %s", - e.what()); - } catch (...) 
{ - DFTRACER_UTILS_LOG_ERROR("%s", - "Unknown error during TarIndexer destruction"); - } + DFTRACER_UTILS_LOG_DEBUG("Destroying TarIndexer for %s", + tar_gz_path.c_str()); + close(); } -TarIndexer::TarIndexer(TarIndexer &&other) noexcept +TarIndexer::TarIndexer(TarIndexer&& other) noexcept : tar_gz_path(std::move(other.tar_gz_path)), tar_gz_path_logical_path(std::move(other.tar_gz_path_logical_path)), - idx_path(std::move(other.idx_path)), + index_path(std::move(other.index_path)), ckpt_size(other.ckpt_size), force_rebuild(other.force_rebuild), - db(std::move(other.db)), - cached_is_valid(other.cached_is_valid), - cached_archive_id(other.cached_archive_id), - cached_max_bytes(other.cached_max_bytes), - cached_num_lines(other.cached_num_lines), - cached_num_files(other.cached_num_files), - cached_checkpoint_size(other.cached_checkpoint_size), + cached_is_valid(std::move(other.cached_is_valid)), + cached_archive_id(std::move(other.cached_archive_id)), + cached_max_bytes(std::move(other.cached_max_bytes)), + cached_num_lines(std::move(other.cached_num_lines)), + cached_num_files(std::move(other.cached_num_files)), + cached_checkpoint_size(std::move(other.cached_checkpoint_size)), cached_archive_name(std::move(other.cached_archive_name)), cached_checkpoints(std::move(other.cached_checkpoints)) {} -TarIndexer &TarIndexer::operator=(TarIndexer &&other) noexcept { +TarIndexer& TarIndexer::operator=(TarIndexer&& other) noexcept { if (this != &other) { tar_gz_path = std::move(other.tar_gz_path); tar_gz_path_logical_path = std::move(other.tar_gz_path_logical_path); - idx_path = std::move(other.idx_path); + index_path = std::move(other.index_path); ckpt_size = other.ckpt_size; force_rebuild = other.force_rebuild; - db = std::move(other.db); - cached_is_valid = other.cached_is_valid; - cached_archive_id = other.cached_archive_id; - cached_max_bytes = other.cached_max_bytes; - cached_num_lines = other.cached_num_lines; - cached_num_files = other.cached_num_files; - 
cached_checkpoint_size = other.cached_checkpoint_size; + std::scoped_lock lock(cache_mutex, other.cache_mutex); + cached_is_valid = std::move(other.cached_is_valid); + cached_archive_id = std::move(other.cached_archive_id); + cached_max_bytes = std::move(other.cached_max_bytes); + cached_num_lines = std::move(other.cached_num_lines); + cached_num_files = std::move(other.cached_num_files); + cached_checkpoint_size = std::move(other.cached_checkpoint_size); cached_archive_name = std::move(other.cached_archive_name); cached_checkpoints = std::move(other.cached_checkpoints); } return *this; } -void TarIndexer::open() { - DFTRACER_UTILS_LOG_DEBUG("Opening TAR indexer database: %s", - idx_path.c_str()); - - tar_gz_path_logical_path = get_logical_path(tar_gz_path); - - if (!db.open(idx_path)) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to open database at " + idx_path); - } -} +void TarIndexer::open() {} void TarIndexer::close() { - db.close(); - // Reset all cache - cached_is_valid = false; - cached_archive_id = -1; - cached_max_bytes = 0; - cached_num_lines = 0; - cached_num_files = 0; - cached_checkpoint_size = 0; + std::lock_guard lock(cache_mutex); + cached_is_valid.reset(); + cached_archive_id.reset(); + cached_max_bytes.reset(); + cached_num_lines.reset(); + cached_num_files.reset(); + cached_checkpoint_size.reset(); cached_archive_name.clear(); cached_checkpoints.clear(); } @@ -126,51 +197,55 @@ dftracer::utils::coro::CoroTask TarIndexer::build_async() const { co_return; } - co_await dftracer::utils::sqlite::run([&] { - init_tar_schema(db); - - int aid = find_archive_id(tar_gz_path_logical_path); - if (aid != -1) { - delete_archive_record(db, aid); - } - }); - - printf("Get modifcation time for %s\n", tar_gz_path.c_str()); - std::time_t mtime = get_file_modification_time(tar_gz_path); - printf("Calculate hash for %s\n", tar_gz_path.c_str()); - auto hash = calculate_file_hash(tar_gz_path); - printf("Get size for %s\n", tar_gz_path.c_str()); 
- std::uint64_t bytes = file_size_bytes(tar_gz_path); - // TODO: use determine_checkpoint_size like GZIP - std::uint64_t final_ckpt_size = ckpt_size; - - auto [file_id, archive_id] = co_await dftracer::utils::sqlite::run([&] { - int fid; - insert_file_record(db, tar_gz_path_logical_path, bytes, mtime, hash, - fid); - - std::string archive_name = fs::path(tar_gz_path).filename().string(); - int aid; - // Will update sizes later - insert_archive_record(db, fid, archive_name, 0, 0, aid); - return std::pair{fid, aid}; + IndexDatabase db(index_path); + const auto hash = calculate_file_hash(tar_gz_path); + const std::string logical = tar_gz_path_logical_path; + const auto* logical_ptr = &logical; + const int file_id = co_await rocks::run([db_ptr = &db, logical_ptr, hash] { + return db_ptr->get_or_create_file_info(*logical_ptr, hash); }); - if (!(co_await build_tar_index(db, archive_id, tar_gz_path, - final_ckpt_size))) { + if (!(co_await build_tar_index(db, file_id, tar_gz_path, ckpt_size))) { throw IndexerError(IndexerError::Type::BUILD_ERROR, "Failed to build TAR index for " + tar_gz_path); } - // Reset cache to force refresh + struct CacheSnapshot { + std::uint64_t checkpoint_size = 0; + std::uint64_t num_lines = 0; + std::uint64_t max_bytes = 0; + std::uint64_t num_files = 0; + std::string archive_name; + std::vector checkpoints; + }; + const std::string fallback_archive_name = + fs::path(tar_gz_path).filename().string(); + const auto* fallback_archive_name_ptr = &fallback_archive_name; + auto snapshot = + co_await rocks::run([db_ptr = &db, file_id, fallback_archive_name_ptr] { + CacheSnapshot cache; + cache.checkpoint_size = db_ptr->get_checkpoint_size(file_id); + cache.num_lines = db_ptr->get_num_lines(file_id); + cache.max_bytes = db_ptr->get_max_bytes(file_id); + if (auto metadata = db_ptr->query_tar_archive_metadata(file_id)) { + cache.num_files = metadata->total_files; + cache.archive_name = metadata->archive_name; + } else { + cache.archive_name = 
*fallback_archive_name_ptr; + } + cache.checkpoints = db_ptr->query_checkpoints(file_id); + return cache; + }); + + std::lock_guard lock(cache_mutex); cached_is_valid = true; - cached_archive_id = archive_id; - cached_max_bytes = 0; - cached_num_lines = 0; - cached_num_files = 0; - cached_checkpoint_size = final_ckpt_size; - cached_archive_name.clear(); - cached_checkpoints.clear(); + cached_archive_id = file_id; + cached_checkpoint_size = snapshot.checkpoint_size; + cached_num_lines = snapshot.num_lines; + cached_max_bytes = snapshot.max_bytes; + cached_num_files = snapshot.num_files; + cached_archive_name = std::move(snapshot.archive_name); + cached_checkpoints = std::move(snapshot.checkpoints); co_return; } @@ -180,260 +255,290 @@ bool TarIndexer::need_rebuild() const { } try { - // Check if index exists and has valid schema - if (!query_schema_validity(db)) { + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + const auto stored_hash = db.get_file_hash(tar_gz_path_logical_path); + if (!stored_hash.has_value()) { + return true; + } + + const int file_id = db.get_file_info_id(tar_gz_path_logical_path); + if (file_id < 0) { return true; } - // Check if file has been modified since last index - std::uint64_t stored_hash; - std::time_t stored_mtime; - if (query_stored_file_info(db, tar_gz_path_logical_path, stored_hash, - stored_mtime)) { - std::uint64_t current_hash = calculate_file_hash(tar_gz_path); - std::time_t current_mtime = get_file_modification_time(tar_gz_path); + if (db.get_checkpoint_size(file_id) == 0) { + return true; + } - return (stored_hash != current_hash || - stored_mtime != current_mtime); + if (!db.query_tar_archive_metadata(file_id).has_value()) { + return true; } + + return *stored_hash != calculate_file_hash(tar_gz_path); } catch (...) 
{ return true; } - - return true; // If we can't determine, rebuild to be safe } bool TarIndexer::is_valid() const { - if (!cached_is_valid) { + std::lock_guard lock(cache_mutex); + if (!cached_is_valid.has_value()) { try { - bool schema_valid = query_schema_validity(db); - bool has_data = (find_archive_id(tar_gz_path_logical_path) != -1); - cached_is_valid = schema_valid && has_data; + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + const auto file_id = db.get_file_info_id(tar_gz_path_logical_path); + cached_is_valid = + file_id != -1 && + db.query_tar_archive_metadata(file_id).has_value(); } catch (...) { cached_is_valid = false; } } - return cached_is_valid; + return *cached_is_valid; } bool TarIndexer::exists() const { - return fs::exists(idx_path) && fs::is_regular_file(idx_path); + return fs::exists(index_path) && fs::is_directory(index_path); } -const std::string &TarIndexer::get_idx_path() const { return idx_path; } +const std::string& TarIndexer::get_index_path() const { return index_path; } -const std::string &TarIndexer::get_archive_path() const { return tar_gz_path; } +const std::string& TarIndexer::get_archive_path() const { return tar_gz_path; } -const std::string &TarIndexer::get_tar_gz_path() const { return tar_gz_path; } +const std::string& TarIndexer::get_tar_gz_path() const { return tar_gz_path; } -std::uint64_t TarIndexer::get_checkpoint_size() const { return ckpt_size; } +std::uint64_t TarIndexer::get_checkpoint_size() const { + { + std::lock_guard lock(cache_mutex); + if (cached_checkpoint_size.has_value()) { + return *cached_checkpoint_size; + } + } + const int file_id = get_archive_id(); + if (file_id != -1) { + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + const auto value = db.get_checkpoint_size(file_id); + std::lock_guard lock(cache_mutex); + cached_checkpoint_size = value; + } + std::lock_guard lock(cache_mutex); + return 
cached_checkpoint_size.value_or(0); +} std::uint64_t TarIndexer::get_max_bytes() const { - if (cached_max_bytes == 0) { - cached_max_bytes = query_max_bytes(db, tar_gz_path_logical_path); + { + std::lock_guard lock(cache_mutex); + if (cached_max_bytes.has_value()) { + return *cached_max_bytes; + } + } + const int file_id = get_archive_id(); + if (file_id != -1) { + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + const auto value = db.get_max_bytes(file_id); + std::lock_guard lock(cache_mutex); + cached_max_bytes = value; } - return cached_max_bytes; + std::lock_guard lock(cache_mutex); + return cached_max_bytes.value_or(0); } std::uint64_t TarIndexer::get_num_lines() const { - if (cached_num_lines == 0) { - cached_num_lines = query_num_lines(db, tar_gz_path_logical_path); + { + std::lock_guard lock(cache_mutex); + if (cached_num_lines.has_value()) { + return *cached_num_lines; + } + } + const int file_id = get_archive_id(); + if (file_id != -1) { + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + const auto value = db.get_num_lines(file_id); + std::lock_guard lock(cache_mutex); + cached_num_lines = value; } - return cached_num_lines; + std::lock_guard lock(cache_mutex); + return cached_num_lines.value_or(0); } std::uint64_t TarIndexer::get_num_files() const { - if (cached_num_files == 0) { - cached_num_files = query_num_files(db, tar_gz_path_logical_path); + { + std::lock_guard lock(cache_mutex); + if (cached_num_files.has_value()) { + return *cached_num_files; + } + } + const int file_id = get_archive_id(); + if (file_id != -1) { + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + std::uint64_t value = 0; + if (auto metadata = db.query_tar_archive_metadata(file_id)) { + value = metadata->total_files; + } + std::lock_guard lock(cache_mutex); + cached_num_files = value; } - return cached_num_files; + std::lock_guard 
lock(cache_mutex); + return cached_num_files.value_or(0); } std::string TarIndexer::get_archive_name() const { - if (cached_archive_name.empty()) { - cached_archive_name = query_archive_name(db, tar_gz_path_logical_path); - if (cached_archive_name.empty()) { - cached_archive_name = fs::path(tar_gz_path).filename().string(); + { + std::lock_guard lock(cache_mutex); + if (!cached_archive_name.empty()) { + return cached_archive_name; } } + std::string value; + const int file_id = get_archive_id(); + if (file_id != -1) { + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + if (auto metadata = db.query_tar_archive_metadata(file_id)) { + value = metadata->archive_name; + } + } + if (value.empty()) { + value = fs::path(tar_gz_path).filename().string(); + } + std::lock_guard lock(cache_mutex); + cached_archive_name = value; return cached_archive_name; } int TarIndexer::get_archive_id() const { - if (cached_archive_id == -1) { + std::lock_guard lock(cache_mutex); + if (!cached_archive_id.has_value()) { cached_archive_id = find_archive_id(tar_gz_path_logical_path); } - return cached_archive_id; + return *cached_archive_id; } -int TarIndexer::find_archive_id(const std::string &tar_gz_file_path) const { - return query_archive_id(db, tar_gz_file_path); +int TarIndexer::find_archive_id(const std::string& tar_gz_file_path) const { + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + return db.get_file_info_id(tar_gz_file_path); } bool TarIndexer::find_checkpoint(std::size_t target_offset, - IndexerCheckpoint &checkpoint) const { - int archive_id = get_archive_id(); - if (archive_id == -1) return false; - return query_tar_checkpoint(db, target_offset, archive_id, checkpoint); + IndexerCheckpoint& checkpoint) const { + const int archive_id = get_archive_id(); + if (archive_id == -1) { + return false; + } + IndexDatabase db( + index_path, + 
dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + return db.find_checkpoint(archive_id, target_offset, checkpoint); } std::vector TarIndexer::get_checkpoints() const { - if (cached_checkpoints.empty()) { - int archive_id = get_archive_id(); - if (archive_id != -1) { - cached_checkpoints = query_tar_checkpoints(db, archive_id); + { + std::lock_guard lock(cache_mutex); + if (!cached_checkpoints.empty()) { + return cached_checkpoints; } } + const int archive_id = get_archive_id(); + if (archive_id != -1) { + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + auto checkpoints = db.query_checkpoints(archive_id); + std::lock_guard lock(cache_mutex); + cached_checkpoints = std::move(checkpoints); + } + std::lock_guard lock(cache_mutex); return cached_checkpoints; } std::vector TarIndexer::get_checkpoints_for_line_range( std::uint64_t start_line, std::uint64_t end_line) const { - int archive_id = get_archive_id(); - if (archive_id == -1) return {}; - return query_tar_checkpoints_for_line_range(db, archive_id, start_line, - end_line); + const int archive_id = get_archive_id(); + if (archive_id == -1) { + return {}; + } + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + return db.query_checkpoints_for_line_range(archive_id, start_line, + end_line); } std::vector TarIndexer::list_files() const { - int archive_id = get_archive_id(); - if (archive_id == -1) return {}; + const int archive_id = get_archive_id(); + if (archive_id == -1) { + return {}; + } - auto tar_files = query_tar_files(db, archive_id); + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + auto tar_files = db.query_tar_files(archive_id); std::vector result; result.reserve(tar_files.size()); - - for (const auto &tf : tar_files) { - result.emplace_back( - TarFileInfo{tf.file_name, tf.file_size, tf.file_mtime, tf.typeflag, - tf.data_offset, 
tf.uncompressed_offset}); + for (const auto& tf : tar_files) { + result.push_back(TarFileInfo{tf.file_name, tf.file_size, tf.file_mtime, + tf.typeflag, tf.data_offset, + tf.uncompressed_offset}); } - return result; } -bool TarIndexer::find_file(const std::string &file_name, - TarFileInfo &file_info) const { - int archive_id = get_archive_id(); - if (archive_id == -1) return false; +bool TarIndexer::find_file(const std::string& file_name, + TarFileInfo& file_info) const { + const int archive_id = get_archive_id(); + if (archive_id == -1) { + return false; + } + + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); + IndexDatabase::TarFileRecord record; + if (!db.find_tar_file(archive_id, file_name, record)) { + return false; + } - return query_tar_file(db, archive_id, file_name, file_info); + file_info = TarFileInfo{record.file_name, record.file_size, + record.file_mtime, record.typeflag, + record.data_offset, record.uncompressed_offset}; + return true; } std::vector TarIndexer::find_files_in_range( std::uint64_t start_offset, std::uint64_t end_offset) const { - int archive_id = get_archive_id(); - if (archive_id == -1) return {}; + const int archive_id = get_archive_id(); + if (archive_id == -1) { + return {}; + } + IndexDatabase db( + index_path, + dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly); auto tar_files = - query_tar_files_in_range(db, archive_id, start_offset, end_offset); + db.query_tar_files_in_range(archive_id, start_offset, end_offset); std::vector result; result.reserve(tar_files.size()); - - for (const auto &tf : tar_files) { - result.emplace_back( - TarFileInfo{tf.file_name, tf.file_size, tf.file_mtime, tf.typeflag, - tf.data_offset, tf.uncompressed_offset}); + for (const auto& tf : tar_files) { + result.push_back(TarFileInfo{tf.file_name, tf.file_size, tf.file_mtime, + tf.typeflag, tf.data_offset, + tf.uncompressed_offset}); } - return result; } -// Include the helper functions from the impl 
file -static void init_tar_schema(const SqliteDatabase &db) { - DFTRACER_UTILS_LOG_DEBUG("%s", "Initializing TAR indexer schema"); - int rc = sqlite3_exec(db.get(), SQL_SCHEMA, NULL, NULL, NULL); - if (rc != SQLITE_OK) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to initialize TAR schema: " + - std::string(sqlite3_errmsg(db.get()))); - } -} - -static dftracer::utils::coro::CoroTask build_tar_index( - const SqliteDatabase &db, int archive_id, const std::string &tar_gz_path, - std::uint64_t ckpt_size) { - int fd = ::open(tar_gz_path.c_str(), O_RDONLY); - if (fd < 0) { - co_return false; - } - - GzipInflater inflater; - off_t offset = 0; - if (!(co_await inflater.initialize(fd))) { - ::close(fd); - co_return false; - } - - std::uint64_t total_lines = 0; - std::uint64_t total_uc_size = 0; - std::uint64_t current_uc_offset = 0; - - // Parse TAR format and extract file entries - TarParser parser; - std::vector accumulated_data; - accumulated_data.reserve(1024 * 1024); // Pre-allocate 1MB - - while (true) { - // std::size_t chunk_start_uc = current_uc_offset; - // std::size_t chunk_start_c = inflater.get_total_input_consumed(); - - GzipInflaterResult result; - if (!(co_await inflater.read(fd, offset, result))) { - if (result.bytes_read == 0) { - break; // EOF - } - ::close(fd); - co_return false; // Error - } - - if (result.bytes_read == 0) { - break; // EOF - } - - // Accumulate data for TAR parsing - accumulated_data.insert(accumulated_data.end(), inflater.out_buffer, - inflater.out_buffer + result.bytes_read); - - current_uc_offset += result.bytes_read; - total_lines += result.lines_found; - } - - // Parse TAR entries from accumulated data - std::vector tar_entries; - if (!parser.parse_headers(accumulated_data.data(), accumulated_data.size(), - 0, tar_entries)) { - DFTRACER_UTILS_LOG_DEBUG( - "%s", "Failed to parse TAR headers from accumulated data"); - // Continue anyway - might be a malformed TAR or not actually TAR.GZ - } - - // Insert TAR 
file entries and metadata into database - total_uc_size = current_uc_offset; - co_await dftracer::utils::sqlite::run([&] { - for (const auto &entry : tar_entries) { - if (entry.is_regular_file()) { - InsertTarFileData file_data; - file_data.file_name = entry.name; - file_data.file_size = entry.size; - file_data.file_mtime = entry.mtime; - file_data.typeflag = entry.typeflag; - file_data.data_offset = entry.data_offset; - file_data.uncompressed_offset = entry.uncompressed_offset; - - insert_tar_file_record(db, archive_id, file_data); - } - } - - DFTRACER_UTILS_LOG_DEBUG("Parsed %zu TAR file entries", - tar_entries.size()); - - insert_archive_metadata_record(db, archive_id, ckpt_size, total_lines, - total_uc_size); - }); - - ::close(fd); - co_return true; -} - } // namespace dftracer::utils::utilities::indexer::internal::tar diff --git a/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.h b/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.h index 5fb79072..7889a2b1 100644 --- a/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.h +++ b/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.h @@ -4,25 +4,25 @@ #include #include #include -#include +#include #include #include #include #include +#include +#include #include #include namespace dftracer::utils::utilities::indexer::internal::tar { -using dftracer::utils::sqlite::SqliteDatabase; - class TarIndexer : public Indexer { public: static constexpr std::uint64_t DEFAULT_CHECKPOINT_SIZE = constants::indexer::DEFAULT_CHECKPOINT_SIZE; - TarIndexer(const std::string &tar_gz_path, const std::string &idx_path, + TarIndexer(const std::string &tar_gz_path, const std::string &index_path, std::uint64_t checkpoint_size = DEFAULT_CHECKPOINT_SIZE, bool force = false); ~TarIndexer(); @@ -35,7 +35,7 @@ class TarIndexer : public Indexer { bool need_rebuild() const override; bool exists() const override; - const std::string &get_idx_path() const override; + const std::string 
&get_index_path() const override; const std::string &get_archive_path() const override; const std::string &get_tar_gz_path() const; std::uint64_t get_checkpoint_size() const override; @@ -80,20 +80,20 @@ class TarIndexer : public Indexer { private: std::string tar_gz_path; std::string tar_gz_path_logical_path; - std::string idx_path; + std::string index_path; std::uint64_t ckpt_size; bool force_rebuild; - SqliteDatabase db; // Cached values - mutable bool cached_is_valid; - mutable int cached_archive_id; - mutable std::uint64_t cached_max_bytes; - mutable std::uint64_t cached_num_lines; - mutable std::uint64_t cached_num_files; - mutable std::uint64_t cached_checkpoint_size; + mutable std::optional cached_is_valid; + mutable std::optional cached_archive_id; + mutable std::optional cached_max_bytes; + mutable std::optional cached_num_lines; + mutable std::optional cached_num_files; + mutable std::optional cached_checkpoint_size; mutable std::string cached_archive_name; mutable std::vector cached_checkpoints; + mutable std::mutex cache_mutex; // Internal methods void open(); diff --git a/src/dftracer/utils/utilities/indexer/internal/transaction_scope.h b/src/dftracer/utils/utilities/indexer/internal/transaction_scope.h new file mode 100644 index 00000000..b23a23e1 --- /dev/null +++ b/src/dftracer/utils/utilities/indexer/internal/transaction_scope.h @@ -0,0 +1,39 @@ +#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_TRANSACTION_SCOPE_H +#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_TRANSACTION_SCOPE_H + +namespace dftracer::utils::utilities::indexer::internal { + +template +class TransactionScope { + public: + explicit TransactionScope(Database& db) : db_(db) { + db_.begin_transaction(); + } + + TransactionScope(const TransactionScope&) = delete; + TransactionScope& operator=(const TransactionScope&) = delete; + + TransactionScope(TransactionScope&& other) noexcept + : db_(other.db_), committed_(other.committed_) { + other.committed_ = true; + } + + 
~TransactionScope() { + if (!committed_) { + db_.rollback_transaction(); + } + } + + void commit() { + db_.commit_transaction(); + committed_ = true; + } + + private: + Database& db_; + bool committed_ = false; +}; + +} // namespace dftracer::utils::utilities::indexer::internal + +#endif // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_TRANSACTION_SCOPE_H diff --git a/src/dftracer/utils/utilities/indexer/provenance_database.cpp b/src/dftracer/utils/utilities/indexer/provenance_database.cpp index ca65ef2f..4896a54e 100644 --- a/src/dftracer/utils/utilities/indexer/provenance_database.cpp +++ b/src/dftracer/utils/utilities/indexer/provenance_database.cpp @@ -1,206 +1,426 @@ #include -#include -#include +#include #include +#include +#include #include +#include +#include + namespace dftracer::utils::utilities::indexer { -namespace queries = composites::dft::indexing::queries; +namespace rocks = dftracer::utils::rocksdb; -using dftracer::utils::sqlite::SqliteStmt; using internal::IndexerError; -static const char* PROVENANCE_SCHEMA = R"( - PRAGMA journal_mode=WAL; - PRAGMA busy_timeout=5000; - PRAGMA foreign_keys=ON; - - CREATE TABLE IF NOT EXISTS file_info ( - id INTEGER PRIMARY KEY, - path TEXT NOT NULL, - hash INTEGER - ); - - CREATE TABLE IF NOT EXISTS provenance_info ( - key TEXT PRIMARY KEY, - value TEXT - ); - - CREATE TABLE IF NOT EXISTS provenance_sources ( - source_idx INTEGER PRIMARY KEY, - file_info_id INTEGER NOT NULL DEFAULT 0, - path TEXT NOT NULL, - num_checkpoints INTEGER, - event_hash TEXT NOT NULL DEFAULT '' - ); - - CREATE TABLE IF NOT EXISTS provenance_group ( - id INTEGER PRIMARY KEY, - name TEXT, - predicate TEXT - ); - - CREATE TABLE IF NOT EXISTS provenance_segments ( - source_idx INTEGER, - source_checkpoint INTEGER, - output_line_start INTEGER, - output_line_end INTEGER, - event_count INTEGER - ); -)"; - -ProvenanceDatabase::ProvenanceDatabase(const std::string& pidx_path) - : db_(pidx_path) {} - -void ProvenanceDatabase::init_schema() { - char* 
err_msg = nullptr; - int rc = - sqlite3_exec(db_.get(), PROVENANCE_SCHEMA, nullptr, nullptr, &err_msg); - if (rc != SQLITE_OK) { - std::string error = - err_msg ? std::string(err_msg) : "Unknown schema error"; - if (err_msg) sqlite3_free(err_msg); - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to initialize provenance schema: " + error); +namespace { + +[[noreturn]] void throw_db_error(std::string_view message, + const ::rocksdb::Status& status) { + throw IndexerError(IndexerError::Type::DATABASE_ERROR, + std::string(message) + ": " + status.ToString()); +} + +std::string file_key(std::string_view path) { + return std::string("pf|") + std::string(path); +} + +std::string file_reverse_key(int file_info_id) { + std::string key("pr|"); + rocks::KeyCodec::append_be32(key, static_cast(file_info_id)); + return key; +} + +std::string next_file_id_key() { return "_next_prov_file_id"; } + +std::string encode_file_record(int file_info_id, std::uint64_t file_hash) { + std::string value; + rocks::KeyCodec::append_be32(value, + static_cast(file_info_id)); + rocks::KeyCodec::append_be64(value, file_hash); + return value; +} + +int decode_file_id(std::string_view value) { + if (value.size() < 4) { + throw std::runtime_error("Corrupt provenance file record"); + } + return static_cast(rocks::KeyCodec::decode_be32(value.substr(0, 4))); +} + +std::uint64_t decode_hash(std::string_view value) { + if (value.size() < 12) { + throw std::runtime_error("Corrupt provenance file record"); + } + return rocks::KeyCodec::decode_be64(value.substr(4, 8)); +} + +std::string source_key(int file_info_id, int source_idx) { + std::string key("ps|"); + rocks::KeyCodec::append_be32(key, static_cast(file_info_id)); + rocks::KeyCodec::append_be32(key, static_cast(source_idx)); + return key; +} + +std::string info_key(int file_info_id, std::string_view key_suffix) { + std::string key("pi|"); + rocks::KeyCodec::append_be32(key, static_cast(file_info_id)); + key.append(key_suffix); + 
return key; +} + +std::string group_prefix(int file_info_id) { + std::string key("pg|"); + rocks::KeyCodec::append_be32(key, static_cast(file_info_id)); + return key; +} + +std::string group_key(int file_info_id, std::string_view name) { + auto key = group_prefix(file_info_id); + key.append(name); + return key; +} + +std::string segment_key(int file_info_id, int source_idx, + int source_checkpoint) { + std::string key("px|"); + rocks::KeyCodec::append_be32(key, static_cast(file_info_id)); + rocks::KeyCodec::append_be32(key, static_cast(source_idx)); + rocks::KeyCodec::append_be32(key, + static_cast(source_checkpoint)); + return key; +} + +void append_string(std::string& out, std::string_view value) { + rocks::KeyCodec::append_be32(out, static_cast(value.size())); + out.append(value.data(), value.size()); +} + +void append_u32(std::string& out, std::uint32_t value) { + rocks::KeyCodec::append_be32(out, value); +} + +class Cursor { + public: + explicit Cursor(std::string_view data) : data_(data) {} + + std::uint32_t u32() { + auto part = take(4); + return rocks::KeyCodec::decode_be32(part); + } + + std::string str() { + const auto len = static_cast(u32()); + auto bytes = take(len); + return std::string(bytes.data(), bytes.size()); + } + + private: + std::string_view take(std::size_t len) { + if (offset_ + len > data_.size()) { + throw std::runtime_error("Corrupt provenance payload"); + } + auto part = data_.substr(offset_, len); + offset_ += len; + return part; } + + std::string_view data_; + std::size_t offset_ = 0; +}; + +template +void scan_prefix(const rocks::RocksDatabase& db, std::string_view prefix, + Fn&& fn) { + internal::scan_prefix_iterator( + "Failed to scan provenance prefix", prefix, + [&] { return db.new_iterator("provenance"); }, std::forward(fn)); } +} // namespace + +ProvenanceDatabase::ProvenanceDatabase(const std::string& provenance_path, + rocks::RocksDatabase::OpenMode open_mode) + : db_path_(internal::normalize_index_root(provenance_path)), + 
open_mode_(open_mode), + db_(rocks::RocksDBManager::instance().get_or_open(db_path_, open_mode_)) { + if (open_mode_ == rocks::RocksDatabase::OpenMode::ReadWrite) { + init_schema(); + } +} + +void ProvenanceDatabase::init_schema() {} + int ProvenanceDatabase::get_or_create_file_info(const std::string& path, std::uint64_t file_hash) { - { - SqliteStmt stmt(db_, "SELECT id, hash FROM file_info WHERE path = ?;"); - stmt.bind_text(1, path); - int rc = sqlite3_step(stmt); - if (rc == SQLITE_ROW) { - int id = sqlite3_column_int(stmt, 0); - auto stored_hash = - static_cast(sqlite3_column_int64(stmt, 1)); - if (stored_hash == file_hash) { - return id; - } - SqliteStmt del(db_, "DELETE FROM file_info WHERE id = ?;"); - del.bind_int(1, id); - sqlite3_step(del); + const auto key = file_key(path); + std::string value; + auto status = db_->get(key, &value, "provenance"); + if (status.ok()) { + const auto id = decode_file_id(value); + if (decode_hash(value) == file_hash) { + return id; } + const auto encoded = encode_file_record(id, file_hash); + status = txn_batch_ ? db_->put(*txn_batch_, "provenance", key, encoded) + : db_->put(key, encoded, "provenance"); + if (!status.ok()) { + throw_db_error("Failed to update provenance file info", status); + } + status = txn_batch_ + ? 
db_->put(*txn_batch_, "provenance", file_reverse_key(id), + path) + : db_->put(file_reverse_key(id), path, "provenance"); + if (!status.ok()) { + throw_db_error("Failed to update provenance reverse file info", + status); + } + return id; + } + if (!status.IsNotFound()) { + throw_db_error("Failed to query provenance file info", status); + } + + std::uint32_t next_id = 1; + std::string next_value; + status = db_->get(next_file_id_key(), &next_value, "provenance"); + if (status.ok()) { + next_id = rocks::KeyCodec::decode_be32(next_value); + } else if (!status.IsNotFound()) { + throw_db_error("Failed to read next provenance file id", status); } - SqliteStmt stmt(db_, "INSERT INTO file_info(path, hash) VALUES(?, ?);"); - stmt.bind_text(1, path); - stmt.bind_int64(2, static_cast(file_hash)); - int rc = sqlite3_step(stmt); - if (rc != SQLITE_DONE) { - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to insert file_info: " + - std::string(sqlite3_errmsg(db_.get()))); + const auto encoded = + encode_file_record(static_cast(next_id), file_hash); + const auto next_encoded = rocks::KeyCodec::encode_be32(next_id + 1); + if (txn_batch_) { + status = db_->put(*txn_batch_, "provenance", key, encoded); + if (!status.ok()) throw_db_error("Failed to insert file info", status); + status = db_->put(*txn_batch_, "provenance", file_reverse_key(next_id), + path); + if (!status.ok()) { + throw_db_error("Failed to insert reverse file info", status); + } + status = db_->put(*txn_batch_, "provenance", next_file_id_key(), + next_encoded); + if (!status.ok()) { + throw_db_error("Failed to update next provenance file id", status); + } + } else { + status = db_->put(key, encoded, "provenance"); + if (!status.ok()) throw_db_error("Failed to insert file info", status); + status = db_->put(file_reverse_key(next_id), path, "provenance"); + if (!status.ok()) { + throw_db_error("Failed to insert reverse file info", status); + } + status = db_->put(next_file_id_key(), next_encoded, 
"provenance"); + if (!status.ok()) { + throw_db_error("Failed to update next provenance file id", status); + } } - return static_cast(sqlite3_last_insert_rowid(db_.get())); + return static_cast(next_id); } int ProvenanceDatabase::get_file_info_id(const std::string& path) const { - SqliteStmt stmt(db_, "SELECT id FROM file_info WHERE path = ?;"); - stmt.bind_text(1, path); - int rc = sqlite3_step(stmt); - if (rc == SQLITE_ROW) { - return sqlite3_column_int(stmt, 0); + std::string value; + auto status = db_->get(file_key(path), &value, "provenance"); + if (status.IsNotFound()) { + return -1; } - return -1; + if (!status.ok()) { + throw_db_error("Failed to read provenance file info id", status); + } + return decode_file_id(value); } void ProvenanceDatabase::begin_transaction() { - char* err_msg = nullptr; - int rc = sqlite3_exec(db_.get(), "BEGIN TRANSACTION;", nullptr, nullptr, - &err_msg); - if (rc != SQLITE_OK) { - std::string error = err_msg ? std::string(err_msg) : "Unknown error"; - if (err_msg) sqlite3_free(err_msg); - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to begin transaction: " + error); - } + txn_batch_ = + std::make_unique(db_->begin_batch()); } void ProvenanceDatabase::commit_transaction() { - char* err_msg = nullptr; - int rc = sqlite3_exec(db_.get(), "COMMIT;", nullptr, nullptr, &err_msg); - if (rc != SQLITE_OK) { - std::string error = err_msg ? 
std::string(err_msg) : "Unknown error"; - if (err_msg) sqlite3_free(err_msg); - throw IndexerError(IndexerError::Type::DATABASE_ERROR, - "Failed to commit transaction: " + error); + if (!txn_batch_) { + return; + } + auto status = db_->commit_batch(*txn_batch_); + txn_batch_.reset(); + if (!status.ok()) { + throw_db_error("Failed to commit provenance RocksDB batch", status); } } +void ProvenanceDatabase::rollback_transaction() noexcept { txn_batch_.reset(); } + std::string determine_provenance_index_path(const std::string& data_path, const std::string& index_dir) { - fs::path p(data_path); - std::string filename = p.filename().string() + ".pidx"; - - if (!index_dir.empty()) { - return (fs::path(index_dir) / filename).string(); - } - - return (data_path + ".pidx"); + fs::path path = index_dir.empty() ? fs::path(data_path).parent_path() + : fs::path(index_dir); + return internal::normalize_index_root((path / ".dftindex").string()); } -// --------------------------------------------------------------------------- -// Provenance insert operations -// --------------------------------------------------------------------------- - -void ProvenanceDatabase::insert_info(std::string_view key, +void ProvenanceDatabase::insert_info(int file_info_id, std::string_view key, std::string_view value) { - queries::insert_provenance_info(db_, key, value); + const auto db_key = info_key(file_info_id, key); + auto status = txn_batch_ + ? 
db_->put(*txn_batch_, "provenance", db_key, value) + : db_->put(db_key, value, "provenance"); + if (!status.ok()) { + throw_db_error("Failed to insert provenance info", status); + } } void ProvenanceDatabase::insert_source(int file_info_id, int source_idx, std::string_view path, int num_checkpoints, std::string_view event_hash) { - queries::insert_provenance_source(db_, file_info_id, source_idx, path, - num_checkpoints, event_hash); + std::string value; + append_string(value, path); + append_u32(value, static_cast(num_checkpoints)); + append_string(value, event_hash); + auto status = txn_batch_ + ? db_->put(*txn_batch_, "provenance", + source_key(file_info_id, source_idx), value) + : db_->put(source_key(file_info_id, source_idx), value, + "provenance"); + if (!status.ok()) { + throw_db_error("Failed to insert provenance source", status); + } } -void ProvenanceDatabase::insert_group(std::string_view name, +void ProvenanceDatabase::insert_group(int file_info_id, std::string_view name, std::string_view predicate) { - queries::insert_provenance_group(db_, name, predicate); + const auto db_key = group_key(file_info_id, name); + auto status = txn_batch_ + ? db_->put(*txn_batch_, "provenance", db_key, + std::string(predicate)) + : db_->put(db_key, std::string(predicate), "provenance"); + if (!status.ok()) { + throw_db_error("Failed to insert provenance group", status); + } } -void ProvenanceDatabase::insert_segment(int source_idx, int source_checkpoint, +void ProvenanceDatabase::insert_segment(int file_info_id, int source_idx, + int source_checkpoint, int output_line_start, int output_line_end, int event_count) { - queries::insert_provenance_segment(db_, source_idx, source_checkpoint, - output_line_start, output_line_end, - event_count); + std::string value; + append_u32(value, static_cast(output_line_start)); + append_u32(value, static_cast(output_line_end)); + append_u32(value, static_cast(event_count)); + auto status = + txn_batch_ + ? 
db_->put(*txn_batch_, "provenance", + segment_key(file_info_id, source_idx, source_checkpoint), + value) + : db_->put(segment_key(file_info_id, source_idx, source_checkpoint), + value, "provenance"); + if (!status.ok()) { + throw_db_error("Failed to insert provenance segment", status); + } } -// --------------------------------------------------------------------------- -// Provenance query operations -// --------------------------------------------------------------------------- - std::vector ProvenanceDatabase::query_sources(int file_info_id) const { - return queries::query_provenance_sources(db_, file_info_id); + std::vector results; + std::string prefix("ps|"); + rocks::KeyCodec::append_be32(prefix, + static_cast(file_info_id)); + scan_prefix(*db_, prefix, [&](::rocksdb::Iterator& it) { + const auto key = std::string(it.key().data(), it.key().size()); + const auto value = std::string(it.value().data(), it.value().size()); + ProvenanceSource source; + source.source_idx = static_cast( + rocks::KeyCodec::decode_be32(std::string_view(key).substr(7, 4))); + Cursor cursor(value); + source.path = cursor.str(); + source.num_checkpoints = static_cast(cursor.u32()); + source.event_hash = cursor.str(); + results.push_back(std::move(source)); + }); + return results; } std::vector -ProvenanceDatabase::query_segments(int source_idx) const { - return queries::query_provenance_segments(db_, source_idx); +ProvenanceDatabase::query_segments(int file_info_id, int source_idx) const { + std::vector results; + std::string prefix("px|"); + rocks::KeyCodec::append_be32(prefix, + static_cast(file_info_id)); + rocks::KeyCodec::append_be32(prefix, + static_cast(source_idx)); + scan_prefix(*db_, prefix, [&](::rocksdb::Iterator& it) { + const auto key = std::string(it.key().data(), it.key().size()); + const auto value = std::string(it.value().data(), it.value().size()); + Cursor cursor(value); + ProvenanceSegment segment; + segment.source_idx = source_idx; + segment.source_checkpoint = 
static_cast( + rocks::KeyCodec::decode_be32(std::string_view(key).substr(11, 4))); + segment.output_line_start = static_cast(cursor.u32()); + segment.output_line_end = static_cast(cursor.u32()); + segment.event_count = static_cast(cursor.u32()); + results.push_back(std::move(segment)); + }); + return results; } std::vector -ProvenanceDatabase::query_all_segments() const { - return queries::query_all_provenance_segments(db_); +ProvenanceDatabase::query_all_segments(int file_info_id) const { + std::vector results; + std::string prefix("px|"); + rocks::KeyCodec::append_be32(prefix, + static_cast(file_info_id)); + scan_prefix(*db_, prefix, [&](::rocksdb::Iterator& it) { + const auto key = std::string(it.key().data(), it.key().size()); + const auto value = std::string(it.value().data(), it.value().size()); + Cursor cursor(value); + ProvenanceSegment segment; + segment.source_idx = static_cast( + rocks::KeyCodec::decode_be32(std::string_view(key).substr(7, 4))); + segment.source_checkpoint = static_cast( + rocks::KeyCodec::decode_be32(std::string_view(key).substr(11, 4))); + segment.output_line_start = static_cast(cursor.u32()); + segment.output_line_end = static_cast(cursor.u32()); + segment.event_count = static_cast(cursor.u32()); + results.push_back(std::move(segment)); + }); + return results; } -std::string ProvenanceDatabase::query_info(std::string_view key) const { - return queries::query_provenance_info(db_, key); +std::string ProvenanceDatabase::query_info(int file_info_id, + std::string_view key) const { + std::string value; + auto status = db_->get(info_key(file_info_id, key), &value, "provenance"); + if (status.IsNotFound()) { + return {}; + } + if (!status.ok()) { + throw_db_error("Failed to query provenance info", status); + } + return value; } -std::string ProvenanceDatabase::query_group_name() const { - return queries::query_provenance_group_name(db_); +std::string ProvenanceDatabase::query_group_name(int file_info_id) const { + std::string result; + const 
auto prefix = group_prefix(file_info_id); + scan_prefix(*db_, prefix, [&](::rocksdb::Iterator& it) { + if (result.empty()) { + const auto key = std::string(it.key().data(), it.key().size()); + result = key.substr(prefix.size()); + } + }); + return result; } -std::string ProvenanceDatabase::query_group_predicate() const { - return queries::query_provenance_group_predicate(db_); +std::string ProvenanceDatabase::query_group_predicate(int file_info_id) const { + std::string result; + scan_prefix(*db_, group_prefix(file_info_id), [&](::rocksdb::Iterator& it) { + if (result.empty()) { + result = std::string(it.value().data(), it.value().size()); + } + }); + return result; } } // namespace dftracer::utils::utilities::indexer diff --git a/src/dftracer/utils/utilities/indexer/visitors/bloom_visitor.cpp b/src/dftracer/utils/utilities/indexer/visitors/bloom_visitor.cpp index e01d511e..931d8188 100644 --- a/src/dftracer/utils/utilities/indexer/visitors/bloom_visitor.cpp +++ b/src/dftracer/utils/utilities/indexer/visitors/bloom_visitor.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include @@ -14,9 +13,6 @@ using dftracer::utils::utilities::common::json::JsonValue; using dftracer::utils::utilities::composites::dft::DFTracerEvent; using dftracer::utils::utilities::composites::dft::indexing::BloomFilter; -namespace queries = - dftracer::utils::utilities::composites::dft::indexing::queries; - namespace dftracer::utils::utilities::indexer { namespace { @@ -187,18 +183,12 @@ void BloomVisitor::on_line(std::string_view line, std::size_t checkpoint_idx) { } void BloomVisitor::finalize(IndexDatabase& db, int file_id) { - auto& sql_db = db.sql_db(); - std::unordered_map file_blooms; for (const auto& dim : dimensions_) { file_blooms.emplace(dim, BloomFilter(config_.expected_entries_per_chunk, config_.false_positive_rate)); } - auto bloom_stmt = queries::prepare_insert_chunk_bloom_filter(sql_db); - auto dim_stats_stmt = 
queries::prepare_insert_chunk_dimension_stats(sql_db); - auto hash_stmt = queries::prepare_insert_hash_resolution(sql_db); - std::vector blob; for (std::size_t i = 0; i < chunks_.size(); ++i) { @@ -211,27 +201,24 @@ void BloomVisitor::finalize(IndexDatabase& db, int file_id) { const BloomFilter& bf = it->second; bf.serialize_into(blob); - queries::insert_chunk_bloom_filter( - bloom_stmt, file_id, checkpoint_idx, dim, blob.data(), - static_cast(blob.size()), + db.insert_chunk_bloom_filter( + file_id, checkpoint_idx, dim, + std::span(blob.data(), blob.size()), static_cast(bf.num_entries())); file_blooms.at(dim).merge_from(bf); } - queries::insert_chunk_statistics(sql_db, file_id, checkpoint_idx, - chunk.statistics); + db.insert_chunk_statistics(file_id, checkpoint_idx, chunk.statistics); for (const auto& [dim, ds] : chunk.dimension_stats) { - queries::insert_chunk_dimension_stats(dim_stats_stmt, file_id, - checkpoint_idx, ds, - config_.value_counts_cap); + db.insert_chunk_dimension_stats(file_id, checkpoint_idx, ds, + config_.value_counts_cap); } for (const auto& [dim, resolutions] : chunk.hash_resolutions) { for (const auto& [hash_val, resolved] : resolutions) { - queries::insert_hash_resolution(hash_stmt, file_id, dim, - hash_val, resolved); + db.insert_hash_resolution(file_id, dim, hash_val, resolved); } } } @@ -239,13 +226,14 @@ void BloomVisitor::finalize(IndexDatabase& db, int file_id) { for (const auto& dim : dimensions_) { const BloomFilter& bf = file_blooms.at(dim); bf.serialize_into(blob); - queries::insert_file_bloom_filter( - sql_db, file_id, dim, blob.data(), static_cast(blob.size()), + db.insert_file_bloom_filter( + file_id, dim, + std::span(blob.data(), blob.size()), static_cast(bf.num_entries())); } for (const auto& dim : dimensions_) { - queries::insert_index_dimension(sql_db, file_id, dim); + db.insert_index_dimension(file_id, dim); } } diff --git a/src/dftracer/utils/utilities/indexer/visitors/manifest_visitor.cpp 
b/src/dftracer/utils/utilities/indexer/visitors/manifest_visitor.cpp index 137c3651..ec388b51 100644 --- a/src/dftracer/utils/utilities/indexer/visitors/manifest_visitor.cpp +++ b/src/dftracer/utils/utilities/indexer/visitors/manifest_visitor.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -58,38 +57,15 @@ void ManifestVisitor::on_line(std::string_view line, } void ManifestVisitor::finalize(IndexDatabase& db, int file_id) { - using dftracer::utils::sqlite::SqliteStmt; - auto* raw = db.db(); - for (std::size_t ci = 0; ci < event_lines_.size(); ++ci) { for (auto& [key, lines] : event_lines_[ci]) { - auto packed = queries::pack_line_numbers(lines); - SqliteStmt stmt(raw, - "INSERT INTO checkpoint_event_ranges" - "(checkpoint_idx,file_info_id,cat,name," - "line_numbers,event_count)" - " VALUES(?,?,?,?,?,?);"); - stmt.bind_int64(1, static_cast(ci)); - stmt.bind_int(2, file_id); - stmt.bind_text(3, key.first); - stmt.bind_text(4, key.second); - stmt.bind_blob(5, packed.data(), static_cast(packed.size())); - stmt.bind_int64(6, static_cast(lines.size())); - sqlite3_step(stmt.get()); + db.insert_event_range(file_id, static_cast(ci), + key.first, key.second, lines); } for (auto& [meta_type, lines] : metadata_lines_[ci]) { - auto packed = queries::pack_line_numbers(lines); - SqliteStmt stmt( - raw, - "INSERT INTO checkpoint_metadata_lines" - "(checkpoint_idx,file_info_id,meta_type,line_numbers)" - " VALUES(?,?,?,?);"); - stmt.bind_int64(1, static_cast(ci)); - stmt.bind_int(2, file_id); - stmt.bind_text(3, meta_type); - stmt.bind_blob(4, packed.data(), static_cast(packed.size())); - sqlite3_step(stmt.get()); + db.insert_metadata_lines(file_id, static_cast(ci), + meta_type, lines); } } } diff --git a/src/dftracer/utils/utilities/reader/internal/gzip_reader.cpp b/src/dftracer/utils/utilities/reader/internal/gzip_reader.cpp index 9dd1f91f..86449551 100644 --- a/src/dftracer/utils/utilities/reader/internal/gzip_reader.cpp +++ 
b/src/dftracer/utils/utilities/reader/internal/gzip_reader.cpp @@ -62,18 +62,18 @@ GzipReader::GzipReader(const std::string &gz_path_, const std::string &idx_path_, std::size_t index_ckpt_size) : gz_path(gz_path_), - idx_path(idx_path_), + index_path(idx_path_), is_open(false), default_buffer_size(DEFAULT_READER_BUFFER_SIZE), indexer(nullptr) { try { indexer = dftracer::utils::utilities::indexer::internal:: - IndexerFactory::create(gz_path, idx_path, index_ckpt_size, false); + IndexerFactory::create(gz_path, index_path, index_ckpt_size, false); is_open = true; DFTRACER_UTILS_LOG_DEBUG( "Successfully created GZIP reader for gz: %s and index: %s", - gz_path.c_str(), idx_path.c_str()); + gz_path.c_str(), index_path.c_str()); } catch (const std::exception &e) { throw ReaderError(ReaderError::INITIALIZATION_ERROR, "Failed to initialize reader with indexer: " + @@ -92,19 +92,19 @@ GzipReader::GzipReader( } is_open = true; gz_path = indexer->get_archive_path(); - idx_path = indexer->get_idx_path(); + index_path = indexer->get_index_path(); } GzipReader::~GzipReader() { DFTRACER_UTILS_LOG_DEBUG("Destroying GZIP reader for gz: %s and index: %s", - gz_path.c_str(), idx_path.c_str()); + gz_path.c_str(), index_path.c_str()); reset(); is_open = false; } GzipReader::GzipReader(GzipReader &&other) noexcept : gz_path(std::move(other.gz_path)), - idx_path(std::move(other.idx_path)), + index_path(std::move(other.index_path)), is_open(other.is_open), default_buffer_size(other.default_buffer_size), indexer(std::move(other.indexer)) { @@ -114,7 +114,7 @@ GzipReader::GzipReader(GzipReader &&other) noexcept GzipReader &GzipReader::operator=(GzipReader &&other) noexcept { if (this != &other) { gz_path = std::move(other.gz_path); - idx_path = std::move(other.idx_path); + index_path = std::move(other.index_path); is_open = other.is_open; default_buffer_size = other.default_buffer_size; indexer = std::move(other.indexer); @@ -140,7 +140,7 @@ std::size_t GzipReader::get_num_lines() const { 
const std::string &GzipReader::get_archive_path() const { return gz_path; } -const std::string &GzipReader::get_idx_path() const { return idx_path; } +const std::string &GzipReader::get_index_path() const { return index_path; } void GzipReader::set_buffer_size(std::size_t size) { default_buffer_size = size; diff --git a/src/dftracer/utils/utilities/reader/internal/gzip_reader.h b/src/dftracer/utils/utilities/reader/internal/gzip_reader.h index 97abcc1a..e2be5acb 100644 --- a/src/dftracer/utils/utilities/reader/internal/gzip_reader.h +++ b/src/dftracer/utils/utilities/reader/internal/gzip_reader.h @@ -14,7 +14,7 @@ namespace dftracer::utils::utilities::reader::internal { class GzipReader : public Reader { public: - GzipReader(const std::string &gz_path, const std::string &idx_path, + GzipReader(const std::string &gz_path, const std::string &index_path, std::size_t index_ckpt_size = dftracer::utils::utilities:: indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE); explicit GzipReader( @@ -32,7 +32,7 @@ class GzipReader : public Reader { std::size_t get_max_bytes() const override; std::size_t get_num_lines() const override; const std::string &get_archive_path() const override; - const std::string &get_idx_path() const override; + const std::string &get_index_path() const override; void set_buffer_size(std::size_t size) override; coro::CoroTask read_async(std::size_t start_bytes, @@ -57,7 +57,7 @@ class GzipReader : public Reader { private: std::string gz_path; - std::string idx_path; + std::string index_path; bool is_open; std::size_t default_buffer_size; std::shared_ptr diff --git a/src/dftracer/utils/utilities/reader/internal/reader_c.cpp b/src/dftracer/utils/utilities/reader/internal/reader_c.cpp index 4cd25439..74256808 100644 --- a/src/dftracer/utils/utilities/reader/internal/reader_c.cpp +++ b/src/dftracer/utils/utilities/reader/internal/reader_c.cpp @@ -19,16 +19,18 @@ static int validate_handle(dft_reader_handle_t reader) { return reader ? 
0 : -1; } -dft_reader_handle_t dft_reader_create(const char *gz_path, const char *idx_path, +dft_reader_handle_t dft_reader_create(const char *gz_path, + const char *index_path, size_t index_ckpt_size) { - if (!gz_path || !idx_path) { + if (!gz_path || !index_path) { DFTRACER_UTILS_LOG_ERROR("%s", - "Both gz_path and idx_path cannot be null"); + "Both gz_path and index_path cannot be null"); return nullptr; } try { - auto reader = ReaderFactory::create(gz_path, idx_path, index_ckpt_size); + auto reader = + ReaderFactory::create(gz_path, index_path, index_ckpt_size); // For C API, we need to transfer ownership - create a new shared_ptr on // heap return static_cast( diff --git a/src/dftracer/utils/utilities/reader/internal/reader_factory.cpp b/src/dftracer/utils/utilities/reader/internal/reader_factory.cpp index 5280f45a..577d6b99 100644 --- a/src/dftracer/utils/utilities/reader/internal/reader_factory.cpp +++ b/src/dftracer/utils/utilities/reader/internal/reader_factory.cpp @@ -11,7 +11,7 @@ namespace dftracer::utils::utilities::reader::internal { std::shared_ptr ReaderFactory::create(const std::string &archive_path, - const std::string &idx_path, + const std::string &index_path, std::size_t index_ckpt_size) { ArchiveFormat format = FormatDetector::detect(archive_path); @@ -21,11 +21,11 @@ std::shared_ptr ReaderFactory::create(const std::string &archive_path, switch (format) { case ArchiveFormat::GZIP: - return std::make_shared(archive_path, idx_path, + return std::make_shared(archive_path, index_path, index_ckpt_size); case ArchiveFormat::TAR_GZ: - return std::make_shared(archive_path, idx_path, + return std::make_shared(archive_path, index_path, index_ckpt_size); default: diff --git a/src/dftracer/utils/utilities/reader/internal/tar_reader.cpp b/src/dftracer/utils/utilities/reader/internal/tar_reader.cpp index 8d936b3e..4a45e541 100644 --- a/src/dftracer/utils/utilities/reader/internal/tar_reader.cpp +++ 
b/src/dftracer/utils/utilities/reader/internal/tar_reader.cpp @@ -1,7 +1,7 @@ +#include #include #include #include -#include #include #include #include @@ -15,10 +15,28 @@ using namespace dftracer::utils::utilities::indexer::internal::tar; namespace dftracer::utils::utilities::reader::internal { +namespace { + +std::string normalize_idx_path(const std::string &path) { + fs::path input(path); + if (input.filename() == ".dftindex") { + return input.string(); + } + if (input.parent_path().filename() == ".dftindex") { + return input.parent_path().string(); + } + if (input.has_extension()) { + return (input.parent_path() / ".dftindex").string(); + } + return (input / ".dftindex").string(); +} + +} // namespace + TarReader::TarReader(const std::string &tar_gz_path_, const std::string &idx_path_, std::size_t index_ckpt_size) : tar_gz_path(tar_gz_path_), - idx_path(idx_path_), + index_path(normalize_idx_path(idx_path_)), is_open(false), default_buffer_size(DEFAULT_TAR_READER_BUFFER_SIZE), logical_mapping_cached(false), @@ -26,14 +44,14 @@ TarReader::TarReader(const std::string &tar_gz_path_, cached_total_logical_lines(0) { try { printf("Creating TAR reader for gz: %s and index: %s\n", - tar_gz_path.c_str(), idx_path.c_str()); - indexer = std::make_shared(tar_gz_path, idx_path, + tar_gz_path.c_str(), index_path.c_str()); + indexer = std::make_shared(tar_gz_path, index_path, index_ckpt_size, false); is_open = true; DFTRACER_UTILS_LOG_DEBUG( "Successfully created TAR reader for gz: %s and index: %s", - tar_gz_path.c_str(), idx_path.c_str()); + tar_gz_path.c_str(), index_path.c_str()); } catch (const std::exception &e) { throw std::runtime_error( "Failed to initialize TAR reader with indexer: " + @@ -52,14 +70,14 @@ TarReader::TarReader(std::shared_ptr indexer_) } is_open = true; tar_gz_path = indexer->get_tar_gz_path(); - idx_path = indexer->get_idx_path(); + index_path = indexer->get_index_path(); } TarReader::~TarReader() = default; TarReader::TarReader(TarReader &&other) 
noexcept : tar_gz_path(std::move(other.tar_gz_path)), - idx_path(std::move(other.idx_path)), + index_path(std::move(other.index_path)), is_open(other.is_open), default_buffer_size(other.default_buffer_size), indexer(std::move(other.indexer)), @@ -74,7 +92,7 @@ TarReader::TarReader(TarReader &&other) noexcept TarReader &TarReader::operator=(TarReader &&other) noexcept { if (this != &other) { tar_gz_path = std::move(other.tar_gz_path); - idx_path = std::move(other.idx_path); + index_path = std::move(other.index_path); is_open = other.is_open; default_buffer_size = other.default_buffer_size; indexer = std::move(other.indexer); @@ -104,7 +122,7 @@ std::string TarReader::get_format_name() const { return "TAR.GZ"; } const std::string &TarReader::get_archive_path() const { return tar_gz_path; } -const std::string &TarReader::get_idx_path() const { return idx_path; } +const std::string &TarReader::get_index_path() const { return index_path; } void TarReader::set_buffer_size(std::size_t size) { default_buffer_size = size; diff --git a/src/dftracer/utils/utilities/reader/internal/tar_reader.h b/src/dftracer/utils/utilities/reader/internal/tar_reader.h index 96564357..d3608a3c 100644 --- a/src/dftracer/utils/utilities/reader/internal/tar_reader.h +++ b/src/dftracer/utils/utilities/reader/internal/tar_reader.h @@ -43,7 +43,7 @@ class TarReader : public Reader { estimated_lines; // Estimated number of lines in this file }; - TarReader(const std::string &tar_gz_path, const std::string &idx_path, + TarReader(const std::string &tar_gz_path, const std::string &index_path, std::size_t index_ckpt_size = dftracer::utils::utilities:: indexer::internal::tar::TarIndexer::DEFAULT_CHECKPOINT_SIZE); explicit TarReader( @@ -63,7 +63,7 @@ class TarReader : public Reader { std::size_t get_max_bytes() const override; std::size_t get_num_lines() const override; const std::string &get_archive_path() const override; - const std::string &get_idx_path() const override; + const std::string 
&get_index_path() const override; void set_buffer_size(std::size_t size) override; coro::CoroTask read_async(std::size_t start_bytes, @@ -108,7 +108,7 @@ class TarReader : public Reader { private: std::string tar_gz_path; - std::string idx_path; + std::string index_path; bool is_open; std::size_t default_buffer_size; std::shared_ptr< diff --git a/src/dftracer/utils/utilities/reader/trace_reader.cpp b/src/dftracer/utils/utilities/reader/trace_reader.cpp index d59097db..d3ddef20 100644 --- a/src/dftracer/utils/utilities/reader/trace_reader.cpp +++ b/src/dftracer/utils/utilities/reader/trace_reader.cpp @@ -52,10 +52,12 @@ TraceReader::TraceReader(TraceReaderConfig config) } void TraceReader::probe_index() { - idx_path_ = dft_internal::determine_index_path(config_.file_path, - config_.index_dir); - has_index_ = fs::exists(idx_path_); format_ = IndexerFactory::detect_format(config_.file_path); + index_path_ = dft_internal::determine_index_path(config_.file_path, + config_.index_dir); + has_index_ = + (format_ == ArchiveFormat::GZIP || format_ == ArchiveFormat::TAR_GZ) && + fs::exists(index_path_); } bool TraceReader::has_index() const { return has_index_; } @@ -91,7 +93,7 @@ std::size_t TraceReader::get_num_lines() { } std::shared_ptr TraceReader::create_indexed_reader() { - auto indexer = IndexerFactory::create(config_.file_path, idx_path_, + auto indexer = IndexerFactory::create(config_.file_path, index_path_, config_.checkpoint_size, false); return internal::ReaderFactory::create(indexer); } @@ -136,10 +138,10 @@ coro::AsyncGenerator TraceReader::read_lines(ReadConfig config) { if (start >= max_bytes) co_return; } - if (query && !idx_path_.empty() && + if (has_index_ && query && !index_path_.empty() && range_type == internal::RangeType::BYTE_RANGE) { - ChunkPrunerInput pruner_input{idx_path_, config_.file_path, *query, - nullptr}; + ChunkPrunerInput pruner_input{index_path_, config_.file_path, + *query, nullptr}; ChunkPrunerUtility pruner; auto pruner_out = co_await 
pruner.process(pruner_input); if (pruner_out.success && !pruner_out.file_may_match) { @@ -234,11 +236,11 @@ coro::AsyncGenerator> TraceReader::read_raw( if (start >= max_bytes) co_return; } - if (!config.query.empty() && !idx_path_.empty() && + if (has_index_ && !config.query.empty() && !index_path_.empty() && range_type == internal::RangeType::BYTE_RANGE) { auto parsed = Query::from_string(config.query); if (!parsed) throw common::query::QueryParseError(parsed.error()); - ChunkPrunerInput pruner_input{idx_path_, config_.file_path, + ChunkPrunerInput pruner_input{index_path_, config_.file_path, std::move(*parsed), nullptr}; ChunkPrunerUtility pruner; auto pruner_out = co_await pruner.process(pruner_input); diff --git a/src/dftracer/utils/utilities/replay/replay.cpp b/src/dftracer/utils/utilities/replay/replay.cpp index c5ae7c33..2b34e22b 100644 --- a/src/dftracer/utils/utilities/replay/replay.cpp +++ b/src/dftracer/utils/utilities/replay/replay.cpp @@ -403,13 +403,13 @@ ReplayResult ReplayEngine::replay(const std::string& trace_file, if (is_compressed) { // Handle compressed files with ReaderFactory - std::string idx_path = + std::string index_path = index_file.empty() ? 
utilities::composites::dft::internal:: determine_index_path(trace_file, "") : index_file; auto reader = - reader::internal::ReaderFactory::create(trace_file, idx_path); + reader::internal::ReaderFactory::create(trace_file, index_path); if (!reader) { result.error_messages.push_back( diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ff7aca05..e23f12fb 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -78,6 +78,7 @@ foreach(test_file ${TEST_CPP_SOURCES}) target_link_libraries(${target_name} PRIVATE doctest::doctest testing_utilities) target_set_warnings(${target_name}) target_enable_coroutine(${target_name}) + target_add_rpath(${target_name}) # Pass CMAKE_BINARY_DIR to tests that need to execute binaries target_compile_definitions(${target_name} PRIVATE CMAKE_BINARY_DIR="${CMAKE_BINARY_DIR}") @@ -155,6 +156,7 @@ foreach (test_file ${TEST_C_SOURCES}) add_executable(${target_name} ${test_file}) target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(${target_name} PRIVATE unity_lib testing_utilities) + target_add_rpath(${target_name}) # Set output directory to preserve folder structure in binaries get_filename_component(bin_dir ${bin_exec} DIRECTORY) @@ -216,6 +218,7 @@ foreach(test_file ${TEST_BINARY_SOURCES}) target_link_libraries(${target_name} PRIVATE doctest::doctest testing_utilities) target_set_warnings(${target_name}) target_enable_coroutine(${target_name}) + target_add_rpath(${target_name}) get_filename_component(bin_dir ${bin_exec} DIRECTORY) get_filename_component(bin_name ${bin_exec} NAME) diff --git a/tests/binaries/test_dftracer_index.cpp b/tests/binaries/test_dftracer_index.cpp index 11166154..1e0a7608 100644 --- a/tests/binaries/test_dftracer_index.cpp +++ b/tests/binaries/test_dftracer_index.cpp @@ -1,5 +1,6 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include +#include #include #include #include @@ -16,6 +17,14 @@ namespace { +void set_test_library_path(const std::string& 
binary) { + const fs::path build_root = fs::path(binary).parent_path().parent_path(); + const std::string lib_path = + (build_root / "lib").string() + ":" + + (build_root / "_deps" / "rocksdb-build").string(); + ::setenv("LD_LIBRARY_PATH", lib_path.c_str(), 1); +} + std::string create_pfw_gz(dft_utils_test::TestEnvironment& env, int num_events, int id) { auto trace_gz = env.create_dft_test_gzip_file(num_events); @@ -46,6 +55,7 @@ int run_index(const std::string& binary, const std::vector& args) { pid_t pid = ::fork(); if (pid < 0) return -1; if (pid == 0) { + set_test_library_path(binary); std::vector argv; argv.push_back(binary.c_str()); for (const auto& arg : args) argv.push_back(arg.c_str()); @@ -59,21 +69,6 @@ int run_index(const std::string& binary, const std::vector& args) { return -1; } -// Scan a directory for any file whose name ends with the given suffix. -bool has_file_with_suffix(const std::string& dir, const std::string& suffix) { - if (!fs::exists(dir) || !fs::is_directory(dir)) return false; - for (const auto& entry : fs::directory_iterator(dir)) { - if (!entry.is_regular_file()) continue; - const auto name = entry.path().filename().string(); - if (name.size() >= suffix.size() && - name.compare(name.size() - suffix.size(), suffix.size(), suffix) == - 0) { - return true; - } - } - return false; -} - } // namespace // ============================================================================ @@ -105,12 +100,11 @@ TEST_SUITE("DFTracerIndex") { auto f = create_pfw_gz(env, 100, 0); REQUIRE(!f.empty()); - // Path convention: file.pfw.gz -> file.pfw.gz.idx (same directory). int rc = run_index(binary, {"-d", env.get_dir(), "--force"}); CHECK(rc == 0); - // The .idx sidecar must appear next to the input file. 
- CHECK(fs::exists(f + ".idx")); + CHECK(fs::exists(dftracer::utils::utilities::composites::dft::internal:: + determine_index_path(f, ""))); } TEST_CASE("build index with custom index-dir") { @@ -134,8 +128,8 @@ TEST_SUITE("DFTracerIndex") { binary, {"-d", env.get_dir(), "--force", "--index-dir", idx_dir}); CHECK(rc == 0); - // A .idx file must appear somewhere inside idx_dir. - CHECK(has_file_with_suffix(idx_dir, ".idx")); + CHECK(fs::exists(dftracer::utils::utilities::composites::dft::internal:: + determine_index_path(f, idx_dir))); } TEST_CASE("build with manifest creates idx") { @@ -155,8 +149,8 @@ TEST_SUITE("DFTracerIndex") { run_index(binary, {"-d", env.get_dir(), "--force", "--manifest"}); CHECK(rc == 0); - // The sidecar must be created. - CHECK(fs::exists(f + ".idx")); + CHECK(fs::exists(dftracer::utils::utilities::composites::dft::internal:: + determine_index_path(f, ""))); } TEST_CASE("force rebuild runs twice without error") { @@ -174,11 +168,13 @@ TEST_SUITE("DFTracerIndex") { int rc1 = run_index(binary, {"-d", env.get_dir(), "--force"}); CHECK(rc1 == 0); - REQUIRE(fs::exists(f + ".idx")); + REQUIRE(fs::exists(dftracer::utils::utilities::composites::dft:: + internal::determine_index_path(f, ""))); // Second run with --force must overwrite successfully. 
int rc2 = run_index(binary, {"-d", env.get_dir(), "--force"}); CHECK(rc2 == 0); - CHECK(fs::exists(f + ".idx")); + CHECK(fs::exists(dftracer::utils::utilities::composites::dft::internal:: + determine_index_path(f, ""))); } } diff --git a/tests/binaries/test_dftracer_info.cpp b/tests/binaries/test_dftracer_info.cpp index 9b5679e4..669b8531 100644 --- a/tests/binaries/test_dftracer_info.cpp +++ b/tests/binaries/test_dftracer_info.cpp @@ -16,6 +16,14 @@ namespace { +void set_test_library_path(const std::string& binary) { + const fs::path build_root = fs::path(binary).parent_path().parent_path(); + const std::string lib_path = + (build_root / "lib").string() + ":" + + (build_root / "_deps" / "rocksdb-build").string(); + ::setenv("LD_LIBRARY_PATH", lib_path.c_str(), 1); +} + std::string create_pfw_gz(dft_utils_test::TestEnvironment& env, int num_events, int id) { auto trace_gz = env.create_dft_test_gzip_file(num_events); @@ -46,6 +54,7 @@ int run_info(const std::string& binary, const std::vector& args) { pid_t pid = ::fork(); if (pid < 0) return -1; if (pid == 0) { + set_test_library_path(binary); std::vector argv; argv.push_back(binary.c_str()); for (const auto& arg : args) argv.push_back(arg.c_str()); @@ -72,6 +81,7 @@ std::string run_info_capture(const std::string& binary, return ""; } if (pid == 0) { + set_test_library_path(binary); ::close(pipefd[0]); ::dup2(pipefd[1], STDOUT_FILENO); ::dup2(pipefd[1], STDERR_FILENO); diff --git a/tests/binaries/test_dftracer_organize.cpp b/tests/binaries/test_dftracer_organize.cpp index ef645928..e45afdf6 100644 --- a/tests/binaries/test_dftracer_organize.cpp +++ b/tests/binaries/test_dftracer_organize.cpp @@ -17,6 +17,14 @@ namespace { +void set_test_library_path(const std::string& binary) { + const fs::path build_root = fs::path(binary).parent_path().parent_path(); + const std::string lib_path = + (build_root / "lib").string() + ":" + + (build_root / "_deps" / "rocksdb-build").string(); + ::setenv("LD_LIBRARY_PATH", 
lib_path.c_str(), 1); +} + std::string create_pfw_gz(dft_utils_test::TestEnvironment& env, int num_events, int id) { auto trace_gz = env.create_dft_test_gzip_file(num_events); @@ -63,6 +71,7 @@ int run_binary(const std::string& binary, pid_t pid = ::fork(); if (pid < 0) return -1; if (pid == 0) { + set_test_library_path(binary); std::vector argv; argv.push_back(binary.c_str()); for (const auto& arg : args) argv.push_back(arg.c_str()); @@ -120,6 +129,16 @@ bool any_file_with_suffix(const std::string& dir, const std::string& suffix) { return false; } +bool any_dir_named(const std::string& dir, const std::string& name) { + if (!fs::exists(dir)) return false; + for (const auto& entry : fs::recursive_directory_iterator(dir)) { + if (entry.is_directory() && entry.path().filename() == name) { + return true; + } + } + return false; +} + } // namespace // ============================================================================ @@ -175,7 +194,7 @@ TEST_SUITE("DFTracerOrganize") { CHECK(has_output); } - TEST_CASE("organize creates midx sidecar") { + TEST_CASE("organize creates .dftindex store") { auto binary = find_organize_binary(); if (binary.empty()) { MESSAGE("dftracer_organize binary not found, skipping."); @@ -195,8 +214,7 @@ TEST_SUITE("DFTracerOrganize") { "--groups", R"(io:cat == "POSIX")"}); CHECK(rc == 0); - // The organizer builds .pidx sidecars in the output directory. - CHECK(any_file_with_suffix(out_dir, ".pidx")); + CHECK(any_dir_named(out_dir, ".dftindex")); } TEST_CASE("reconstruct from organized") { @@ -225,7 +243,7 @@ TEST_SUITE("DFTracerOrganize") { "--groups", R"(io:cat == "POSIX")"}); REQUIRE(rc_org == 0); - // Reconstruct needs the .pidx sidecars in the organized dir. 
+ REQUIRE(any_dir_named(org_dir, ".dftindex")); int rc_rec = run_binary( rec_binary, {"-d", org_dir, "-o", rec_dir, "--no-compress"}); CHECK(rc_rec == 0); diff --git a/tests/binaries/test_dftracer_server.cpp b/tests/binaries/test_dftracer_server.cpp index 16bdca1a..292f4097 100644 --- a/tests/binaries/test_dftracer_server.cpp +++ b/tests/binaries/test_dftracer_server.cpp @@ -81,6 +81,24 @@ bool port_is_listening(int port, int timeout_ms = 100) { return result == 0; } +bool can_bind_local_tcp_socket() { + int sock = ::socket(AF_INET, SOCK_STREAM, 0); + if (sock < 0) return false; + + int opt = 1; + ::setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); + + struct sockaddr_in addr{}; + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + addr.sin_port = htons(0); + + const int rc = + ::bind(sock, reinterpret_cast(&addr), sizeof(addr)); + ::close(sock); + return rc == 0; +} + /// Wait until port is listening or timeout expires. bool wait_for_port(int port, int timeout_s = 10) { auto deadline = @@ -188,15 +206,6 @@ std::string extract_body(const std::string& response) { /// Pick a random port in the ephemeral range. int pick_port() { return 10000 + (::getpid() % 50000); } -bool tcp_sockets_available() { - int sock = ::socket(AF_INET, SOCK_STREAM, 0); - if (sock >= 0) { - ::close(sock); - return true; - } - return false; -} - /// RAII server process manager. 
struct ServerProcess { pid_t pid = -1; @@ -264,8 +273,8 @@ TEST_CASE("DFTracer Server - start and respond to endpoints") { MESSAGE("dftracer_server binary not found, skipping."); return; } - if (!tcp_sockets_available()) { - MESSAGE("TCP sockets are unavailable in this environment, skipping."); + if (!can_bind_local_tcp_socket()) { + MESSAGE("local TCP bind is unavailable in this environment, skipping."); return; } @@ -558,8 +567,8 @@ TEST_CASE("DFTracer Server - graceful shutdown via SIGTERM") { MESSAGE("dftracer_server binary not found, skipping."); return; } - if (!tcp_sockets_available()) { - MESSAGE("TCP sockets are unavailable in this environment, skipping."); + if (!can_bind_local_tcp_socket()) { + MESSAGE("local TCP bind is unavailable in this environment, skipping."); return; } diff --git a/tests/binaries/test_dftracer_tar.cpp b/tests/binaries/test_dftracer_tar.cpp index 19fc1ed6..5782fa24 100644 --- a/tests/binaries/test_dftracer_tar.cpp +++ b/tests/binaries/test_dftracer_tar.cpp @@ -1,5 +1,6 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include +#include #include #include #include @@ -15,6 +16,14 @@ namespace { +void set_test_library_path(const std::string& binary) { + const fs::path build_root = fs::path(binary).parent_path().parent_path(); + const std::string lib_path = + (build_root / "lib").string() + ":" + + (build_root / "_deps" / "rocksdb-build").string(); + ::setenv("LD_LIBRARY_PATH", lib_path.c_str(), 1); +} + std::string find_tar_binary() { const char* env_path = std::getenv("DFTRACER_TAR_PATH"); if (env_path != nullptr && ::access(env_path, X_OK) == 0) return env_path; @@ -34,6 +43,7 @@ int run_tar(const std::string& binary, const std::vector& args) { pid_t pid = ::fork(); if (pid < 0) return -1; if (pid == 0) { + set_test_library_path(binary); std::vector argv; argv.push_back(binary.c_str()); for (const auto& arg : args) argv.push_back(arg.c_str()); @@ -60,6 +70,7 @@ std::string run_tar_capture(const std::string& binary, return ""; } if (pid 
== 0) { + set_test_library_path(binary); ::close(pipefd[0]); ::dup2(pipefd[1], STDOUT_FILENO); ::dup2(pipefd[1], STDERR_FILENO); @@ -172,9 +183,9 @@ TEST_SUITE("DFTracerTar") { int rc = run_tar(binary, {tar_gz, "--build-only"}); CHECK(rc == 0); - // The indexer creates a .idx.tar sidecar alongside the archive. - std::string sidecar = tar_gz + ".idx.tar"; - CHECK(fs::exists(sidecar)); + std::string db_root = dftracer::utils::utilities::composites::dft:: + internal::determine_index_path(tar_gz, ""); + CHECK(fs::exists(db_root)); } TEST_CASE("force rebuild") { @@ -196,12 +207,13 @@ TEST_SUITE("DFTracerTar") { int rc1 = run_tar(binary, {tar_gz, "--build-only"}); REQUIRE(rc1 == 0); - std::string sidecar = tar_gz + ".idx.tar"; - REQUIRE(fs::exists(sidecar)); + std::string db_root = dftracer::utils::utilities::composites::dft:: + internal::determine_index_path(tar_gz, ""); + REQUIRE(fs::exists(db_root)); - // Force rebuild must also succeed and leave the sidecar intact. + // Force rebuild must also succeed and leave the DB root intact. 
int rc2 = run_tar(binary, {tar_gz, "--build-only", "--force-rebuild"}); CHECK(rc2 == 0); - CHECK(fs::exists(sidecar)); + CHECK(fs::exists(db_root)); } } diff --git a/tests/python/common.py b/tests/python/common.py index a00f1bc6..82f4aa15 100644 --- a/tests/python/common.py +++ b/tests/python/common.py @@ -3,6 +3,7 @@ Common test utilities for Python bindings tests """ +import gc import gzip import os import shutil @@ -13,6 +14,12 @@ import dftracer.utils as dft_utils +def determine_index_path(file_path: str, index_dir: str = "") -> str: + if index_dir: + return os.path.join(index_dir, ".dftindex") + return os.path.join(os.path.dirname(file_path), ".dftindex") + + class Environment: """Shared test environment manager for tests""" @@ -34,13 +41,14 @@ def __exit__(self, exc_type, exc_val, exc_tb): def cleanup(self): """Clean up temporary files and directory""" + gc.collect() for file_path in self.test_files: try: if os.path.exists(file_path): os.remove(file_path) - idx_path = file_path + ".idx" - if os.path.exists(idx_path): - os.remove(idx_path) + index_path = determine_index_path(file_path, "") + if os.path.isdir(index_path): + shutil.rmtree(index_path) except OSError: pass @@ -49,10 +57,12 @@ def cleanup(self): shutil.rmtree(self.temp_dir) except OSError: pass + gc.collect() def create_test_gzip_file(self, filename="test_data.pfw.gz", bytes_per_line=1024): """Create a test gzip file with valid DFTracer trace events""" file_path = os.path.join(self.temp_dir, filename) + os.makedirs(os.path.dirname(file_path), exist_ok=True) io_names = ["read", "write", "open", "close", "pread", "pwrite", "fread", "fwrite"] cats = ["POSIX", "POSIX", "POSIX", "POSIX", "POSIX", "POSIX", "STDIO", "STDIO"] @@ -96,6 +106,7 @@ def create_test_gzip_file(self, filename="test_data.pfw.gz", bytes_per_line=1024 def create_dft_trace_file(self, filename="dft_trace.pfw.gz", num_events=None): """Create a gzip file with valid DFTracer trace events.""" file_path = os.path.join(self.temp_dir, 
filename) + os.makedirs(os.path.dirname(file_path), exist_ok=True) n = num_events if num_events is not None else self.lines io_names = ["read", "write", "open", "close", "pread", "pwrite", "fread", "fwrite"] cats = ["POSIX", "POSIX", "POSIX", "POSIX", "POSIX", "POSIX", "STDIO", "STDIO"] @@ -198,25 +209,24 @@ def create_test_gzip_file_with_nested_json(self): return file_path def get_index_path(self, gz_file_path): - """Get the index file path for a gzip file""" - return gz_file_path + ".idx" + """Get the `.dftindex` path for a gzip file.""" + return determine_index_path(gz_file_path, "") def build_index(self, gz_file_path, checkpoint_size_bytes=None): """Build index for the gzip file using Python indexer""" if checkpoint_size_bytes is None: checkpoint_size_bytes = 32 * 1024 * 1024 # 32MB default - idx_file = self.get_index_path(gz_file_path) + index_path = self.get_index_path(gz_file_path) try: - # Use the indexer API - indexer = dft_utils.Indexer(gz_file_path, idx_file, checkpoint_size_bytes) - if indexer.need_rebuild(): - indexer.build() + with dft_utils.Indexer(gz_file_path, index_path, checkpoint_size_bytes) as indexer: + if indexer.need_rebuild(): + indexer.build() - if not os.path.exists(idx_file): - pytest.skip("Index file was not created") - return idx_file + if not os.path.exists(index_path): + pytest.skip("Index store was not created") + return index_path except Exception as e: pytest.skip(f"Failed to build index: {e}") diff --git a/tests/python/test_dask.py b/tests/python/test_dask.py index 6ce64544..4ebd6a9f 100644 --- a/tests/python/test_dask.py +++ b/tests/python/test_dask.py @@ -38,21 +38,21 @@ def test_parallel_indexer_creation(self): # Create multiple test files gz_files = [] for i in range(3): - gz_file = env.create_test_gzip_file(f"test_{i}.pfw.gz", bytes_per_line=512) + gz_file = env.create_test_gzip_file(f"file_{i}/test_{i}.pfw.gz", bytes_per_line=512) gz_files.append(gz_file) def create_and_build_indexer(gz_file): """Helper function to create 
and build an indexer""" try: - indexer = dft_utils.Indexer(gz_file, checkpoint_size=256 * 1024) - if indexer.need_rebuild(): - indexer.build() - return { - "file": gz_file, - "max_bytes": indexer.get_max_bytes(), - "num_lines": indexer.get_num_lines(), - "success": True, - } + with dft_utils.Indexer(gz_file, checkpoint_size=256 * 1024) as indexer: + if indexer.need_rebuild(): + indexer.build() + return { + "file": gz_file, + "max_bytes": indexer.get_max_bytes(), + "num_lines": indexer.get_num_lines(), + "success": True, + } except Exception as e: return {"file": gz_file, "error": str(e), "success": False} @@ -69,9 +69,9 @@ def create_and_build_indexer(gz_file): assert result["max_bytes"] > 0 assert result["num_lines"] > 0 - # Verify index file was created - idx_file = result["file"] + ".idx" - assert os.path.exists(idx_file) + # Verify index store was created + index_path = env.get_index_path(result["file"]) + assert os.path.exists(index_path) def test_parallel_reader_operations(self): """Test parallel reading operations with all reader types including JSON""" @@ -83,16 +83,19 @@ def read_chunk(gz_file_path, start_bytes, end_bytes, reader_type): """Helper function to read a chunk - creates its own indexer for thread safety""" try: # Each task creates its own indexer instance to avoid sharing - reader = dft_utils.TraceReader(gz_file_path) - - if reader_type == "bytes": - data = b"".join(reader.read_raw(start_byte=start_bytes, end_byte=end_bytes)) - elif reader_type == "line_bytes": - data = reader.read_lines(start_byte=start_bytes, end_byte=end_bytes) - elif reader_type == "json_bytes": - data = reader.read_lines_json(start_byte=start_bytes, end_byte=end_bytes) - else: - raise ValueError(f"Unknown reader type: {reader_type}") + with dft_utils.TraceReader(gz_file_path) as reader: + if reader_type == "bytes": + data = b"".join( + reader.read_raw(start_byte=start_bytes, end_byte=end_bytes) + ) + elif reader_type == "line_bytes": + data = 
reader.read_lines(start_byte=start_bytes, end_byte=end_bytes) + elif reader_type == "json_bytes": + data = reader.read_lines_json( + start_byte=start_bytes, end_byte=end_bytes + ) + else: + raise ValueError(f"Unknown reader type: {reader_type}") return { "type": reader_type, @@ -104,8 +107,8 @@ def read_chunk(gz_file_path, start_bytes, end_bytes, reader_type): return {"type": reader_type, "error": str(e), "success": False} # Get file info from a temporary indexer - temp_indexer = dft_utils.Indexer(gz_file, checkpoint_size=512 * 1024) - max_bytes = temp_indexer.get_max_bytes() + with dft_utils.Indexer(gz_file, checkpoint_size=512 * 1024) as temp_indexer: + max_bytes = temp_indexer.get_max_bytes() chunk_size = max_bytes // 4 # Create tasks for all reader types @@ -159,10 +162,10 @@ def test_dask_dataframe_integration(self): def extract_json_data(gz_file_path, start_bytes, end_bytes): """Extract JSON data and convert to DataFrame-friendly format""" try: - reader = dft_utils.TraceReader(gz_file_path) - json_objects = reader.read_lines_json( - start_byte=start_bytes, end_byte=end_bytes - ) + with dft_utils.TraceReader(gz_file_path) as reader: + json_objects = reader.read_lines_json( + start_byte=start_bytes, end_byte=end_bytes + ) # Convert to list of dictionaries suitable for DataFrame records = [] @@ -181,8 +184,8 @@ def extract_json_data(gz_file_path, start_bytes, end_bytes): return [] # Get file info and create chunks - temp_indexer = dft_utils.Indexer(gz_file, checkpoint_size=512 * 1024) - max_bytes = temp_indexer.get_max_bytes() + with dft_utils.Indexer(gz_file, checkpoint_size=512 * 1024) as temp_indexer: + max_bytes = temp_indexer.get_max_bytes() chunk_size = max_bytes // 4 # Create delayed tasks to extract data from each chunk @@ -226,8 +229,8 @@ def test_multiple_batch_sizes_no_duplication(self): gz_file = env.create_test_gzip_file(bytes_per_line=512) env.build_index(gz_file, checkpoint_size_bytes=256 * 1024) - temp_indexer = dft_utils.Indexer(gz_file, 
checkpoint_size=256 * 1024) - max_bytes = temp_indexer.get_max_bytes() + with dft_utils.Indexer(gz_file, checkpoint_size=256 * 1024) as temp_indexer: + max_bytes = temp_indexer.get_max_bytes() # Test various batch sizes including boundary-critical ones batch_sizes = [ @@ -246,8 +249,8 @@ def generate_batches(filename, max_bytes, batch_size): def process_batch(batch_info): """Process one batch and return processed records""" filename, start, end = batch_info - reader = dft_utils.TraceReader(filename) - json_lines = reader.read_lines_json(start_byte=start, end_byte=end) + with dft_utils.TraceReader(filename) as reader: + json_lines = reader.read_lines_json(start_byte=start, end_byte=end) processed_records = [] for json_obj in json_lines: @@ -265,8 +268,8 @@ def process_batch(batch_info): return processed_records # Get reference data (full file read) and verify against environment - full_reader = dft_utils.TraceReader(gz_file) - reference_data = full_reader.read_lines_json(start_byte=0, end_byte=max_bytes) + with dft_utils.TraceReader(gz_file) as full_reader: + reference_data = full_reader.read_lines_json(start_byte=0, end_byte=max_bytes) reference_names = sorted( [obj["name"] for obj in reference_data if obj and "name" in obj] ) @@ -363,14 +366,14 @@ def test_boundary_edge_cases(self): gz_file = env.create_test_gzip_file(bytes_per_line=512) env.build_index(gz_file, checkpoint_size_bytes=256 * 1024) - temp_indexer = dft_utils.Indexer(gz_file, checkpoint_size=256 * 1024) - max_bytes = temp_indexer.get_max_bytes() + with dft_utils.Indexer(gz_file, checkpoint_size=256 * 1024) as temp_indexer: + max_bytes = temp_indexer.get_max_bytes() def process_batch(batch_info): """Process one batch and return processed records""" filename, start, end = batch_info - reader = dft_utils.TraceReader(filename) - json_lines = reader.read_lines_json(start_byte=start, end_byte=end) + with dft_utils.TraceReader(filename) as reader: + json_lines = reader.read_lines_json(start_byte=start, 
end_byte=end) processed_records = [] for json_obj in json_lines: @@ -385,8 +388,8 @@ def process_batch(batch_info): return processed_records # Get reference data and verify against environment - full_reader = dft_utils.TraceReader(gz_file) - reference_data = full_reader.read_lines_json(start_byte=0, end_byte=max_bytes) + with dft_utils.TraceReader(gz_file) as full_reader: + reference_data = full_reader.read_lines_json(start_byte=0, end_byte=max_bytes) expected_count = len([obj for obj in reference_data if obj and "name" in obj]) assert expected_count == env.lines, ( diff --git a/tests/python/test_indexer.py b/tests/python/test_indexer.py index c5d438ed..8079c833 100644 --- a/tests/python/test_indexer.py +++ b/tests/python/test_indexer.py @@ -19,12 +19,12 @@ def test_indexer_creation(self): """Test indexer creation""" with Environment() as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) # Test basic creation using context manager - with dft_utils.Indexer(gz_file, idx_file) as indexer: + with dft_utils.Indexer(gz_file, index_path) as indexer: assert indexer.gz_path == gz_file - assert indexer.idx_path == idx_file + assert indexer.index_path == index_path assert indexer.checkpoint_size > 0 def test_indexer_creation_with_defaults(self): @@ -35,7 +35,7 @@ def test_indexer_creation_with_defaults(self): # Test creation with defaults using context manager with dft_utils.Indexer(gz_file) as indexer: assert indexer.gz_path == gz_file - assert indexer.idx_path == gz_file + ".idx" + assert indexer.index_path == env.get_index_path(gz_file) assert indexer.checkpoint_size <= 33554432 # Should be <= 32MB default def test_indexer_custom_checkpoint_size(self): @@ -57,9 +57,9 @@ def test_indexer_build_and_rebuild(self): """Test indexer build and rebuild functionality""" with Environment() as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) - with 
dft_utils.Indexer(gz_file, idx_file) as indexer: + with dft_utils.Indexer(gz_file, index_path) as indexer: # Should need rebuild initially assert indexer.need_rebuild() @@ -67,7 +67,7 @@ def test_indexer_build_and_rebuild(self): indexer.build() # Index file should exist - assert os.path.exists(idx_file) + assert os.path.exists(index_path) # Should not need rebuild after building assert not indexer.need_rebuild() @@ -75,7 +75,7 @@ def test_indexer_build_and_rebuild(self): # Test force rebuild with a new indexer # Note: force_rebuild affects the build process, not need_rebuild() check # The need_rebuild() method checks file consistency, not force_rebuild flag - with dft_utils.Indexer(gz_file, idx_file, force_rebuild=True) as indexer_force: + with dft_utils.Indexer(gz_file, index_path, force_rebuild=True) as indexer_force: # Since the index already exists and file hasn't changed, need_rebuild should be False # But force_rebuild will cause a rebuild when build() is called assert not indexer_force.need_rebuild() @@ -205,7 +205,7 @@ def test_indexer_with_reader_creation(self): if indexer.need_rebuild(): indexer.build() - # Test creating reader after indexer builds sidecar + # Test creating reader after indexer builds the shared index store reader = dft_utils.TraceReader(gz_file) assert reader.get_max_bytes() > 0 assert reader.file_path == gz_file @@ -220,7 +220,7 @@ def test_indexer_with_reader_creation_context_manager(self): if indexer.need_rebuild(): indexer.build() - # Test creating reader after indexer builds sidecar + # Test creating reader after indexer builds the shared index store reader = dft_utils.TraceReader(gz_file) assert reader.get_max_bytes() > 0 @@ -234,7 +234,7 @@ def test_multiple_readers_same_indexer(self): if indexer.need_rebuild(): indexer.build() - # Create multiple readers (all use same sidecar) + # Create multiple readers (all use the same shared index store) readers = [] for i in range(3): reader = dft_utils.TraceReader(gz_file) @@ -247,6 +247,44 
@@ def test_multiple_readers_same_indexer(self): assert reader.get_max_bytes() == max_bytes +class TestIndexerLifetime: + """Python wrapper lifetime should not own the shared index store.""" + + def test_indexer_close_releases_wrapper_not_index_store(self): + """close() should release the Python handle without deleting .dftindex.""" + with Environment() as env: + gz_file = env.create_test_gzip_file() + index_path = env.get_index_path(gz_file) + + indexer = dft_utils.Indexer(gz_file, index_path) + assert indexer.need_rebuild() + indexer.build() + assert os.path.exists(index_path) + + indexer.close() + assert os.path.exists(index_path) + + with dft_utils.Indexer(gz_file, index_path) as reopened: + assert not reopened.need_rebuild() + assert reopened.get_num_lines() > 0 + + def test_indexer_context_exit_keeps_shared_index_store(self): + """Context exit should not tear down the shared index store.""" + with Environment() as env: + gz_file = env.create_test_gzip_file() + index_path = env.get_index_path(gz_file) + + with dft_utils.Indexer(gz_file, index_path) as indexer: + if indexer.need_rebuild(): + indexer.build() + assert indexer.get_num_lines() > 0 + + assert os.path.exists(index_path) + + reader = dft_utils.TraceReader(gz_file) + assert reader.get_num_lines() > 0 + + class TestIndexerUnified: """Test unified IndexBuilder features via Python Indexer""" @@ -254,9 +292,9 @@ def test_indexer_build_bloom(self): """Test building with bloom=True""" with Environment() as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_bloom=True, index_threshold=0 + gz_file, index_path, build_bloom=True, index_threshold=0 ) as indexer: indexer.build() assert indexer.has_bloom @@ -265,9 +303,9 @@ def test_indexer_build_manifest(self): """Test building with manifest=True""" with Environment() as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + 
index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_manifest=True, index_threshold=0 + gz_file, index_path, build_manifest=True, index_threshold=0 ) as indexer: indexer.build() assert indexer.has_manifest @@ -276,10 +314,10 @@ def test_indexer_build_bloom_and_manifest(self): """Test building with both bloom and manifest""" with Environment() as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( gz_file, - idx_file, + index_path, build_bloom=True, build_manifest=True, index_threshold=0, @@ -292,8 +330,8 @@ def test_indexer_no_bloom_by_default(self): """Test that bloom is not built when build_bloom is omitted""" with Environment() as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" - with dft_utils.Indexer(gz_file, idx_file, index_threshold=0) as indexer: + index_path = env.get_index_path(gz_file) + with dft_utils.Indexer(gz_file, index_path, index_threshold=0) as indexer: indexer.build() assert not indexer.has_bloom @@ -301,8 +339,8 @@ def test_indexer_no_manifest_by_default(self): """Test that manifest is not built when build_manifest is omitted""" with Environment() as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" - with dft_utils.Indexer(gz_file, idx_file, index_threshold=0) as indexer: + index_path = env.get_index_path(gz_file) + with dft_utils.Indexer(gz_file, index_path, index_threshold=0) as indexer: indexer.build() assert not indexer.has_manifest @@ -310,9 +348,9 @@ def test_indexer_has_bloom_is_bool(self): """Test that has_bloom returns a bool""" with Environment() as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_bloom=True, index_threshold=0 + gz_file, index_path, build_bloom=True, index_threshold=0 ) as indexer: indexer.build() assert 
isinstance(indexer.has_bloom, bool) @@ -321,9 +359,9 @@ def test_indexer_has_manifest_is_bool(self): """Test that has_manifest returns a bool""" with Environment() as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_manifest=True, index_threshold=0 + gz_file, index_path, build_manifest=True, index_threshold=0 ) as indexer: indexer.build() assert isinstance(indexer.has_manifest, bool) @@ -332,11 +370,11 @@ def test_indexer_custom_index_threshold(self): """Test that index_threshold is accepted without error""" with Environment() as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) # A very large threshold skips bloom for small files with dft_utils.Indexer( gz_file, - idx_file, + index_path, build_bloom=True, index_threshold=1024 * 1024 * 1024, ) as indexer: @@ -345,30 +383,29 @@ def test_indexer_custom_index_threshold(self): assert not indexer.has_bloom def test_indexer_bloom_persists_across_instances(self): - """Bloom data written to the sidecar is visible from a new Indexer""" + """Bloom data written to the index store is visible from a new Indexer""" with Environment() as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_bloom=True, index_threshold=0 + gz_file, index_path, build_bloom=True, index_threshold=0 ) as indexer: indexer.build() - # Open a fresh Indexer pointing at the same sidecar - with dft_utils.Indexer(gz_file, idx_file) as indexer2: + with dft_utils.Indexer(gz_file, index_path) as indexer2: assert indexer2.has_bloom def test_indexer_manifest_persists_across_instances(self): - """Manifest data written to the sidecar is visible from a new Indexer""" + """Manifest data written to the index store is visible from a new Indexer""" with Environment() as env: 
gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_manifest=True, index_threshold=0 + gz_file, index_path, build_manifest=True, index_threshold=0 ) as indexer: indexer.build() - with dft_utils.Indexer(gz_file, idx_file) as indexer2: + with dft_utils.Indexer(gz_file, index_path) as indexer2: assert indexer2.has_manifest @@ -379,9 +416,12 @@ def test_threshold_skips_bloom_for_small_file(self): """Explicit large threshold should skip bloom for small files""" with Environment(lines=5) as env: gz_file = env.create_test_gzip_file(bytes_per_line=128) - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_bloom=True, index_threshold=10 * 1024 * 1024 + gz_file, + index_path, + build_bloom=True, + index_threshold=10 * 1024 * 1024, ) as indexer: indexer.build() assert not indexer.has_bloom @@ -390,9 +430,12 @@ def test_threshold_skips_manifest_for_small_file(self): """Explicit large threshold should skip manifest for small files""" with Environment(lines=5) as env: gz_file = env.create_test_gzip_file(bytes_per_line=128) - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_manifest=True, index_threshold=10 * 1024 * 1024 + gz_file, + index_path, + build_manifest=True, + index_threshold=10 * 1024 * 1024, ) as indexer: indexer.build() assert not indexer.has_manifest @@ -401,10 +444,10 @@ def test_threshold_skips_bloom_and_manifest_for_small_file(self): """Explicit large threshold should skip bloom and manifest for small files""" with Environment(lines=5) as env: gz_file = env.create_test_gzip_file(bytes_per_line=128) - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( gz_file, - idx_file, + index_path, build_bloom=True, build_manifest=True, index_threshold=10 * 1024 * 1024, @@ -417,10 
+460,10 @@ def test_explicit_large_threshold_skips_bloom(self): """Explicit large threshold should skip bloom for small files""" with Environment() as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( gz_file, - idx_file, + index_path, build_bloom=True, index_threshold=1024 * 1024 * 1024, ) as indexer: @@ -431,9 +474,9 @@ def test_zero_threshold_forces_bloom(self): """index_threshold=0 disables threshold, bloom should be built""" with Environment() as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_bloom=True, index_threshold=0 + gz_file, index_path, build_bloom=True, index_threshold=0 ) as indexer: indexer.build() assert indexer.has_bloom @@ -442,9 +485,9 @@ def test_zero_threshold_forces_manifest(self): """index_threshold=0 disables threshold, manifest should be built""" with Environment() as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_manifest=True, index_threshold=0 + gz_file, index_path, build_manifest=True, index_threshold=0 ) as indexer: indexer.build() assert indexer.has_manifest diff --git a/tests/python/test_reorganization_planner.py b/tests/python/test_reorganization_planner.py index 9ea8ee36..b5518fe6 100644 --- a/tests/python/test_reorganization_planner.py +++ b/tests/python/test_reorganization_planner.py @@ -17,10 +17,10 @@ class TestReorganizationPlannerUtility: def test_plan_returns_dict(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( gz_file, - idx_file, + index_path, build_bloom=True, build_manifest=True, index_threshold=0, @@ -37,10 +37,10 @@ def test_plan_returns_dict(self): def 
test_call_delegates_to_process(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( gz_file, - idx_file, + index_path, build_bloom=True, build_manifest=True, index_threshold=0, @@ -60,10 +60,10 @@ def test_plan_succeeds_without_manifest(self): """Without manifest the planner streams the file and succeeds.""" with Environment(lines=5) as env: gz_file = env.create_test_gzip_file(bytes_per_line=128) - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( gz_file, - idx_file, + index_path, build_bloom=True, build_manifest=True, index_threshold=_SKIP_INDEX_THRESHOLD, @@ -81,10 +81,10 @@ def test_plan_has_tasks_without_manifest(self): """Whole-file fallback produces extraction tasks.""" with Environment(lines=5) as env: gz_file = env.create_test_gzip_file(bytes_per_line=128) - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( gz_file, - idx_file, + index_path, build_bloom=True, build_manifest=True, index_threshold=_SKIP_INDEX_THRESHOLD, diff --git a/tests/python/test_statistics_aggregator.py b/tests/python/test_statistics_aggregator.py index 555c40ec..3be995d5 100644 --- a/tests/python/test_statistics_aggregator.py +++ b/tests/python/test_statistics_aggregator.py @@ -17,9 +17,9 @@ class TestStatisticsAggregatorUtility: def test_compute_returns_dict(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_bloom=True, index_threshold=0 + gz_file, index_path, build_bloom=True, index_threshold=0 ) as indexer: indexer.build() result = StatisticsAggregatorUtility().process(gz_file) @@ -30,9 +30,9 @@ def test_compute_returns_dict(self): def test_compute_correct_event_count(self): with Environment(lines=30) as env: gz_file = 
env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_bloom=True, index_threshold=0 + gz_file, index_path, build_bloom=True, index_threshold=0 ) as indexer: indexer.build() result = StatisticsAggregatorUtility().process(gz_file) @@ -42,9 +42,9 @@ def test_compute_correct_event_count(self): def test_compute_has_statistics_fields(self): with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_bloom=True, index_threshold=0 + gz_file, index_path, build_bloom=True, index_threshold=0 ) as indexer: indexer.build() result = StatisticsAggregatorUtility().process(gz_file) @@ -57,9 +57,9 @@ def test_compute_has_statistics_fields(self): def test_call_delegates_to_process(self): with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_bloom=True, index_threshold=0 + gz_file, index_path, build_bloom=True, index_threshold=0 ) as indexer: indexer.build() util = StatisticsAggregatorUtility() @@ -75,10 +75,10 @@ def test_returns_dict_without_bloom(self): """Without bloom data the aggregator streams the file and succeeds.""" with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( gz_file, - idx_file, + index_path, build_bloom=True, index_threshold=_SKIP_INDEX_THRESHOLD, ) as indexer: @@ -94,10 +94,10 @@ def test_correct_event_count_without_bloom(self): """Sequential fallback produces the same event count as indexed path.""" with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( gz_file, - 
idx_file, + index_path, build_bloom=True, index_threshold=_SKIP_INDEX_THRESHOLD, ) as indexer: @@ -111,10 +111,10 @@ def test_has_statistics_fields_without_bloom(self): """Sequential fallback populates all statistics fields.""" with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( gz_file, - idx_file, + index_path, build_bloom=True, index_threshold=_SKIP_INDEX_THRESHOLD, ) as indexer: diff --git a/tests/python/test_statistics_query.py b/tests/python/test_statistics_query.py index a02cc9b6..fa2cb123 100644 --- a/tests/python/test_statistics_query.py +++ b/tests/python/test_statistics_query.py @@ -17,9 +17,9 @@ class TestStatisticsQueryUtility: def test_query_summary(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_bloom=True, index_threshold=0 + gz_file, index_path, build_bloom=True, index_threshold=0 ) as indexer: indexer.build() result = StatisticsQueryUtility().process(gz_file, query_type="summary") @@ -30,9 +30,9 @@ def test_query_summary(self): def test_query_categories(self): with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_bloom=True, index_threshold=0 + gz_file, index_path, build_bloom=True, index_threshold=0 ) as indexer: indexer.build() result = StatisticsQueryUtility().process(gz_file, query_type="categories") @@ -42,9 +42,9 @@ def test_query_categories(self): def test_query_names(self): with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_bloom=True, index_threshold=0 + gz_file, index_path, 
build_bloom=True, index_threshold=0 ) as indexer: indexer.build() result = StatisticsQueryUtility().process(gz_file, query_type="names") @@ -53,9 +53,9 @@ def test_query_names(self): def test_query_top_n_names(self): with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_bloom=True, index_threshold=0 + gz_file, index_path, build_bloom=True, index_threshold=0 ) as indexer: indexer.build() result = StatisticsQueryUtility().process(gz_file, query_type="top_n_names", top_n=5) @@ -65,9 +65,9 @@ def test_query_top_n_names(self): def test_query_duration_stats(self): with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_bloom=True, index_threshold=0 + gz_file, index_path, build_bloom=True, index_threshold=0 ) as indexer: indexer.build() result = StatisticsQueryUtility().process(gz_file, query_type="duration_stats") @@ -77,9 +77,9 @@ def test_query_duration_stats(self): def test_call_delegates_to_process(self): with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( - gz_file, idx_file, build_bloom=True, index_threshold=0 + gz_file, index_path, build_bloom=True, index_threshold=0 ) as indexer: indexer.build() util = StatisticsQueryUtility() @@ -95,10 +95,10 @@ def test_summary_correct_events_without_bloom(self): """Sequential fallback produces correct event count.""" with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( gz_file, - idx_file, + index_path, build_bloom=True, index_threshold=_SKIP_INDEX_THRESHOLD, ) as indexer: @@ -112,10 +112,10 @@ def 
test_categories_populated_without_bloom(self): """Sequential fallback populates categories.""" with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" + index_path = env.get_index_path(gz_file) with dft_utils.Indexer( gz_file, - idx_file, + index_path, build_bloom=True, index_threshold=_SKIP_INDEX_THRESHOLD, ) as indexer: diff --git a/tests/python/test_trace_reader.py b/tests/python/test_trace_reader.py index e86df1eb..586fbd7a 100644 --- a/tests/python/test_trace_reader.py +++ b/tests/python/test_trace_reader.py @@ -25,21 +25,21 @@ def test_creation_nonexistent_file(self): with pytest.raises(RuntimeError): reader.read_lines() - def test_has_index_false_without_sidecar(self): - """has_index is False when no .idx sidecar exists.""" + def test_has_index_false_without_index_store(self): + """has_index is False when no `.dftindex` store exists.""" with Environment() as env: gz_file = env.create_test_gzip_file() reader = dft_utils.TraceReader(gz_file) assert reader.has_index is False def test_has_index_true_after_indexer_build(self): - """has_index is True when a sidecar was built before construction.""" + """has_index is True when an index store was built before construction.""" with Environment() as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" - with dft_utils.Indexer(gz_file, idx_file) as indexer: + index_path = env.get_index_path(gz_file) + with dft_utils.Indexer(gz_file, index_path) as indexer: indexer.build() - # TraceReader probes for the sidecar at __init__ time + # TraceReader probes for the index store at __init__ time reader = dft_utils.TraceReader(gz_file) assert reader.has_index is True @@ -150,11 +150,11 @@ def test_read_lines_negative_end_raises(self): reader.read_lines(end_line=-1) def test_read_lines_with_index(self): - """read_lines() works correctly when a sidecar index is present.""" + """read_lines() works correctly when an index store is present.""" with 
Environment(lines=20) as env: gz_file = env.create_test_gzip_file() - idx_file = gz_file + ".idx" - with dft_utils.Indexer(gz_file, idx_file) as indexer: + index_path = env.get_index_path(gz_file) + with dft_utils.Indexer(gz_file, index_path) as indexer: indexer.build() reader = dft_utils.TraceReader(gz_file) assert reader.has_index @@ -169,8 +169,8 @@ def test_read_lines_indexed_matches_sequential(self): sequential = dft_utils.TraceReader(gz_file).read_lines() # Build index, then read again - idx_file = gz_file + ".idx" - with dft_utils.Indexer(gz_file, idx_file) as indexer: + index_path = env.get_index_path(gz_file) + with dft_utils.Indexer(gz_file, index_path) as indexer: indexer.build() indexed = dft_utils.TraceReader(gz_file).read_lines() diff --git a/tests/python/test_trace_reader_arrow.py b/tests/python/test_trace_reader_arrow.py index c90009d1..822554c5 100644 --- a/tests/python/test_trace_reader_arrow.py +++ b/tests/python/test_trace_reader_arrow.py @@ -14,8 +14,8 @@ def test_iter_arrow_returns_batches(self): """iter_arrow yields batch objects with __arrow_c_array__.""" with Environment(lines=50) as env: gz_file = env.create_test_gzip_file() - reader = dft_utils.TraceReader(gz_file) - batches = list(reader.iter_arrow(batch_size=100)) + with dft_utils.TraceReader(gz_file) as reader: + batches = list(reader.iter_arrow(batch_size=100)) assert len(batches) >= 1 for b in batches: assert hasattr(b, "__arrow_c_array__") @@ -26,8 +26,8 @@ def test_iter_arrow_correct_row_count(self): """Total rows across all batches equals number of JSON lines.""" with Environment(lines=50) as env: gz_file = env.create_test_gzip_file() - reader = dft_utils.TraceReader(gz_file) - batches = list(reader.iter_arrow(batch_size=20)) + with dft_utils.TraceReader(gz_file) as reader: + batches = list(reader.iter_arrow(batch_size=20)) total_rows = sum(b.num_rows for b in batches) assert total_rows == 50 @@ -35,9 +35,9 @@ def test_iter_arrow_batch_size_respected(self): """Each batch has at 
most batch_size rows.""" with Environment(lines=100) as env: gz_file = env.create_test_gzip_file() - reader = dft_utils.TraceReader(gz_file) batch_size = 30 - batches = list(reader.iter_arrow(batch_size=batch_size)) + with dft_utils.TraceReader(gz_file) as reader: + batches = list(reader.iter_arrow(batch_size=batch_size)) for b in batches: assert b.num_rows <= batch_size @@ -45,8 +45,8 @@ def test_iter_arrow_discovers_columns(self): """Arrow batches have columns matching JSON keys.""" with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() - reader = dft_utils.TraceReader(gz_file) - batches = list(reader.iter_arrow(batch_size=100)) + with dft_utils.TraceReader(gz_file) as reader: + batches = list(reader.iter_arrow(batch_size=100)) assert len(batches) == 1 b = batches[0] # Test data has: name, cat, dur, data @@ -56,10 +56,10 @@ def test_iter_arrow_clamped_range(self): """iter_arrow with out-of-range bytes clamps to actual bounds.""" with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() - reader = dft_utils.TraceReader(gz_file) # Out-of-range start_byte is clamped to max, yielding empty # (for non-indexed files, clamping may read all data) - batches = list(reader.iter_arrow(start_byte=999999999, end_byte=999999999)) + with dft_utils.TraceReader(gz_file) as reader: + batches = list(reader.iter_arrow(start_byte=999999999, end_byte=999999999)) # Just verify it doesn't crash — clamping behavior varies assert isinstance(batches, list) @@ -68,11 +68,11 @@ def test_iter_arrow_with_line_range(self): with Environment(lines=50) as env: gz_file = env.create_test_gzip_file() # Build index for line-based access - idx_file = gz_file + ".idx" - with dft_utils.Indexer(gz_file, idx_file) as indexer: + index_path = env.get_index_path(gz_file) + with dft_utils.Indexer(gz_file, index_path) as indexer: indexer.build() - reader = dft_utils.TraceReader(gz_file) - batches = list(reader.iter_arrow(start_line=10, end_line=20, batch_size=100)) + with 
dft_utils.TraceReader(gz_file) as reader: + batches = list(reader.iter_arrow(start_line=10, end_line=20, batch_size=100)) total_rows = sum(b.num_rows for b in batches) # end_line is inclusive, so lines 10..20 = 11 lines assert total_rows == 11 @@ -85,24 +85,24 @@ def test_read_arrow_returns_arrow_table(self): """read_arrow returns an ArrowTable.""" with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() - reader = dft_utils.TraceReader(gz_file) - table = reader.read_arrow(batch_size=100) + with dft_utils.TraceReader(gz_file) as reader: + table = reader.read_arrow(batch_size=100) assert isinstance(table, ArrowTable) def test_read_arrow_row_count(self): """ArrowTable has correct total row count.""" with Environment(lines=30) as env: gz_file = env.create_test_gzip_file() - reader = dft_utils.TraceReader(gz_file) - table = reader.read_arrow(batch_size=100) + with dft_utils.TraceReader(gz_file) as reader: + table = reader.read_arrow(batch_size=100) assert table.num_rows == 30 def test_read_arrow_batch_access(self): """ArrowTable provides batch access.""" with Environment(lines=50) as env: gz_file = env.create_test_gzip_file() - reader = dft_utils.TraceReader(gz_file) - table = reader.read_arrow(batch_size=20) + with dft_utils.TraceReader(gz_file) as reader: + table = reader.read_arrow(batch_size=20) assert table.num_batches >= 1 for b in table.batches(): # Batches are raw _ArrowBatchCapsule objects (not ArrowBatch wrappers) @@ -113,8 +113,8 @@ def test_read_arrow_properties(self): """ArrowTable exposes num_batches, num_rows, empty.""" with Environment(lines=20) as env: gz_file = env.create_test_gzip_file() - reader = dft_utils.TraceReader(gz_file) - table = reader.read_arrow(batch_size=100) + with dft_utils.TraceReader(gz_file) as reader: + table = reader.read_arrow(batch_size=100) assert table.num_rows == 20 assert table.num_batches >= 1 assert not table.empty @@ -127,8 +127,8 @@ def test_arrow_batch_wraps_capsule(self): """ArrowBatch wraps a capsule 
from iter_arrow.""" with Environment(lines=10) as env: gz_file = env.create_test_gzip_file() - reader = dft_utils.TraceReader(gz_file) - raw_batches = list(reader.iter_arrow(batch_size=100)) + with dft_utils.TraceReader(gz_file) as reader: + raw_batches = list(reader.iter_arrow(batch_size=100)) assert len(raw_batches) >= 1 batch = ArrowBatch(raw_batches[0]) assert hasattr(batch, "__arrow_c_array__") @@ -138,8 +138,8 @@ def test_arrow_batch_to_pandas_requires_pyarrow(self): """to_pandas raises ImportError if pyarrow is not installed.""" with Environment(lines=5) as env: gz_file = env.create_test_gzip_file() - reader = dft_utils.TraceReader(gz_file) - raw_batches = list(reader.iter_arrow(batch_size=100)) + with dft_utils.TraceReader(gz_file) as reader: + raw_batches = list(reader.iter_arrow(batch_size=100)) batch = ArrowBatch(raw_batches[0]) # This test only verifies the method exists; actual conversion # depends on pyarrow being installed diff --git a/tests/reader/test_basic_factory.cpp b/tests/reader/test_basic_factory.cpp index 18a94d02..364997c9 100644 --- a/tests/reader/test_basic_factory.cpp +++ b/tests/reader/test_basic_factory.cpp @@ -1,4 +1,5 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include #include #include #include @@ -6,6 +7,7 @@ #include "testing_utilities.h" using namespace dftracer::utils; +using namespace dftracer::utils::utilities::composites::dft::internal; using namespace dftracer::utils::utilities::indexer::internal; using namespace dftracer::utils::utilities::reader::internal; using namespace dft_utils_test; @@ -18,12 +20,13 @@ TEST_CASE("Factory Pattern - Basic GZIP functionality") { REQUIRE(!gz_file.empty()); std::string idx_file = env.get_index_path(gz_file); + std::string db_root = determine_index_path(gz_file, ""); SUBCASE("IndexerFactory creates valid indexer") { auto indexer = IndexerFactory::create(gz_file, idx_file, 1024 * 1024); REQUIRE(indexer != nullptr); CHECK(indexer->get_archive_path() == gz_file); - 
CHECK(indexer->get_idx_path() == idx_file); + CHECK(indexer->get_index_path() == db_root); } SUBCASE("ReaderFactory creates valid reader") { @@ -35,7 +38,7 @@ TEST_CASE("Factory Pattern - Basic GZIP functionality") { REQUIRE(reader != nullptr); CHECK(reader->is_valid()); CHECK(reader->get_archive_path() == gz_file); - CHECK(reader->get_idx_path() == idx_file); + CHECK(reader->get_index_path() == db_root); } SUBCASE("Reader factory from files") { @@ -59,13 +62,14 @@ TEST_CASE("Factory Pattern - Basic TAR.GZ functionality") { REQUIRE(!tar_gz_file.empty()); std::string idx_file = env.get_index_path(tar_gz_file); + std::string db_root = determine_index_path(tar_gz_file, ""); SUBCASE("IndexerFactory creates valid TAR.GZ indexer") { auto indexer = IndexerFactory::create(tar_gz_file, idx_file, 1024 * 1024); REQUIRE(indexer != nullptr); CHECK(indexer->get_archive_path() == tar_gz_file); - CHECK(indexer->get_idx_path() == idx_file); + CHECK(indexer->get_index_path() == db_root); } SUBCASE("ReaderFactory creates valid TAR.GZ reader") { @@ -78,7 +82,7 @@ TEST_CASE("Factory Pattern - Basic TAR.GZ functionality") { REQUIRE(reader != nullptr); CHECK(reader->is_valid()); CHECK(reader->get_archive_path() == tar_gz_file); - CHECK(reader->get_idx_path() == idx_file); + CHECK(reader->get_index_path() == db_root); } } diff --git a/tests/reader/test_reader.c b/tests/reader/test_reader.c index f6e7d607..79b32684 100644 --- a/tests/reader/test_reader.c +++ b/tests/reader/test_reader.c @@ -70,7 +70,7 @@ void test_indexer_invalid_parameters(void) { indexer = dft_indexer_create(NULL, "test.idx", mb_to_b(1.0), 0); TEST_ASSERT_NULL(indexer); - // Test null idx_path + // Test null index_path indexer = dft_indexer_create("test.gz", NULL, mb_to_b(1.0), 0); TEST_ASSERT_NULL(indexer); @@ -184,7 +184,7 @@ void test_reader_invalid_parameters(void) { reader = dft_reader_create(NULL, "test.idx", ckpt_size); TEST_ASSERT_NULL(reader); - // Test null idx_path + // Test null index_path reader = 
dft_reader_create("test.gz", NULL, ckpt_size); TEST_ASSERT_NULL(reader); diff --git a/tests/reader/test_reader.cpp b/tests/reader/test_reader.cpp index e4423481..cc5153ed 100644 --- a/tests/reader/test_reader.cpp +++ b/tests/reader/test_reader.cpp @@ -53,7 +53,7 @@ TEST_CASE("C++ Indexer - Basic functionality") { // Test getter methods CHECK(indexer->get_archive_path() == gz_file); - CHECK(indexer->get_idx_path() == idx_file); + CHECK(indexer->get_index_path() == idx_file); // Build index first before accessing metadata indexer->build(); @@ -119,7 +119,7 @@ TEST_CASE("C++ Reader - Basic functionality") { // Test getter methods CHECK(reader->get_archive_path() == gz_file); - CHECK(reader->get_idx_path() == idx_file); + CHECK(reader->get_index_path() == idx_file); } SUBCASE("Read byte range using streaming API") { diff --git a/tests/reader/test_reader_formats.cpp b/tests/reader/test_reader_formats.cpp index 440229ca..99703384 100644 --- a/tests/reader/test_reader_formats.cpp +++ b/tests/reader/test_reader_formats.cpp @@ -76,7 +76,8 @@ TEST_CASE_TEMPLATE("Indexer creation and destruction", FormatType, GZIPFormat, auto indexer = IndexerFactory::create( fixture.get_test_file(), fixture.get_index_file(), 1024 * 1024); REQUIRE(indexer != nullptr); - CHECK(indexer->exists()); + CHECK_FALSE(indexer->exists()); + CHECK(indexer->need_rebuild()); } SUBCASE("Invalid file path") { diff --git a/tests/reader/test_reader_stream.cpp b/tests/reader/test_reader_stream.cpp index 31af79dc..111c62fb 100644 --- a/tests/reader/test_reader_stream.cpp +++ b/tests/reader/test_reader_stream.cpp @@ -622,7 +622,7 @@ TEST_CASE("C++ Reader Streaming API - Format identification") { SUBCASE("Verify metadata access") { CHECK(reader->get_archive_path() == gz_file); - CHECK(reader->get_idx_path() == idx_file); + CHECK(reader->get_index_path() == idx_file); CHECK(reader->get_max_bytes() > 0); } } diff --git a/tests/reader/test_reader_tar_comprehensive.cpp b/tests/reader/test_reader_tar_comprehensive.cpp 
index 2629f86c..4cab7477 100644 --- a/tests/reader/test_reader_tar_comprehensive.cpp +++ b/tests/reader/test_reader_tar_comprehensive.cpp @@ -1,5 +1,6 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include +#include #include #include #include @@ -17,6 +18,7 @@ #include "testing_utilities.h" using namespace dftracer::utils; +using namespace dftracer::utils::utilities::composites::dft::internal; using namespace dftracer::utils::utilities::indexer::internal; using namespace dftracer::utils::utilities::reader::internal; using namespace dft_utils_test; @@ -29,6 +31,7 @@ TEST_CASE("TAR.GZ Indexer - Basic functionality") { REQUIRE(!tar_gz_file.empty()); std::string idx_file = env.get_index_path(tar_gz_file); + std::string db_root = determine_index_path(tar_gz_file, ""); SUBCASE("Build index") { auto indexer = @@ -55,7 +58,7 @@ TEST_CASE("TAR.GZ Indexer - Basic functionality") { // Test getter methods CHECK(indexer->get_archive_path() == tar_gz_file); - CHECK(indexer->get_idx_path() == idx_file); + CHECK(indexer->get_index_path() == db_root); // Build index first before accessing metadata indexer->build(); @@ -86,6 +89,7 @@ TEST_CASE("TAR.GZ Reader - Basic functionality") { REQUIRE(!tar_gz_file.empty()); std::string idx_file = env.get_index_path(tar_gz_file); + std::string db_root = determine_index_path(tar_gz_file, ""); // Build index first { @@ -123,7 +127,7 @@ TEST_CASE("TAR.GZ Reader - Basic functionality") { // Test getter methods CHECK(reader->get_archive_path() == tar_gz_file); - CHECK(reader->get_idx_path() == idx_file); + CHECK(reader->get_index_path() == db_root); } SUBCASE("Read byte range using streaming API") { diff --git a/tests/testing_utilities.cpp b/tests/testing_utilities.cpp index ed4a0ab2..0b4665ad 100644 --- a/tests/testing_utilities.cpp +++ b/tests/testing_utilities.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -246,7 +247,6 @@ std::string TestEnvironment::create_test_gzip_file_impl() { // Create test file in the unique 
directory std::string gz_file = test_dir + "/test_data.gz"; - std::string idx_file = test_dir + "/test_data.gz.idx"; std::string txt_file = test_dir + "/test_data.txt"; // Write test data to text file @@ -333,7 +333,8 @@ std::string TestEnvironment::create_test_tar_gzip_file_impl() { } std::string TestEnvironment::get_index_path(const std::string& gz_file) { - return gz_file + ".idx"; + return dftracer::utils::utilities::composites::dft::internal:: + determine_index_path(gz_file, ""); } std::string TestEnvironment::create_dft_test_file(int num_events) { @@ -465,10 +466,10 @@ char* test_environment_get_index_path(test_environment_handle_t env, const char* gz_file) { if (!env || !gz_file) return nullptr; auto* cpp_env = reinterpret_cast(env); - std::string idx_path = cpp_env->get_index_path(gz_file); - char* result = static_cast(malloc(idx_path.length() + 1)); + std::string index_path = cpp_env->get_index_path(gz_file); + char* result = static_cast(malloc(index_path.length() + 1)); if (result) { - strcpy(result, idx_path.c_str()); + strcpy(result, index_path.c_str()); } return result; } diff --git a/tests/testing_utilities.h b/tests/testing_utilities.h index 002589ef..35445d88 100644 --- a/tests/testing_utilities.h +++ b/tests/testing_utilities.h @@ -93,7 +93,7 @@ char** get_tar_file_list(const char* tar_path, size_t* num_files); void free_tar_file_list(char** file_list, size_t num_files); /** - * Get index path for a given gzip file + * Get the `.dftindex` path for a given gzip file * Returns allocated string - caller must free */ char* test_environment_get_index_path(test_environment_handle_t env, diff --git a/tests/utilities/CMakeLists.txt b/tests/utilities/CMakeLists.txt index bf2f6a76..1818992b 100644 --- a/tests/utilities/CMakeLists.txt +++ b/tests/utilities/CMakeLists.txt @@ -77,6 +77,8 @@ set(UTILITIES_TEST_SOURCES composites/dft/comparator/test_is_data_transfer_op.cpp # Indexer + indexer/test_rocksdb_storage.cpp + indexer/test_scan_prefix.cpp 
indexer/test_index_database.cpp indexer/test_provenance_database.cpp indexer/test_index_builder.cpp diff --git a/tests/utilities/composites/dft/indexing/test_bloom_query.cpp b/tests/utilities/composites/dft/indexing/test_bloom_query.cpp index b32e91de..c27c9b96 100644 --- a/tests/utilities/composites/dft/indexing/test_bloom_query.cpp +++ b/tests/utilities/composites/dft/indexing/test_bloom_query.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include @@ -17,9 +16,9 @@ using dftracer::utils::utilities::indexer::IndexDatabase; using dftracer::utils::utilities::indexer::internal::get_logical_path; // Helper to set up a .idx database with test data -static void populate_test_idx(const std::string& idx_path, +static void populate_test_idx(const std::string& index_path, const std::string& file_path) { - IndexDatabase idx_db(idx_path); + IndexDatabase idx_db(index_path); idx_db.init_base_schema(); idx_db.init_bloom_schema(); @@ -44,10 +43,9 @@ static void populate_test_idx(const std::string& idx_path, } auto blob = name_bloom.serialize(); - queries::insert_chunk_bloom_filter( - idx_db.sql_db(), fid, static_cast(ckpt), "name", - blob.data(), static_cast(blob.size()), - name_bloom.num_entries()); + idx_db.insert_chunk_bloom_filter( + fid, static_cast(ckpt), "name", blob.data(), + static_cast(blob.size()), name_bloom.num_entries()); // cat dimension BloomFilter cat_bloom(100, 0.01); @@ -58,10 +56,9 @@ static void populate_test_idx(const std::string& idx_path, } auto cat_blob = cat_bloom.serialize(); - queries::insert_chunk_bloom_filter( - idx_db.sql_db(), fid, static_cast(ckpt), "cat", - cat_blob.data(), static_cast(cat_blob.size()), - cat_bloom.num_entries()); + idx_db.insert_chunk_bloom_filter( + fid, static_cast(ckpt), "cat", cat_blob.data(), + static_cast(cat_blob.size()), cat_bloom.num_entries()); } // Create file-level bloom filters (merged from all chunks) @@ -72,42 +69,40 @@ static void populate_test_idx(const std::string& idx_path, 
file_name_bloom.add("close"); file_name_bloom.add("stat"); auto name_blob = file_name_bloom.serialize(); - queries::insert_file_bloom_filter( - idx_db.sql_db(), fid, "name", name_blob.data(), - static_cast(name_blob.size()), file_name_bloom.num_entries()); + idx_db.insert_file_bloom_filter(fid, "name", name_blob.data(), + static_cast(name_blob.size()), + file_name_bloom.num_entries()); BloomFilter file_cat_bloom(100, 0.01); file_cat_bloom.add("POSIX"); file_cat_bloom.add("storage"); auto cat_blob = file_cat_bloom.serialize(); - queries::insert_file_bloom_filter( - idx_db.sql_db(), fid, "cat", cat_blob.data(), - static_cast(cat_blob.size()), file_cat_bloom.num_entries()); + idx_db.insert_file_bloom_filter(fid, "cat", cat_blob.data(), + static_cast(cat_blob.size()), + file_cat_bloom.num_entries()); // Add fhash with resolution BloomFilter fhash_bloom(100, 0.01); fhash_bloom.add("abc123"); auto fhash_blob = fhash_bloom.serialize(); - queries::insert_file_bloom_filter( - idx_db.sql_db(), fid, "fhash", fhash_blob.data(), - static_cast(fhash_blob.size()), fhash_bloom.num_entries()); + idx_db.insert_file_bloom_filter(fid, "fhash", fhash_blob.data(), + static_cast(fhash_blob.size()), + fhash_bloom.num_entries()); for (int ckpt = 0; ckpt < 3; ++ckpt) { auto blob = fhash_bloom.serialize(); - queries::insert_chunk_bloom_filter( - idx_db.sql_db(), fid, static_cast(ckpt), "fhash", - blob.data(), static_cast(blob.size()), - fhash_bloom.num_entries()); + idx_db.insert_chunk_bloom_filter( + fid, static_cast(ckpt), "fhash", blob.data(), + static_cast(blob.size()), fhash_bloom.num_entries()); } // Hash resolutions - queries::insert_hash_resolution(idx_db.sql_db(), fid, "fhash", "abc123", - "./data/file.h5"); + idx_db.insert_hash_resolution(fid, "fhash", "abc123", "./data/file.h5"); // Record dimensions - queries::insert_index_dimension(idx_db.sql_db(), fid, "name"); - queries::insert_index_dimension(idx_db.sql_db(), fid, "cat"); - queries::insert_index_dimension(idx_db.sql_db(), 
fid, "fhash"); + idx_db.insert_index_dimension(fid, "name"); + idx_db.insert_index_dimension(fid, "cat"); + idx_db.insert_index_dimension(fid, "fhash"); idx_db.commit_transaction(); } @@ -118,13 +113,14 @@ TEST_SUITE("BloomQueryUtility") { dft_utils_test::make_unique_test_path("test_bloom_query").string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); BloomQueryInput input; - input.with_idx_path(idx_path).with_file_path(file_path).with_predicate( - "name", {"nonexistent_operation"}); + input.with_index_path(index_path) + .with_file_path(file_path) + .with_predicate("name", {"nonexistent_operation"}); BloomQueryUtility query; auto output = query.process(input).get(); @@ -142,13 +138,14 @@ TEST_SUITE("BloomQueryUtility") { .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); BloomQueryInput input; - input.with_idx_path(idx_path).with_file_path(file_path).with_predicate( - "name", {"read"}); + input.with_index_path(index_path) + .with_file_path(file_path) + .with_predicate("name", {"read"}); BloomQueryUtility query; auto output = query.process(input).get(); @@ -168,12 +165,12 @@ TEST_SUITE("BloomQueryUtility") { .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); BloomQueryInput input; - input.with_idx_path(idx_path) + input.with_index_path(index_path) 
.with_file_path(file_path) .with_predicate("name", {"open"}) .with_predicate("cat", {"storage"}); @@ -196,12 +193,12 @@ TEST_SUITE("BloomQueryUtility") { .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); BloomQueryInput input; - input.with_idx_path(idx_path).with_file_path(file_path); + input.with_index_path(index_path).with_file_path(file_path); BloomQueryUtility query; auto output = query.process(input).get(); @@ -218,14 +215,15 @@ TEST_SUITE("BloomQueryUtility") { .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); // Query by resolved value (not hash) BloomQueryInput input; - input.with_idx_path(idx_path).with_file_path(file_path).with_predicate( - "fhash", {"./data/file.h5"}); + input.with_index_path(index_path) + .with_file_path(file_path) + .with_predicate("fhash", {"./data/file.h5"}); BloomQueryUtility query; auto output = query.process(input).get(); @@ -244,13 +242,14 @@ TEST_SUITE("BloomQueryUtility") { .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); BloomQueryInput input; - input.with_idx_path(idx_path).with_file_path(file_path).with_predicate( - "name", {"read", "open"}); + input.with_index_path(index_path) + .with_file_path(file_path) + .with_predicate("name", {"read", "open"}); BloomQueryUtility query; auto output = query.process(input).get(); 
diff --git a/tests/utilities/composites/dft/indexing/test_chunk_indexer.cpp b/tests/utilities/composites/dft/indexing/test_chunk_indexer.cpp index 6b5b565d..fafeb5c5 100644 --- a/tests/utilities/composites/dft/indexing/test_chunk_indexer.cpp +++ b/tests/utilities/composites/dft/indexing/test_chunk_indexer.cpp @@ -70,7 +70,7 @@ TEST_SUITE("ChunkIndexerUtility") { ChunkIndexerInput input; input.with_file_path(trace_file) - .with_idx_path("") + .with_index_path("") .with_checkpoint_size(uncompressed_size) .with_checkpoint_idx(0) .with_byte_range(0, uncompressed_size) @@ -140,7 +140,7 @@ TEST_SUITE("ChunkIndexerUtility") { ChunkIndexerInput input; input.with_file_path(trace_file) - .with_idx_path("") + .with_index_path("") .with_checkpoint_size(uncompressed_size) .with_checkpoint_idx(0) .with_byte_range(0, uncompressed_size) @@ -197,7 +197,7 @@ TEST_SUITE("ChunkIndexerUtility") { ChunkIndexerInput input; input.with_file_path(gz_path) - .with_idx_path("") + .with_index_path("") .with_checkpoint_size(uncompressed_size) .with_checkpoint_idx(0) .with_byte_range(0, uncompressed_size) diff --git a/tests/utilities/composites/dft/indexing/test_chunk_pruner.cpp b/tests/utilities/composites/dft/indexing/test_chunk_pruner.cpp index 493c8652..5a606ad8 100644 --- a/tests/utilities/composites/dft/indexing/test_chunk_pruner.cpp +++ b/tests/utilities/composites/dft/indexing/test_chunk_pruner.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -14,14 +13,13 @@ using namespace dftracer::utils; using namespace dftracer::utils::utilities::composites::dft::indexing; -using namespace dftracer::utils::utilities::composites::dft::indexing::queries; using dftracer::utils::utilities::common::query::Query; using dftracer::utils::utilities::indexer::IndexDatabase; using dftracer::utils::utilities::indexer::internal::get_logical_path; -static void populate_test_idx(const std::string& idx_path, +static void populate_test_idx(const std::string& index_path, const 
std::string& file_path) { - IndexDatabase idx_db(idx_path); + IndexDatabase idx_db(index_path); idx_db.init_base_schema(); idx_db.init_bloom_schema(); @@ -37,25 +35,25 @@ static void populate_test_idx(const std::string& idx_path, cat_ds.value_type = "string"; cat_ds.observe("POSIX"); cat_ds.observe("POSIX"); - insert_chunk_dimension_stats(idx_db.sql_db(), fid, 0, cat_ds); + idx_db.insert_chunk_dimension_stats(fid, 0, cat_ds); ChunkDimensionStats name_ds; name_ds.dimension = "name"; name_ds.value_type = "string"; name_ds.observe("read"); name_ds.observe("read"); - insert_chunk_dimension_stats(idx_db.sql_db(), fid, 0, name_ds); + idx_db.insert_chunk_dimension_stats(fid, 0, name_ds); ChunkDimensionStats dur_ds; dur_ds.dimension = "dur"; dur_ds.value_type = "uint"; dur_ds.observe("100"); dur_ds.observe("200"); - insert_chunk_dimension_stats(idx_db.sql_db(), fid, 0, dur_ds); + idx_db.insert_chunk_dimension_stats(fid, 0, dur_ds); - insert_index_dimension(idx_db.sql_db(), fid, "cat"); - insert_index_dimension(idx_db.sql_db(), fid, "name"); - insert_index_dimension(idx_db.sql_db(), fid, "dur"); + idx_db.insert_index_dimension(fid, "cat"); + idx_db.insert_index_dimension(fid, "name"); + idx_db.insert_index_dimension(fid, "dur"); } // Chunk 1: STDIO writes, dur 500-600 @@ -64,20 +62,20 @@ static void populate_test_idx(const std::string& idx_path, cat_ds.dimension = "cat"; cat_ds.value_type = "string"; cat_ds.observe("STDIO"); - insert_chunk_dimension_stats(idx_db.sql_db(), fid, 1, cat_ds); + idx_db.insert_chunk_dimension_stats(fid, 1, cat_ds); ChunkDimensionStats name_ds; name_ds.dimension = "name"; name_ds.value_type = "string"; name_ds.observe("write"); - insert_chunk_dimension_stats(idx_db.sql_db(), fid, 1, name_ds); + idx_db.insert_chunk_dimension_stats(fid, 1, name_ds); ChunkDimensionStats dur_ds; dur_ds.dimension = "dur"; dur_ds.value_type = "uint"; dur_ds.observe("500"); dur_ds.observe("600"); - insert_chunk_dimension_stats(idx_db.sql_db(), fid, 1, dur_ds); + 
idx_db.insert_chunk_dimension_stats(fid, 1, dur_ds); } // Chunk 2: POSIX + MPI mixed, dur 50-1000 @@ -87,33 +85,33 @@ static void populate_test_idx(const std::string& idx_path, cat_ds.value_type = "string"; cat_ds.observe("POSIX"); cat_ds.observe("MPI"); - insert_chunk_dimension_stats(idx_db.sql_db(), fid, 2, cat_ds); + idx_db.insert_chunk_dimension_stats(fid, 2, cat_ds); ChunkDimensionStats name_ds; name_ds.dimension = "name"; name_ds.value_type = "string"; name_ds.observe("read"); name_ds.observe("send"); - insert_chunk_dimension_stats(idx_db.sql_db(), fid, 2, name_ds); + idx_db.insert_chunk_dimension_stats(fid, 2, name_ds); ChunkDimensionStats dur_ds; dur_ds.dimension = "dur"; dur_ds.value_type = "uint"; dur_ds.observe("50"); dur_ds.observe("1000"); - insert_chunk_dimension_stats(idx_db.sql_db(), fid, 2, dur_ds); + idx_db.insert_chunk_dimension_stats(fid, 2, dur_ds); } idx_db.commit_transaction(); } -static ChunkPrunerOutput run_pruner(const std::string& idx_path, +static ChunkPrunerOutput run_pruner(const std::string& index_path, const std::string& file_path, const char* query_str) { auto q = Query::from_string(query_str); REQUIRE(q.has_value()); - ChunkPrunerInput input{idx_path, file_path, std::move(*q), nullptr}; + ChunkPrunerInput input{index_path, file_path, std::move(*q), nullptr}; ChunkPrunerUtility pruner; return pruner.process(input).get(); @@ -124,11 +122,11 @@ TEST_SUITE("ChunkPrunerUtility") { std::string test_dir = dft_utils_test::make_unique_test_path("test_pruner_eq").string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); - auto out = run_pruner(idx_path, file_path, R"(cat == "POSIX")"); + auto out = run_pruner(index_path, file_path, R"(cat == "POSIX")"); CHECK(out.success); CHECK(out.total_checkpoints == 3); // 
Chunks 0 and 2 have POSIX, chunk 1 has only STDIO @@ -142,11 +140,11 @@ TEST_SUITE("ChunkPrunerUtility") { dft_utils_test::make_unique_test_path("test_pruner_eq_none") .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); - auto out = run_pruner(idx_path, file_path, R"(cat == "HDF5")"); + auto out = run_pruner(index_path, file_path, R"(cat == "HDF5")"); CHECK(out.success); CHECK(out.candidate_checkpoints.empty()); CHECK_FALSE(out.file_may_match); @@ -156,12 +154,12 @@ TEST_SUITE("ChunkPrunerUtility") { std::string test_dir = dft_utils_test::make_unique_test_path("test_pruner_in").string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); auto out = - run_pruner(idx_path, file_path, R"(cat in ["POSIX", "STDIO"])"); + run_pruner(index_path, file_path, R"(cat in ["POSIX", "STDIO"])"); CHECK(out.success); CHECK(out.candidate_checkpoints.size() == 3); } @@ -170,14 +168,14 @@ TEST_SUITE("ChunkPrunerUtility") { std::string test_dir = dft_utils_test::make_unique_test_path("test_pruner_notin").string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); // Chunk 0: only POSIX → excluded by not in ["POSIX"] // Chunk 1: only STDIO → kept // Chunk 2: POSIX + MPI → MPI not in list → kept - auto out = run_pruner(idx_path, file_path, R"(cat not in ["POSIX"])"); + auto out = run_pruner(index_path, 
file_path, R"(cat not in ["POSIX"])"); CHECK(out.success); CHECK(out.candidate_checkpoints.size() == 2); CHECK(out.candidate_checkpoints[0] == 1); @@ -188,14 +186,14 @@ TEST_SUITE("ChunkPrunerUtility") { std::string test_dir = dft_utils_test::make_unique_test_path("test_pruner_and").string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); // cat == "POSIX" → chunks 0, 2 // name == "read" → chunks 0, 2 // AND → chunks 0, 2 - auto out = run_pruner(idx_path, file_path, + auto out = run_pruner(index_path, file_path, R"(cat == "POSIX" and name == "read")"); CHECK(out.success); CHECK(out.candidate_checkpoints.size() == 2); @@ -205,14 +203,14 @@ TEST_SUITE("ChunkPrunerUtility") { std::string test_dir = dft_utils_test::make_unique_test_path("test_pruner_and2").string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); // cat == "POSIX" → chunks 0, 2 // name == "send" → chunk 2 only // AND → chunk 2 - auto out = run_pruner(idx_path, file_path, + auto out = run_pruner(index_path, file_path, R"(cat == "POSIX" and name == "send")"); CHECK(out.success); CHECK(out.candidate_checkpoints.size() == 1); @@ -223,14 +221,14 @@ TEST_SUITE("ChunkPrunerUtility") { std::string test_dir = dft_utils_test::make_unique_test_path("test_pruner_or").string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); // cat == 
"STDIO" → chunk 1 // name == "send" → chunk 2 // OR → chunks 1, 2 - auto out = run_pruner(idx_path, file_path, + auto out = run_pruner(index_path, file_path, R"(cat == "STDIO" or name == "send")"); CHECK(out.success); CHECK(out.candidate_checkpoints.size() == 2); @@ -242,13 +240,13 @@ TEST_SUITE("ChunkPrunerUtility") { std::string test_dir = dft_utils_test::make_unique_test_path("test_pruner_not").string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); // cat == "STDIO" → chunk 1 // NOT → chunks 0, 2 - auto out = run_pruner(idx_path, file_path, R"(not cat == "STDIO")"); + auto out = run_pruner(index_path, file_path, R"(not cat == "STDIO")"); CHECK(out.success); CHECK(out.candidate_checkpoints.size() == 2); CHECK(out.candidate_checkpoints[0] == 0); @@ -259,13 +257,13 @@ TEST_SUITE("ChunkPrunerUtility") { std::string test_dir = dft_utils_test::make_unique_test_path("test_pruner_range").string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); // dur > "500": chunk 0 max=200 (skip), chunk 1 max=600 (keep), // chunk 2 max=1000 (keep) - auto out = run_pruner(idx_path, file_path, R"(dur > "500")"); + auto out = run_pruner(index_path, file_path, R"(dur > "500")"); CHECK(out.success); CHECK(out.candidate_checkpoints.size() == 2); CHECK(out.candidate_checkpoints[0] == 1); @@ -276,11 +274,11 @@ TEST_SUITE("ChunkPrunerUtility") { std::string test_dir = dft_utils_test::make_unique_test_path("test_pruner_case").string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string 
index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); - auto out = run_pruner(idx_path, file_path, + auto out = run_pruner(index_path, file_path, R"(cat == "POSIX" AND name == "send")"); CHECK(out.success); CHECK(out.candidate_checkpoints.size() == 1); diff --git a/tests/utilities/composites/dft/indexing/test_manifest_index_builder.cpp b/tests/utilities/composites/dft/indexing/test_manifest_index_builder.cpp index d6354c82..3761d6a2 100644 --- a/tests/utilities/composites/dft/indexing/test_manifest_index_builder.cpp +++ b/tests/utilities/composites/dft/indexing/test_manifest_index_builder.cpp @@ -94,15 +94,15 @@ TEST_SUITE("ManifestIndexBuilder") { CHECK(result.success == true); CHECK(result.total_lines > 0); - CHECK(fs::exists(result.idx_path)); + CHECK(fs::exists(result.index_path)); - IndexDatabase idx_db(result.idx_path); + IndexDatabase idx_db(result.index_path); idx_db.init_base_schema(); idx_db.init_manifest_schema(); int fid = idx_db.get_file_info_id(get_logical_path(trace_file)); REQUIRE(fid >= 0); - auto event_ranges = queries::query_event_ranges(idx_db.sql_db(), fid); + auto event_ranges = idx_db.query_event_ranges(fid); CHECK(event_ranges.size() == 3); bool found_posix_read = false; @@ -117,7 +117,7 @@ TEST_SUITE("ManifestIndexBuilder") { } CHECK(found_posix_read); - auto metadata = queries::query_metadata_lines(idx_db.sql_db(), fid); + auto metadata = idx_db.query_metadata_lines(fid); CHECK(metadata.size() == 2); fs::remove_all(test_dir); diff --git a/tests/utilities/composites/dft/indexing/test_manifest_indexer.cpp b/tests/utilities/composites/dft/indexing/test_manifest_indexer.cpp index a2d01bc3..84d6e68a 100644 --- a/tests/utilities/composites/dft/indexing/test_manifest_indexer.cpp +++ b/tests/utilities/composites/dft/indexing/test_manifest_indexer.cpp @@ -68,7 +68,7 @@ TEST_SUITE("ManifestIndexer") { ChunkIndexerInput input; 
input.with_file_path(trace_file) - .with_idx_path("") + .with_index_path("") .with_checkpoint_size(uncompressed_size) .with_checkpoint_idx(0) .with_byte_range(0, uncompressed_size) @@ -133,7 +133,7 @@ TEST_SUITE("ManifestIndexer") { ChunkIndexerInput input; input.with_file_path(trace_file) - .with_idx_path("") + .with_index_path("") .with_checkpoint_size(uncompressed_size) .with_checkpoint_idx(0) .with_byte_range(0, uncompressed_size) @@ -191,7 +191,7 @@ TEST_SUITE("ManifestIndexer") { ChunkIndexerInput input; input.with_file_path(trace_file) - .with_idx_path("") + .with_index_path("") .with_checkpoint_size(uncompressed_size) .with_checkpoint_idx(0) .with_byte_range(0, uncompressed_size) diff --git a/tests/utilities/composites/dft/indexing/test_manifest_queries.cpp b/tests/utilities/composites/dft/indexing/test_manifest_queries.cpp index e7ac88f3..b7450e18 100644 --- a/tests/utilities/composites/dft/indexing/test_manifest_queries.cpp +++ b/tests/utilities/composites/dft/indexing/test_manifest_queries.cpp @@ -10,7 +10,8 @@ #include "testing_utilities.h" using namespace dftracer::utils; -using namespace dftracer::utils::utilities::composites::dft::indexing; +namespace queries = + dftracer::utils::utilities::composites::dft::indexing::queries; using dftracer::utils::utilities::indexer::IndexDatabase; using dftracer::utils::utilities::indexer::internal::get_logical_path; @@ -29,9 +30,9 @@ TEST_SUITE("ManifestQueries") { dft_utils_test::make_unique_test_path("test_manifest_queries") .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; - IndexDatabase idx_db(idx_path); + IndexDatabase idx_db(index_path); idx_db.init_base_schema(); idx_db.init_manifest_schema(); int fid = @@ -39,26 +40,20 @@ TEST_SUITE("ManifestQueries") { idx_db.begin_transaction(); - queries::insert_event_range(idx_db.sql_db(), fid, 0, "POSIX", "read", - {0, 2, 5}); - 
queries::insert_event_range(idx_db.sql_db(), fid, 0, "POSIX", "write", - {1}); - queries::insert_event_range(idx_db.sql_db(), fid, 0, "APP", "compute", - {3, 4}); - queries::insert_event_range(idx_db.sql_db(), fid, 1, "POSIX", "read", - {0, 1}); + idx_db.insert_event_range(fid, 0, "POSIX", "read", {0, 2, 5}); + idx_db.insert_event_range(fid, 0, "POSIX", "write", {1}); + idx_db.insert_event_range(fid, 0, "APP", "compute", {3, 4}); + idx_db.insert_event_range(fid, 1, "POSIX", "read", {0, 1}); idx_db.commit_transaction(); - auto all = queries::query_event_ranges(idx_db.sql_db(), fid); + auto all = idx_db.query_event_ranges(fid); CHECK(all.size() == 4); - auto ckpt0 = - queries::query_event_ranges_for_checkpoint(idx_db.sql_db(), fid, 0); + auto ckpt0 = idx_db.query_event_ranges_for_checkpoint(fid, 0); CHECK(ckpt0.size() == 3); - auto ckpt1 = - queries::query_event_ranges_for_checkpoint(idx_db.sql_db(), fid, 1); + auto ckpt1 = idx_db.query_event_ranges_for_checkpoint(fid, 1); CHECK(ckpt1.size() == 1); CHECK(ckpt1[0].cat == "POSIX"); CHECK(ckpt1[0].name == "read"); @@ -73,9 +68,9 @@ TEST_SUITE("ManifestQueries") { dft_utils_test::make_unique_test_path("test_manifest_meta_q") .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; - IndexDatabase idx_db(idx_path); + IndexDatabase idx_db(index_path); idx_db.init_base_schema(); idx_db.init_manifest_schema(); int fid = @@ -83,21 +78,19 @@ TEST_SUITE("ManifestQueries") { idx_db.begin_transaction(); - queries::insert_metadata_lines(idx_db.sql_db(), fid, 0, "HH", {0, 3}); - queries::insert_metadata_lines(idx_db.sql_db(), fid, 0, "FH", {1}); - queries::insert_metadata_lines(idx_db.sql_db(), fid, 1, "HH", {0}); + idx_db.insert_metadata_lines(fid, 0, "HH", {0, 3}); + idx_db.insert_metadata_lines(fid, 0, "FH", {1}); + idx_db.insert_metadata_lines(fid, 1, "HH", {0}); idx_db.commit_transaction(); - auto all = 
queries::query_metadata_lines(idx_db.sql_db(), fid); + auto all = idx_db.query_metadata_lines(fid); CHECK(all.size() == 3); - auto ckpt0 = queries::query_metadata_lines_for_checkpoint( - idx_db.sql_db(), fid, 0); + auto ckpt0 = idx_db.query_metadata_lines_for_checkpoint(fid, 0); CHECK(ckpt0.size() == 2); - auto ckpt1 = queries::query_metadata_lines_for_checkpoint( - idx_db.sql_db(), fid, 1); + auto ckpt1 = idx_db.query_metadata_lines_for_checkpoint(fid, 1); CHECK(ckpt1.size() == 1); fs::remove_all(test_dir); @@ -108,28 +101,27 @@ TEST_SUITE("ManifestQueries") { dft_utils_test::make_unique_test_path("test_manifest_delete") .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; - IndexDatabase idx_db(idx_path); + IndexDatabase idx_db(index_path); idx_db.init_base_schema(); idx_db.init_manifest_schema(); int fid = idx_db.get_or_create_file_info(get_logical_path("test.pfw.gz"), 0); idx_db.begin_transaction(); - queries::insert_event_range(idx_db.sql_db(), fid, 0, "POSIX", "read", - {0, 1}); - queries::insert_metadata_lines(idx_db.sql_db(), fid, 0, "HH", {2}); + idx_db.insert_event_range(fid, 0, "POSIX", "read", {0, 1}); + idx_db.insert_metadata_lines(fid, 0, "HH", {2}); idx_db.commit_transaction(); - CHECK(queries::query_event_ranges(idx_db.sql_db(), fid).size() == 1); - CHECK(queries::query_metadata_lines(idx_db.sql_db(), fid).size() == 1); + CHECK(idx_db.query_event_ranges(fid).size() == 1); + CHECK(idx_db.query_metadata_lines(fid).size() == 1); - queries::delete_event_ranges(idx_db.sql_db(), fid); - CHECK(queries::query_event_ranges(idx_db.sql_db(), fid).empty()); + idx_db.delete_event_ranges(fid); + CHECK(idx_db.query_event_ranges(fid).empty()); - queries::delete_metadata_lines(idx_db.sql_db(), fid); - CHECK(queries::query_metadata_lines(idx_db.sql_db(), fid).empty()); + idx_db.delete_metadata_lines(fid); + CHECK(idx_db.query_metadata_lines(fid).empty()); 
fs::remove_all(test_dir); } diff --git a/tests/utilities/composites/dft/reorganize/test_reconstruct_integration.cpp b/tests/utilities/composites/dft/reorganize/test_reconstruct_integration.cpp index b740ee0c..f4efdc68 100644 --- a/tests/utilities/composites/dft/reorganize/test_reconstruct_integration.cpp +++ b/tests/utilities/composites/dft/reorganize/test_reconstruct_integration.cpp @@ -167,10 +167,10 @@ static void execute_extraction(const ExtractionPlan& plan, } } - std::string idx_path = + std::string index_path = internal::determine_index_path(src.file_path, index_dir); auto reader_input = - IndexedReadInput::from_file(src.file_path).with_index(idx_path); + IndexedReadInput::from_file(src.file_path).with_index(index_path); IndexedFileReaderUtility reader_utility; auto reader = reader_utility.process(reader_input).get(); @@ -243,6 +243,59 @@ static const SegmentInterval* find_segment( return nullptr; } +static void write_group_provenance( + const ExtractionPlan& plan, + const std::map& group_gz_paths, + const std::string& reorg_dir) { + for (const auto& g : plan.groups) { + auto gz_it = group_gz_paths.find(g.name); + if (gz_it == group_gz_paths.end()) continue; + const std::string& gz_path = gz_it->second; + + std::string db_root = + internal::determine_provenance_index_path(gz_path, reorg_dir); + + ProvenanceDatabase pdb(db_root); + pdb.init_schema(); + int fid = pdb.get_or_create_file_info(gz_path, 0); + REQUIRE(fid >= 0); + + pdb.begin_transaction(); + + pdb.insert_info(fid, "version", "1.0"); + pdb.insert_info(fid, "tool", "dftracer_organize"); + pdb.insert_group(fid, g.name, g.query); + + for (std::size_t si = 0; si < plan.source_files.size(); ++si) { + const auto& src = plan.source_files[si]; + pdb.insert_source(fid, static_cast(si), src.file_path, + static_cast(src.num_checkpoints), ""); + } + + std::map> + segment_events; + for (const auto& task : plan.tasks) { + if (task.target_group == g.name) { + 
segment_events[task.source_file_idx][task.checkpoint_idx] = + task.line_numbers.size(); + } + } + + int output_line = 0; + for (const auto& [src_idx, ckpts] : segment_events) { + for (const auto& [ckpt, count] : ckpts) { + pdb.insert_segment(fid, static_cast(src_idx), + static_cast(ckpt), output_line, + output_line + static_cast(count), + static_cast(count)); + output_line += static_cast(count); + } + } + + pdb.commit_transaction(); + } +} + TEST_SUITE("ReconstructIntegration") { TEST_CASE("Round-trip: reorganize then reconstruct") { std::string test_dir = @@ -310,54 +363,8 @@ TEST_SUITE("ReconstructIntegration") { build_idx(gz_path, reorg_dir); } - // Step 6: Write provenance into each output .pidx - for (const auto& g : plan.groups) { - auto gz_it = group_gz_paths.find(g.name); - if (gz_it == group_gz_paths.end()) continue; - const std::string& gz_path = gz_it->second; - - std::string pidx_path = - internal::determine_provenance_index_path(gz_path, reorg_dir); - - ProvenanceDatabase pdb(pidx_path); - pdb.init_schema(); - int fid = pdb.get_or_create_file_info(gz_path, 0); - REQUIRE(fid >= 0); - - pdb.begin_transaction(); - - pdb.insert_info("version", "1.0"); - pdb.insert_info("tool", "dftracer_organize"); - pdb.insert_group(g.name, g.query); - - for (std::size_t si = 0; si < plan.source_files.size(); ++si) { - const auto& src = plan.source_files[si]; - pdb.insert_source(fid, static_cast(si), src.file_path, - static_cast(src.num_checkpoints), ""); - } - - std::map> - segment_events; - for (const auto& task : plan.tasks) { - if (task.target_group == g.name) { - segment_events[task.source_file_idx][task.checkpoint_idx] = - task.line_numbers.size(); - } - } - - int output_line = 0; - for (const auto& [src_idx, ckpts] : segment_events) { - for (const auto& [ckpt, count] : ckpts) { - pdb.insert_segment(static_cast(src_idx), - static_cast(ckpt), output_line, - output_line + static_cast(count), - static_cast(count)); - output_line += static_cast(count); - } - } - - 
pdb.commit_transaction(); - } + // Step 6: Write provenance into the shared output-root .dftindex + write_group_provenance(plan, group_gz_paths, reorg_dir); // Step 7: Plan reconstruction std::vector reorg_files; @@ -403,18 +410,18 @@ TEST_SUITE("ReconstructIntegration") { } for (const auto& [reorg_file, intervals] : per_reorg_segments) { - std::string idx_path = + std::string index_path = internal::determine_index_path(reorg_file, reorg_dir); MetadataCollectorUtility meta_collector; auto meta_input = MetadataCollectorUtilityInput::from_file(reorg_file) - .with_index(idx_path); + .with_index(index_path); auto meta = meta_collector.process(meta_input).get(); REQUIRE(meta.success); auto reader_input = - IndexedReadInput::from_file(reorg_file).with_index(idx_path); + IndexedReadInput::from_file(reorg_file).with_index(index_path); IndexedFileReaderUtility reader_utility; auto reader = reader_utility.process(reader_input).get(); @@ -496,4 +503,97 @@ TEST_SUITE("ReconstructIntegration") { fs::remove_all(test_dir); } + + TEST_CASE( + "reconstruction planner reads multiple outputs from one shared " + ".dftindex") { + std::string test_dir = + dft_utils_test::make_unique_test_path("test_recon_shared_root") + .string(); + std::string input_dir = test_dir + "/input"; + std::string reorg_dir = test_dir + "/reorg"; + fs::create_directories(input_dir); + fs::create_directories(reorg_dir); + + std::string trace_file = create_test_trace(input_dir); + build_idx(trace_file, input_dir); + + ReorganizationPlannerUtility planner; + ReorganizationPlannerInput planner_input; + planner_input.source_files = {trace_file}; + planner_input.groups = {{"io", R"(cat == "POSIX")"}, + {"compute", R"(cat == "APP")"}}; + planner_input.index_dir = input_dir; + + auto plan = planner.process(planner_input).get(); + REQUIRE(plan.tasks.size() > 0); + + std::map group_files; + std::map group_pfw_paths; + for (const auto& g : plan.groups) { + std::string pfw_path = reorg_dir + "/" + g.name + ".pfw"; + FILE* 
f = std::fopen(pfw_path.c_str(), "w"); + REQUIRE(f != nullptr); + group_files[g.name] = f; + group_pfw_paths[g.name] = pfw_path; + } + + execute_extraction(plan, input_dir, group_files); + + for (auto& [_, f] : group_files) { + std::fclose(f); + } + + std::map group_gz_paths; + for (const auto& g : plan.groups) { + std::string pfw_path = group_pfw_paths[g.name]; + if (!fs::exists(pfw_path) || fs::file_size(pfw_path) == 0) continue; + + FileCompressorUtility compressor; + auto comp_result = + compressor + .process(FileCompressionUtilityInput::from_file(pfw_path)) + .get(); + REQUIRE(comp_result.success); + + std::string gz_path = pfw_path + ".gz"; + REQUIRE(fs::exists(gz_path)); + group_gz_paths[g.name] = gz_path; + fs::remove(pfw_path); + } + + for (const auto& [_, gz_path] : group_gz_paths) { + build_idx(gz_path, reorg_dir); + } + + write_group_provenance(plan, group_gz_paths, reorg_dir); + + const std::string shared_root = + determine_provenance_index_path(trace_file, reorg_dir); + REQUIRE(fs::exists(shared_root)); + + ProvenanceDatabase pdb(shared_root); + const int io_fid = pdb.get_file_info_id(group_gz_paths.at("io")); + const int compute_fid = + pdb.get_file_info_id(group_gz_paths.at("compute")); + REQUIRE(io_fid >= 0); + REQUIRE(compute_fid >= 0); + CHECK(io_fid != compute_fid); + CHECK(pdb.query_group_name(io_fid) == "io"); + CHECK(pdb.query_group_name(compute_fid) == "compute"); + + ReconstructionPlannerUtility recon_planner; + ReconstructionPlannerInput recon_input; + for (const auto& [_, gz_path] : group_gz_paths) { + recon_input.reorganized_files.push_back(gz_path); + } + recon_input.index_dir = reorg_dir; + + auto recon_plan = recon_planner.process(recon_input).get(); + REQUIRE(recon_plan.files.size() == 1); + CHECK(recon_plan.total_segments >= 2); + CHECK(recon_plan.total_events == 8); + + fs::remove_all(test_dir); + } } diff --git a/tests/utilities/composites/dft/reorganize/test_reconstruction_planner.cpp 
b/tests/utilities/composites/dft/reorganize/test_reconstruction_planner.cpp index e5e42a23..d0470119 100644 --- a/tests/utilities/composites/dft/reorganize/test_reconstruction_planner.cpp +++ b/tests/utilities/composites/dft/reorganize/test_reconstruction_planner.cpp @@ -43,28 +43,29 @@ TEST_SUITE("ReconstructionPlanner") { } // Create .pidx sidecar with provenance - std::string pidx_path = determine_provenance_index_path(reorg_file, ""); + std::string provenance_path = + determine_provenance_index_path(reorg_file, ""); { - ProvenanceDatabase pdb(pidx_path); + ProvenanceDatabase pdb(provenance_path); pdb.init_schema(); int fid = pdb.get_or_create_file_info(reorg_file, 0); pdb.begin_transaction(); // Provenance info - pdb.insert_info("version", "1.0"); - pdb.insert_info("tool", "dftracer_organize"); + pdb.insert_info(fid, "version", "1.0"); + pdb.insert_info(fid, "tool", "dftracer_organize"); // Provenance group - pdb.insert_group("io", "cat=POSIX"); + pdb.insert_group(fid, "io", "cat=POSIX"); // Provenance source pdb.insert_source(fid, 0, "/original/trace.pfw.gz", 3, "abc123"); // Provenance segments (3 checkpoints) - pdb.insert_segment(0, 0, 0, 100, 100); - pdb.insert_segment(0, 1, 100, 250, 150); - pdb.insert_segment(0, 2, 250, 400, 150); + pdb.insert_segment(fid, 0, 0, 0, 100, 100); + pdb.insert_segment(fid, 0, 1, 100, 250, 150); + pdb.insert_segment(fid, 0, 2, 250, 400, 150); pdb.commit_transaction(); } @@ -127,42 +128,42 @@ TEST_SUITE("ReconstructionPlanner") { // Create .pidx for io.pfw.gz { - std::string pidx_path = + std::string provenance_path = determine_provenance_index_path(io_file, ""); - ProvenanceDatabase pdb(pidx_path); + ProvenanceDatabase pdb(provenance_path); pdb.init_schema(); int fid = pdb.get_or_create_file_info(io_file, 0); pdb.begin_transaction(); - pdb.insert_info("version", "1.0"); - pdb.insert_info("tool", "dftracer_organize"); - pdb.insert_group("io", "cat=POSIX"); + pdb.insert_info(fid, "version", "1.0"); + pdb.insert_info(fid, "tool", 
"dftracer_organize"); + pdb.insert_group(fid, "io", "cat=POSIX"); pdb.insert_source(fid, 0, "/original/trace.pfw.gz", 2, "hash1"); // Segments for checkpoints 0 and 1 - pdb.insert_segment(0, 0, 0, 50, 50); - pdb.insert_segment(0, 1, 50, 120, 70); + pdb.insert_segment(fid, 0, 0, 0, 50, 50); + pdb.insert_segment(fid, 0, 1, 50, 120, 70); pdb.commit_transaction(); } // Create .pidx for compute.pfw.gz { - std::string pidx_path = + std::string provenance_path = determine_provenance_index_path(compute_file, ""); - ProvenanceDatabase pdb(pidx_path); + ProvenanceDatabase pdb(provenance_path); pdb.init_schema(); int fid = pdb.get_or_create_file_info(compute_file, 0); pdb.begin_transaction(); - pdb.insert_info("version", "1.0"); - pdb.insert_info("tool", "dftracer_organize"); - pdb.insert_group("compute", "cat=APP"); + pdb.insert_info(fid, "version", "1.0"); + pdb.insert_info(fid, "tool", "dftracer_organize"); + pdb.insert_group(fid, "compute", "cat=APP"); pdb.insert_source(fid, 0, "/original/trace.pfw.gz", 2, "hash1"); // Segments for checkpoints 0 and 1 - pdb.insert_segment(0, 0, 0, 30, 30); - pdb.insert_segment(0, 1, 30, 80, 50); + pdb.insert_segment(fid, 0, 0, 0, 30, 30); + pdb.insert_segment(fid, 0, 1, 30, 80, 50); pdb.commit_transaction(); } @@ -207,9 +208,10 @@ TEST_SUITE("ReconstructionPlanner") { } // Create .pidx with NO provenance tables - std::string pidx_path = determine_provenance_index_path(reorg_file, ""); + std::string provenance_path = + determine_provenance_index_path(reorg_file, ""); { - ProvenanceDatabase pdb(pidx_path); + ProvenanceDatabase pdb(provenance_path); pdb.init_schema(); pdb.get_or_create_file_info(reorg_file, 0); // No provenance data inserted diff --git a/tests/utilities/composites/dft/reorganize/test_reorganization_planner.cpp b/tests/utilities/composites/dft/reorganize/test_reorganization_planner.cpp index dfbd7082..d566ffa6 100644 --- a/tests/utilities/composites/dft/reorganize/test_reorganization_planner.cpp +++ 
b/tests/utilities/composites/dft/reorganize/test_reorganization_planner.cpp @@ -265,26 +265,26 @@ TEST_SUITE("ReorganizationPlanner") { std::string test_dir = dft_utils_test::make_unique_test_path("test_planner_prov").string(); fs::create_directories(test_dir); - std::string pidx_path = test_dir + "/test_prov.pfw.gz.pidx"; + std::string provenance_path = test_dir + "/test_prov.pfw.gz.pidx"; - ProvenanceDatabase pdb(pidx_path); + ProvenanceDatabase pdb(provenance_path); pdb.init_schema(); int fid = pdb.get_or_create_file_info("test.pfw.gz", 0); pdb.begin_transaction(); - pdb.insert_info("version", "1.0"); - pdb.insert_info("created_at", "2026-02-17"); + pdb.insert_info(fid, "version", "1.0"); + pdb.insert_info(fid, "created_at", "2026-02-17"); pdb.insert_source(fid, 0, "/data/trace.pfw.gz", 9, "abc123"); - pdb.insert_group("io", R"(cat == "POSIX")"); - pdb.insert_segment(0, 0, 0, 100, 50); - pdb.insert_segment(0, 1, 100, 200, 45); + pdb.insert_group(fid, "io", R"(cat == "POSIX")"); + pdb.insert_segment(fid, 0, 0, 0, 100, 50); + pdb.insert_segment(fid, 0, 1, 100, 200, 45); pdb.commit_transaction(); - CHECK(pdb.query_info("version") == "1.0"); - CHECK(pdb.query_info("created_at") == "2026-02-17"); - CHECK(pdb.query_info("nonexistent").empty()); + CHECK(pdb.query_info(fid, "version") == "1.0"); + CHECK(pdb.query_info(fid, "created_at") == "2026-02-17"); + CHECK(pdb.query_info(fid, "nonexistent").empty()); auto sources = pdb.query_sources(fid); REQUIRE(sources.size() == 1); @@ -293,7 +293,7 @@ TEST_SUITE("ReorganizationPlanner") { CHECK(sources[0].num_checkpoints == 9); CHECK(sources[0].event_hash == "abc123"); - auto segments = pdb.query_segments(0); + auto segments = pdb.query_segments(fid, 0); REQUIRE(segments.size() == 2); CHECK(segments[0].source_checkpoint == 0); CHECK(segments[0].output_line_start == 0); @@ -301,8 +301,8 @@ TEST_SUITE("ReorganizationPlanner") { CHECK(segments[0].event_count == 50); CHECK(segments[1].source_checkpoint == 1); - 
CHECK(pdb.query_group_name() == "io"); - CHECK(pdb.query_group_predicate() == R"(cat == "POSIX")"); + CHECK(pdb.query_group_name(fid) == "io"); + CHECK(pdb.query_group_predicate(fid) == R"(cat == "POSIX")"); fs::remove_all(test_dir); } diff --git a/tests/utilities/composites/dft/reorganize/test_reorganize_integration.cpp b/tests/utilities/composites/dft/reorganize/test_reorganize_integration.cpp index a65262d4..29c8e6ee 100644 --- a/tests/utilities/composites/dft/reorganize/test_reorganize_integration.cpp +++ b/tests/utilities/composites/dft/reorganize/test_reorganize_integration.cpp @@ -164,10 +164,10 @@ static void execute_extraction(const ExtractionPlan& plan, } } - std::string idx_path = + std::string index_path = internal::determine_index_path(src.file_path, index_dir); auto reader_input = - IndexedReadInput::from_file(src.file_path).with_index(idx_path); + IndexedReadInput::from_file(src.file_path).with_index(index_path); IndexedFileReaderUtility reader_utility; auto reader = reader_utility.process(reader_input).get(); @@ -405,11 +405,11 @@ TEST_SUITE("ReorganizeIntegration") { REQUIRE(fid >= 0); pdb.begin_transaction(); - pdb.insert_info("version", "1.0"); - pdb.insert_info("tool", "dftracer_organize"); - pdb.insert_group("io", R"(cat == "POSIX")"); + pdb.insert_info(fid, "version", "1.0"); + pdb.insert_info(fid, "tool", "dftracer_organize"); + pdb.insert_group(fid, "io", R"(cat == "POSIX")"); pdb.insert_source(fid, 0, trace_file, 1, ""); - pdb.insert_segment(0, 0, 0, 5, 3); + pdb.insert_segment(fid, 0, 0, 0, 5, 3); pdb.commit_transaction(); } @@ -420,16 +420,16 @@ TEST_SUITE("ReorganizeIntegration") { int fid = pdb.get_file_info_id(io_gz); REQUIRE(fid >= 0); - CHECK(pdb.query_info("version") == "1.0"); - CHECK(pdb.query_info("tool") == "dftracer_organize"); - CHECK(pdb.query_group_name() == "io"); - CHECK(pdb.query_group_predicate() == R"(cat == "POSIX")"); + CHECK(pdb.query_info(fid, "version") == "1.0"); + CHECK(pdb.query_info(fid, "tool") == 
"dftracer_organize"); + CHECK(pdb.query_group_name(fid) == "io"); + CHECK(pdb.query_group_predicate(fid) == R"(cat == "POSIX")"); auto sources = pdb.query_sources(fid); REQUIRE(sources.size() == 1); CHECK(sources[0].path == trace_file); - auto segments = pdb.query_segments(0); + auto segments = pdb.query_segments(fid, 0); REQUIRE(segments.size() == 1); CHECK(segments[0].output_line_start == 0); CHECK(segments[0].output_line_end == 5); diff --git a/tests/utilities/composites/dft/statistics/test_statistics_aggregator.cpp b/tests/utilities/composites/dft/statistics/test_statistics_aggregator.cpp index 7e6242c0..205c85fd 100644 --- a/tests/utilities/composites/dft/statistics/test_statistics_aggregator.cpp +++ b/tests/utilities/composites/dft/statistics/test_statistics_aggregator.cpp @@ -1,6 +1,6 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include -#include +#include #include #include #include @@ -12,8 +12,8 @@ #include "testing_utilities.h" using namespace dftracer::utils; +using namespace dftracer::utils::utilities::composites::dft::internal; using namespace dftracer::utils::utilities::composites::dft::indexing; -using namespace dftracer::utils::utilities::composites::dft::indexing::queries; using namespace dftracer::utils::utilities::composites::dft::statistics; using dftracer::utils::utilities::indexer::IndexDatabase; using dftracer::utils::utilities::indexer::internal::get_logical_path; @@ -22,7 +22,7 @@ static void write_chunk( IndexDatabase& db, int fid, std::uint64_t checkpoint_idx, ChunkStatistics& stats, const std::vector>& dim_values) { - queries::insert_chunk_statistics(db.sql_db(), fid, checkpoint_idx, stats); + db.insert_chunk_statistics(fid, checkpoint_idx, stats); std::unordered_map dim_stats; for (const auto& [dim, val] : dim_values) { @@ -32,14 +32,13 @@ static void write_chunk( ds.observe(val); } for (const auto& [dim, ds] : dim_stats) { - queries::insert_chunk_dimension_stats(db.sql_db(), fid, checkpoint_idx, - ds); + 
db.insert_chunk_dimension_stats(fid, checkpoint_idx, ds); } } -static void populate_test_idx(const std::string& idx_path, - const std::string& file_path) { - IndexDatabase idx_db(idx_path); +static void populate_test_db(const std::string& db_root, + const std::string& file_path) { + IndexDatabase idx_db(db_root); idx_db.init_base_schema(); idx_db.init_bloom_schema(); @@ -93,14 +92,15 @@ TEST_SUITE("StatisticsAggregatorUtility") { dft_utils_test::make_unique_test_path("test_stats_agg").string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string db_root = + determine_index_path(test_dir + "/test.pfw.gz", ""); std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_db(db_root, file_path); StatisticsAggregatorUtility aggregator; StatisticsAggregatorInput input; input.file_path = file_path; - input.idx_path = idx_path; + input.index_path = db_root; auto result = aggregator.process(input).get(); @@ -135,9 +135,9 @@ TEST_SUITE("StatisticsAggregatorUtility") { StatisticsAggregatorUtility aggregator; StatisticsAggregatorInput input; input.file_path = "/fake/nonexistent.pfw.gz"; - input.idx_path = - dft_utils_test::make_unique_test_path("nonexistent").string() + - ".idx"; + input.index_path = + (dft_utils_test::make_unique_test_path("nonexistent") / ".dftindex") + .string(); auto result = aggregator.process(input).get(); @@ -151,14 +151,15 @@ TEST_SUITE("StatisticsAggregatorUtility") { .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string db_root = + determine_index_path(test_dir + "/test.pfw.gz", ""); std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_db(db_root, file_path); StatisticsAggregatorUtility aggregator; StatisticsAggregatorInput input; input.file_path = "/fake/other_file.pfw.gz"; - input.idx_path = idx_path; + input.index_path = db_root; auto result = 
aggregator.process(input).get(); @@ -174,11 +175,12 @@ TEST_SUITE("StatisticsAggregatorUtility") { .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string db_root = + determine_index_path(test_dir + "/test.pfw.gz", ""); std::string file_path = "/fake/test.pfw.gz"; // Create idx with file_info but no chunk_statistics - IndexDatabase idx_db(idx_path); + IndexDatabase idx_db(db_root); idx_db.init_base_schema(); idx_db.init_bloom_schema(); idx_db.get_or_create_file_info(get_logical_path(file_path), 12345); @@ -186,7 +188,7 @@ TEST_SUITE("StatisticsAggregatorUtility") { StatisticsAggregatorUtility aggregator; StatisticsAggregatorInput input; input.file_path = file_path; - input.idx_path = idx_path; + input.index_path = db_root; auto result = aggregator.process(input).get(); @@ -204,10 +206,11 @@ TEST_SUITE("StatisticsAggregatorUtility") { .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string db_root = + determine_index_path(test_dir + "/test.pfw.gz", ""); std::string file_path = "/fake/test.pfw.gz"; - IndexDatabase idx_db(idx_path); + IndexDatabase idx_db(db_root); idx_db.init_base_schema(); idx_db.init_bloom_schema(); int fid = @@ -220,7 +223,7 @@ TEST_SUITE("StatisticsAggregatorUtility") { ChunkStatistics stats; stats.update_from_event("op", "cat", 1, 1, 1000, 10); stats.update_from_event("op", "cat", 1, 1, 2000, 20); - queries::insert_chunk_statistics(idx_db.sql_db(), fid, 0, stats); + idx_db.insert_chunk_statistics(fid, 0, stats); } // Chunk 1: durations 30, 40, 50 @@ -229,7 +232,7 @@ TEST_SUITE("StatisticsAggregatorUtility") { stats.update_from_event("op", "cat", 1, 1, 3000, 30); stats.update_from_event("op", "cat", 1, 1, 4000, 40); stats.update_from_event("op", "cat", 1, 1, 5000, 50); - queries::insert_chunk_statistics(idx_db.sql_db(), fid, 1, stats); + idx_db.insert_chunk_statistics(fid, 1, stats); } idx_db.commit_transaction(); @@ -237,7 +240,7 
@@ TEST_SUITE("StatisticsAggregatorUtility") { StatisticsAggregatorUtility aggregator; StatisticsAggregatorInput input; input.file_path = file_path; - input.idx_path = idx_path; + input.index_path = db_root; auto result = aggregator.process(input).get(); diff --git a/tests/utilities/composites/dft/statistics/test_statistics_query.cpp b/tests/utilities/composites/dft/statistics/test_statistics_query.cpp index 8d98f2f8..125fc6a2 100644 --- a/tests/utilities/composites/dft/statistics/test_statistics_query.cpp +++ b/tests/utilities/composites/dft/statistics/test_statistics_query.cpp @@ -13,7 +13,7 @@ static TraceStatistics make_test_stats() { ts.success = true; ts.num_chunks = 2; ts.file_path = "/test/file.pfw.gz"; - ts.idx_path = "/test/file.pfw.gz.idx"; + ts.index_path = "/test/file.pfw.gz.idx"; // Simulate a variety of events ts.merged.update_from_event("read", "POSIX", 1, 1, 1000, 100); diff --git a/tests/utilities/composites/dft/statistics/test_trace_statistics.cpp b/tests/utilities/composites/dft/statistics/test_trace_statistics.cpp index 16e3a0e7..6af11bcb 100644 --- a/tests/utilities/composites/dft/statistics/test_trace_statistics.cpp +++ b/tests/utilities/composites/dft/statistics/test_trace_statistics.cpp @@ -53,7 +53,7 @@ TEST_SUITE("TraceStatistics") { TEST_CASE("TraceStatistics - to_json produces valid JSON") { TraceStatistics ts; ts.file_path = "/test/file.pfw.gz"; - ts.idx_path = "/test/file.pfw.gz.idx"; + ts.index_path = "/test/file.pfw.gz.idx"; ts.success = true; ts.num_chunks = 2; @@ -97,7 +97,7 @@ TEST_SUITE("TraceStatistics") { TEST_CASE("TraceStatistics - to_json with error") { TraceStatistics ts; ts.file_path = "/test/missing.pfw.gz"; - ts.idx_path = "/test/missing.pfw.gz.idx"; + ts.index_path = "/test/missing.pfw.gz.idx"; ts.success = false; ts.error_message = "File not found"; diff --git a/tests/utilities/composites/dft/test_index_builder.cpp b/tests/utilities/composites/dft/test_index_builder.cpp index 85be7531..19ae5f69 100644 --- 
a/tests/utilities/composites/dft/test_index_builder.cpp +++ b/tests/utilities/composites/dft/test_index_builder.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -14,6 +15,7 @@ using namespace dftracer::utils; using namespace dftracer::utils::utilities::indexer; using namespace dftracer::utils::utilities::behaviors; +using namespace dftracer::utils::utilities::composites::dft::internal; using namespace dft_utils_test; namespace tags = dftracer::utils::utilities::tags; @@ -46,7 +48,7 @@ TEST_SUITE("IndexBuilder") { SUBCASE("Build index for gzip file") { std::string gz_file = env.create_dft_test_gzip_file(50); - std::string idx_path = gz_file + ".idx"; + std::string db_root = determine_index_path(gz_file, ""); auto input = IndexBuildConfig::for_file(gz_file) .with_index_dir("") @@ -56,16 +58,15 @@ TEST_SUITE("IndexBuilder") { auto output = run_builder(input); CHECK(output.file_path == gz_file); - CHECK(output.idx_path == idx_path); + CHECK(output.index_path == db_root); CHECK(output.success == true); CHECK(output.was_skipped == false); - CHECK(fs::exists(idx_path)); + CHECK(fs::exists(db_root)); } SUBCASE("Use existing index without force rebuild") { std::string gz_file = env.create_dft_test_gzip_file(20); - std::string idx_path = gz_file + ".idx"; auto input1 = IndexBuildConfig::for_file(gz_file) .with_index_dir("") @@ -85,7 +86,6 @@ TEST_SUITE("IndexBuilder") { TestEnvironment env(100); std::string gz_file = env.create_dft_test_gzip_file(30); - std::string idx_path = gz_file + ".idx"; auto input = IndexBuildConfig::for_file(gz_file) .with_index_dir("") diff --git a/tests/utilities/composites/dft/test_metadata_collector.cpp b/tests/utilities/composites/dft/test_metadata_collector.cpp index 5762974e..7e4af6c1 100644 --- a/tests/utilities/composites/dft/test_metadata_collector.cpp +++ b/tests/utilities/composites/dft/test_metadata_collector.cpp @@ -1,6 +1,7 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include +#include 
#include #include @@ -145,9 +146,10 @@ TEST_SUITE("MetadataCollector") { REQUIRE(fs::exists(gz_file)); // Create input with index - std::string idx_path = gz_file + ".idx"; + std::string index_path = + internal::determine_index_path(gz_file, ""); auto input = MetadataCollectorUtilityInput::from_file(gz_file) - .with_index(idx_path) + .with_index(index_path) .with_checkpoint_size(1024 * 1024) // 1MB .with_force_rebuild(true) .with_compute_hash(true); @@ -165,7 +167,7 @@ TEST_SUITE("MetadataCollector") { CHECK(output.format == ArchiveFormat::GZIP); CHECK(output.has_index == true); CHECK(output.index_valid == true); - CHECK(output.idx_path == idx_path); + CHECK(output.index_path == index_path); CHECK(output.compressed_size > 0); CHECK(output.uncompressed_size > 0); CHECK(output.compressed_size < @@ -175,7 +177,7 @@ TEST_SUITE("MetadataCollector") { CHECK(output.error_message.empty()); // Verify index file was created - CHECK(fs::exists(idx_path)); + CHECK(fs::exists(index_path)); } SUBCASE("Reuse existing index") { @@ -186,12 +188,13 @@ TEST_SUITE("MetadataCollector") { int result = std::system(cmd.c_str()); REQUIRE(result == 0); - std::string idx_path = gz_file + ".idx"; + std::string index_path = + internal::determine_index_path(gz_file, ""); // First run - build index { auto input = MetadataCollectorUtilityInput::from_file(gz_file) - .with_index(idx_path) + .with_index(index_path) .with_force_rebuild(true) .with_compute_hash(true); @@ -201,13 +204,13 @@ TEST_SUITE("MetadataCollector") { CHECK(output.success == true); CHECK(output.has_index == true); CHECK(output.index_valid == true); - CHECK(fs::exists(idx_path)); + CHECK(fs::exists(index_path)); } // Second run - reuse index (no force rebuild) { auto input = MetadataCollectorUtilityInput::from_file(gz_file) - .with_index(idx_path) + .with_index(index_path) .with_force_rebuild(false) .with_compute_hash(true); diff --git a/tests/utilities/composites/dft/views/test_view_builder.cpp 
b/tests/utilities/composites/dft/views/test_view_builder.cpp index 17f06cac..ca2c86f3 100644 --- a/tests/utilities/composites/dft/views/test_view_builder.cpp +++ b/tests/utilities/composites/dft/views/test_view_builder.cpp @@ -1,7 +1,6 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include -#include #include #include #include @@ -24,9 +23,9 @@ using dftracer::utils::utilities::indexer::internal::get_logical_path; // 1: name={open,close}, cat={POSIX} // 2: name={train}, cat={compute} // 3: name={forward}, cat={compute,ai_framework} -static void populate_test_idx(const std::string& idx_path, +static void populate_test_idx(const std::string& index_path, const std::string& file_path) { - IndexDatabase idx_db(idx_path); + IndexDatabase idx_db(index_path); idx_db.init_base_schema(); idx_db.init_bloom_schema(); @@ -58,10 +57,9 @@ static void populate_test_idx(const std::string& idx_path, file_name_bloom.add(n); } auto name_blob = name_bloom.serialize(); - queries::insert_chunk_bloom_filter( - idx_db.sql_db(), fid, static_cast(ckpt), "name", - name_blob.data(), static_cast(name_blob.size()), - name_bloom.num_entries()); + idx_db.insert_chunk_bloom_filter( + fid, static_cast(ckpt), "name", name_blob.data(), + static_cast(name_blob.size()), name_bloom.num_entries()); BloomFilter cat_bloom(100, 0.01); for (const auto& c : chunks[ckpt].cats) { @@ -69,25 +67,24 @@ static void populate_test_idx(const std::string& idx_path, file_cat_bloom.add(c); } auto cat_blob = cat_bloom.serialize(); - queries::insert_chunk_bloom_filter( - idx_db.sql_db(), fid, static_cast(ckpt), "cat", - cat_blob.data(), static_cast(cat_blob.size()), - cat_bloom.num_entries()); + idx_db.insert_chunk_bloom_filter( + fid, static_cast(ckpt), "cat", cat_blob.data(), + static_cast(cat_blob.size()), cat_bloom.num_entries()); } // File-level bloom filters auto name_blob = file_name_bloom.serialize(); - queries::insert_file_bloom_filter( - idx_db.sql_db(), fid, "name", name_blob.data(), - 
static_cast(name_blob.size()), file_name_bloom.num_entries()); + idx_db.insert_file_bloom_filter(fid, "name", name_blob.data(), + static_cast(name_blob.size()), + file_name_bloom.num_entries()); auto cat_blob = file_cat_bloom.serialize(); - queries::insert_file_bloom_filter( - idx_db.sql_db(), fid, "cat", cat_blob.data(), - static_cast(cat_blob.size()), file_cat_bloom.num_entries()); + idx_db.insert_file_bloom_filter(fid, "cat", cat_blob.data(), + static_cast(cat_blob.size()), + file_cat_bloom.num_entries()); - queries::insert_index_dimension(idx_db.sql_db(), fid, "name"); - queries::insert_index_dimension(idx_db.sql_db(), fid, "cat"); + idx_db.insert_index_dimension(fid, "name"); + idx_db.insert_index_dimension(fid, "cat"); idx_db.commit_transaction(); } @@ -99,14 +96,14 @@ TEST_SUITE("ViewBuilderUtility") { .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); ViewBuilderInput input; input.with_view(ViewDefinition::io_view()) .with_file_path(file_path) - .with_idx_path(idx_path) + .with_index_path(index_path) .with_uncompressed_size(40000) .with_num_checkpoints(4); @@ -135,14 +132,14 @@ TEST_SUITE("ViewBuilderUtility") { .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); ViewBuilderInput input; input.with_view(ViewDefinition::compute_view()) .with_file_path(file_path) - .with_idx_path(idx_path) + .with_index_path(index_path) .with_uncompressed_size(40000) .with_num_checkpoints(4); @@ -165,9 +162,9 @@ TEST_SUITE("ViewBuilderUtility") { .string(); fs::create_directories(test_dir); - std::string 
idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); ViewDefinition view; view.with_name("nonexistent").with_query(R"(cat == "NONEXISTENT")"); @@ -175,7 +172,7 @@ TEST_SUITE("ViewBuilderUtility") { ViewBuilderInput input; input.with_view(view) .with_file_path(file_path) - .with_idx_path(idx_path) + .with_index_path(index_path) .with_uncompressed_size(40000) .with_num_checkpoints(4); @@ -196,9 +193,9 @@ TEST_SUITE("ViewBuilderUtility") { .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; - populate_test_idx(idx_path, file_path); + populate_test_idx(index_path, file_path); ViewDefinition view; view.with_name("time_only").with_query(R"(ts >= 0 and ts <= 100000)"); @@ -206,7 +203,7 @@ TEST_SUITE("ViewBuilderUtility") { ViewBuilderInput input; input.with_view(view) .with_file_path(file_path) - .with_idx_path(idx_path) + .with_index_path(index_path) .with_uncompressed_size(40000) .with_num_checkpoints(4); @@ -228,7 +225,7 @@ TEST_SUITE("ViewBuilderUtility") { ViewBuilderInput input; input.with_view(view) .with_file_path("/fake/file.pfw.gz") - .with_idx_path("") // No bloom index + .with_index_path("") // No bloom index .with_uncompressed_size(30000) .with_num_checkpoints(3); @@ -249,7 +246,7 @@ TEST_SUITE("ViewBuilderUtility") { ViewBuilderInput input; input.with_view(view) .with_file_path("/fake/file.pfw.gz") - .with_idx_path("") + .with_index_path("") .with_uncompressed_size(12000) .with_num_checkpoints(3); @@ -281,7 +278,7 @@ TEST_SUITE("ViewBuilderUtility") { ViewBuilderInput input; input.with_view(view) .with_file_path("/fake/file.pfw.gz") - .with_idx_path("") + .with_index_path("") .with_uncompressed_size(10000) .with_num_checkpoints(0); @@ 
-301,11 +298,11 @@ TEST_SUITE("ViewBuilderUtility") { .string(); fs::create_directories(test_dir); - std::string idx_path = test_dir + "/test.pfw.gz.idx"; + std::string index_path = test_dir + "/test.pfw.gz.idx"; std::string file_path = "/fake/test.pfw.gz"; // Create idx with fhash dimension - IndexDatabase idx_db(idx_path); + IndexDatabase idx_db(index_path); idx_db.init_base_schema(); idx_db.init_bloom_schema(); int fid = @@ -316,15 +313,14 @@ TEST_SUITE("ViewBuilderUtility") { fhash_bloom.add("hash123"); auto blob = fhash_bloom.serialize(); - queries::insert_file_bloom_filter( - idx_db.sql_db(), fid, "fhash", blob.data(), - static_cast(blob.size()), fhash_bloom.num_entries()); - queries::insert_chunk_bloom_filter( - idx_db.sql_db(), fid, 0, "fhash", blob.data(), - static_cast(blob.size()), fhash_bloom.num_entries()); - queries::insert_index_dimension(idx_db.sql_db(), fid, "fhash"); - queries::insert_hash_resolution(idx_db.sql_db(), fid, "fhash", - "hash123", "/data/file.h5"); + idx_db.insert_file_bloom_filter(fid, "fhash", blob.data(), + static_cast(blob.size()), + fhash_bloom.num_entries()); + idx_db.insert_chunk_bloom_filter(fid, 0, "fhash", blob.data(), + static_cast(blob.size()), + fhash_bloom.num_entries()); + idx_db.insert_index_dimension(fid, "fhash"); + idx_db.insert_hash_resolution(fid, "fhash", "hash123", "/data/file.h5"); idx_db.commit_transaction(); // Use "file" alias which should resolve to "fhash" @@ -334,7 +330,7 @@ TEST_SUITE("ViewBuilderUtility") { ViewBuilderInput input; input.with_view(view) .with_file_path(file_path) - .with_idx_path(idx_path) + .with_index_path(index_path) .with_uncompressed_size(10000) .with_num_checkpoints(1); diff --git a/tests/utilities/composites/dft/views/test_view_reader.cpp b/tests/utilities/composites/dft/views/test_view_reader.cpp index 4c38a49b..b988040b 100644 --- a/tests/utilities/composites/dft/views/test_view_reader.cpp +++ b/tests/utilities/composites/dft/views/test_view_reader.cpp @@ -3,6 +3,7 @@ #include 
#include #include +#include #include #include #include @@ -15,6 +16,7 @@ #include using namespace dftracer::utils; +using namespace dftracer::utils::utilities::composites::dft::internal; using namespace dftracer::utils::utilities::composites::dft::views; using namespace dft_utils_test; using dftracer::utils::utilities::common::query::Query; @@ -62,10 +64,11 @@ TEST_SUITE("ViewReader") { TestEnvironment env(200); REQUIRE(env.is_valid()); std::string gz = create_pfw_gz(env, 50); + std::string db_root = determine_index_path(gz, ""); ViewReaderInput input; input.with_file_path(gz) - .with_idx_path(gz + ".idx") + .with_index_path(db_root) .with_checkpoint_size(1024) .with_byte_range(0, std::numeric_limits::max()); input.view.with_include_metadata(false); @@ -82,10 +85,11 @@ TEST_SUITE("ViewReader") { TestEnvironment env(200); REQUIRE(env.is_valid()); std::string gz = create_pfw_gz(env, 50); + std::string db_root = determine_index_path(gz, ""); ViewReaderInput input; input.with_file_path(gz) - .with_idx_path(gz + ".idx") + .with_index_path(db_root) .with_checkpoint_size(1024) .with_byte_range(0, std::numeric_limits::max()); input.view.with_include_metadata(false); @@ -105,10 +109,11 @@ TEST_SUITE("ViewReader") { TestEnvironment env(200); REQUIRE(env.is_valid()); std::string gz = create_pfw_gz(env, 50); + std::string db_root = determine_index_path(gz, ""); ViewReaderInput input; input.with_file_path(gz) - .with_idx_path(gz + ".idx") + .with_index_path(db_root) .with_checkpoint_size(1024) .with_byte_range(0, std::numeric_limits::max()); input.view.with_include_metadata(false); diff --git a/tests/utilities/composites/test_indexed_file_reader.cpp b/tests/utilities/composites/test_indexed_file_reader.cpp index b0abe56b..e8441d57 100644 --- a/tests/utilities/composites/test_indexed_file_reader.cpp +++ b/tests/utilities/composites/test_indexed_file_reader.cpp @@ -1,4 +1,5 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include #include #include #include @@ -14,6 +15,7 @@ using 
namespace dftracer::utils; using namespace dftracer::utils::utilities::indexer::internal; using namespace dftracer::utils::utilities::reader::internal; using namespace dftracer::utils::utilities::composites; +using namespace dftracer::utils::utilities::composites::dft::internal; using namespace dft_utils_test; TEST_SUITE("IndexedFileReader") { @@ -21,23 +23,23 @@ TEST_SUITE("IndexedFileReader") { SUBCASE("Process gzip file without existing index") { TestEnvironment env(10); std::string gz_path = env.create_test_gzip_file(); - std::string idx_path = gz_path + ".idx"; + std::string db_root = determine_index_path(gz_path, ""); // Ensure no index exists initially - if (fs::exists(idx_path)) { - fs::remove(idx_path); + if (fs::exists(db_root)) { + fs::remove_all(db_root); } IndexedFileReaderUtility reader_utility; IndexedReadInput input = IndexedReadInput::from_file(gz_path) - .with_index(idx_path) + .with_index(db_root) .with_checkpoint_size(1024); // Process should create index and return reader auto reader = reader_utility.process(input).get(); CHECK(reader != nullptr); - CHECK(fs::exists(idx_path)); // Index should be created + CHECK(fs::exists(db_root)); // Verify reader can read lines auto stream = @@ -55,45 +57,37 @@ TEST_SUITE("IndexedFileReader") { SUBCASE("Process gzip file with existing index") { TestEnvironment env(5); std::string gz_path = env.create_test_gzip_file(); - std::string idx_path = gz_path + ".idx"; + std::string db_root = determine_index_path(gz_path, ""); // Create index first - auto indexer = - IndexerFactory::create(gz_path, idx_path, 1024, true); + auto indexer = IndexerFactory::create(gz_path, db_root, 1024, true); REQUIRE(indexer != nullptr); indexer->build(); - REQUIRE(fs::exists(idx_path)); - - // Get initial modification time - auto initial_mtime = fs::last_write_time(idx_path); + REQUIRE(fs::exists(db_root)); // Process with existing index (should not rebuild) IndexedFileReaderUtility reader_utility; IndexedReadInput input = 
IndexedReadInput::from_file(gz_path) - .with_index(idx_path) + .with_index(db_root) .with_checkpoint_size(1024); auto reader = reader_utility.process(input).get(); CHECK(reader != nullptr); - CHECK(fs::exists(idx_path)); - - // Index should not be rebuilt (same modification time) - auto current_mtime = fs::last_write_time(idx_path); - CHECK(current_mtime == initial_mtime); + CHECK(fs::exists(db_root)); + CHECK(reader->get_num_lines() > 0); } SUBCASE("Force rebuild existing index") { TestEnvironment env(5); std::string gz_path = env.create_test_gzip_file(); - std::string idx_path = gz_path + ".idx"; + std::string db_root = determine_index_path(gz_path, ""); // Create index first - auto indexer = - IndexerFactory::create(gz_path, idx_path, 1024, true); + auto indexer = IndexerFactory::create(gz_path, db_root, 1024, true); REQUIRE(indexer != nullptr); indexer->build(); - REQUIRE(fs::exists(idx_path)); + REQUIRE(fs::exists(db_root)); // Sleep to ensure different timestamp std::this_thread::sleep_for(std::chrono::milliseconds(10)); @@ -101,14 +95,14 @@ TEST_SUITE("IndexedFileReader") { // Process with force rebuild IndexedFileReaderUtility reader_utility; IndexedReadInput input = IndexedReadInput::from_file(gz_path) - .with_index(idx_path) + .with_index(db_root) .with_checkpoint_size(1024) .with_force_rebuild(true); auto reader = reader_utility.process(input).get(); CHECK(reader != nullptr); - CHECK(fs::exists(idx_path)); + CHECK(fs::exists(db_root)); // Reader should work CHECK(reader->get_num_lines() > 0); @@ -119,19 +113,19 @@ TEST_SUITE("IndexedFileReader") { SUBCASE("Configure checkpoint size") { TestEnvironment env(20); std::string gz_path = env.create_test_gzip_file(); - std::string idx_path = gz_path + ".idx"; + std::string db_root = determine_index_path(gz_path, ""); IndexedFileReaderUtility reader_utility; // Use custom checkpoint size IndexedReadInput input = IndexedReadInput::from_file(gz_path) - .with_index(idx_path) + .with_index(db_root) 
.with_checkpoint_size(2048); auto reader = reader_utility.process(input).get(); CHECK(reader != nullptr); - CHECK(fs::exists(idx_path)); + CHECK(fs::exists(db_root)); // Verify reader works CHECK(reader->get_num_lines() == 20); @@ -145,7 +139,7 @@ TEST_SUITE("IndexedFileReader") { // Test fluent API auto input = IndexedReadInput::from_file(gz_path) - .with_index(gz_path + ".idx") + .with_index(determine_index_path(gz_path, "")) .with_checkpoint_size(512) .with_force_rebuild(false); @@ -158,17 +152,17 @@ TEST_SUITE("IndexedFileReader") { SUBCASE("Constructor with all parameters") { TestEnvironment env(5); std::string gz_path = env.create_test_gzip_file(); - std::string idx_path = gz_path + ".idx"; + std::string db_root = determine_index_path(gz_path, ""); IndexedFileReaderUtility reader_utility; // Use constructor directly - IndexedReadInput input(gz_path, idx_path, 1024, false); + IndexedReadInput input(gz_path, db_root, 1024, false); auto reader = reader_utility.process(input).get(); CHECK(reader != nullptr); - CHECK(fs::exists(idx_path)); + CHECK(fs::exists(db_root)); } } @@ -177,7 +171,7 @@ TEST_SUITE("IndexedFileReader") { IndexedFileReaderUtility reader_utility; IndexedReadInput input = IndexedReadInput::from_file("non_existent.gz") - .with_index("non_existent.gz.idx"); + .with_index("non_existent.gz.dftindex"); CHECK_THROWS_AS(reader_utility.process(input).get(), std::runtime_error); @@ -187,7 +181,7 @@ TEST_SUITE("IndexedFileReader") { IndexedFileReaderUtility reader_utility; IndexedReadInput input = IndexedReadInput::from_file("/invalid/path/file.gz") - .with_index("/invalid/path/file.gz.idx"); + .with_index("/invalid/path/.dftindex"); CHECK_THROWS_AS(reader_utility.process(input).get(), std::runtime_error); @@ -196,7 +190,7 @@ TEST_SUITE("IndexedFileReader") { SUBCASE("Empty file path") { IndexedFileReaderUtility reader_utility; IndexedReadInput input = - IndexedReadInput::from_file("").with_index("file.gz.idx"); + 
IndexedReadInput::from_file("").with_index(".dftindex"); CHECK_THROWS_AS(reader_utility.process(input).get(), std::runtime_error); @@ -210,8 +204,8 @@ TEST_SUITE("IndexedFileReader") { IndexedFileReaderUtility reader_utility; IndexedReadInput input = - IndexedReadInput::from_file(gz_path).with_index(gz_path + - ".idx"); + IndexedReadInput::from_file(gz_path).with_index( + determine_index_path(gz_path, "")); auto reader = reader_utility.process(input).get(); @@ -240,13 +234,14 @@ TEST_SUITE("IndexedFileReader") { SUBCASE("Rebuild when file modified after index") { TestEnvironment env(5); std::string gz_path = env.create_test_gzip_file(); - std::string idx_path = gz_path + ".idx"; + std::string index_path = gz_path + ".idx"; + std::string db_root = determine_index_path(gz_path, ""); // Create index auto indexer = - IndexerFactory::create(gz_path, idx_path, 1024, true); + IndexerFactory::create(gz_path, index_path, 1024, true); indexer->build(); - REQUIRE(fs::exists(idx_path)); + REQUIRE(fs::exists(db_root)); // Sleep to ensure different timestamp std::this_thread::sleep_for(std::chrono::milliseconds(100)); @@ -258,7 +253,7 @@ TEST_SUITE("IndexedFileReader") { // Process should detect outdated index and rebuild IndexedFileReaderUtility reader_utility; IndexedReadInput input = - IndexedReadInput::from_file(gz_path).with_index(idx_path); + IndexedReadInput::from_file(gz_path).with_index(index_path); auto reader = reader_utility.process(input).get(); @@ -271,24 +266,23 @@ TEST_SUITE("IndexedFileReader") { SUBCASE("No rebuild when index is up to date") { TestEnvironment env(5); std::string gz_path = env.create_test_gzip_file(); - std::string idx_path = gz_path + ".idx"; + std::string index_path = gz_path + ".idx"; + std::string db_root = determine_index_path(gz_path, ""); // Create index auto indexer = - IndexerFactory::create(gz_path, idx_path, 1024, true); + IndexerFactory::create(gz_path, index_path, 1024, true); indexer->build(); - auto initial_mtime = 
fs::last_write_time(idx_path); - // Process without modifying file IndexedFileReaderUtility reader_utility; IndexedReadInput input = - IndexedReadInput::from_file(gz_path).with_index(idx_path); + IndexedReadInput::from_file(gz_path).with_index(index_path); auto reader = reader_utility.process(input).get(); CHECK(reader != nullptr); - auto final_mtime = fs::last_write_time(idx_path); - CHECK(initial_mtime == final_mtime); + CHECK(fs::exists(db_root)); + CHECK(reader->get_num_lines() > 0); } } @@ -338,7 +332,8 @@ TEST_SUITE("IndexedFileReader") { CHECK(reader->get_num_lines() == 10); CHECK(reader->get_archive_path() == gz_path); - CHECK(reader->get_idx_path() == gz_path + ".idx"); + CHECK(reader->get_index_path() == + determine_index_path(gz_path, "")); } } } diff --git a/tests/utilities/composites/test_line_batch_processor.cpp b/tests/utilities/composites/test_line_batch_processor.cpp index eddb45fa..50dbd490 100644 --- a/tests/utilities/composites/test_line_batch_processor.cpp +++ b/tests/utilities/composites/test_line_batch_processor.cpp @@ -66,11 +66,11 @@ TEST_SUITE("LineBatchProcessor") { SUBCASE("Process lines from compressed file") { TestEnvironment env(15); std::string gz_path = env.create_test_gzip_file(); - std::string idx_path = gz_path + ".idx"; + std::string index_path = gz_path + ".idx"; // Create index auto indexer = - IndexerFactory::create(gz_path, idx_path, 1024, true); + IndexerFactory::create(gz_path, index_path, 1024, true); REQUIRE(indexer != nullptr); indexer->build(); @@ -85,7 +85,7 @@ TEST_SUITE("LineBatchProcessor") { LineReadInput input; input.file_path = gz_path; - input.idx_path = idx_path; + input.index_path = index_path; auto results = batch.process(input).get(); @@ -217,11 +217,11 @@ TEST_SUITE("LineBatchProcessor") { SUBCASE("Process line range from compressed file") { TestEnvironment env(20); std::string gz_path = env.create_test_gzip_file(); - std::string idx_path = gz_path + ".idx"; + std::string index_path = gz_path + ".idx"; // 
Create index auto indexer = - IndexerFactory::create(gz_path, idx_path, 1024, true); + IndexerFactory::create(gz_path, index_path, 1024, true); REQUIRE(indexer != nullptr); indexer->build(); @@ -236,7 +236,7 @@ TEST_SUITE("LineBatchProcessor") { LineReadInput input; input.file_path = gz_path; - input.idx_path = idx_path; + input.index_path = index_path; input.start_line = 10; input.end_line = 15; diff --git a/tests/utilities/fileio/lines/test_streaming_line_reader.cpp b/tests/utilities/fileio/lines/test_streaming_line_reader.cpp index 486b199f..a00446b7 100644 --- a/tests/utilities/fileio/lines/test_streaming_line_reader.cpp +++ b/tests/utilities/fileio/lines/test_streaming_line_reader.cpp @@ -1,5 +1,6 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include +#include #include #include #include @@ -15,14 +16,10 @@ using namespace dftracer::utils::utilities::indexer::internal; using namespace dft_utils_test; TEST_SUITE("StreamingLineReader") { - fs::path test_file = - make_unique_test_path("test_streaming_line_reader.txt"); - fs::path gz_file = make_unique_test_path("test_streaming_line_reader.gz"); - fs::path tar_gz_file = make_unique_test_path("test_archive.tar.gz"); - fs::path tgz_file = make_unique_test_path("test_archive.tgz"); - TEST_CASE("StreamingLineReader - Basic Plain File Reading") { SUBCASE("Read entire plain text file") { + fs::path test_file = + make_unique_test_path("test_streaming_line_reader.txt"); { std::ofstream ofs(test_file); ofs << "Line 1\n"; @@ -50,6 +47,8 @@ TEST_SUITE("StreamingLineReader") { } SUBCASE("Read plain file with line range") { + fs::path test_file = + make_unique_test_path("test_streaming_line_reader.txt"); { std::ofstream ofs(test_file); for (int i = 1; i <= 10; ++i) { @@ -78,6 +77,8 @@ TEST_SUITE("StreamingLineReader") { } SUBCASE("Read empty file") { + fs::path test_file = + make_unique_test_path("test_streaming_line_reader.txt"); { std::ofstream ofs(test_file); // Create empty file } @@ -93,6 +94,8 @@ 
TEST_SUITE("StreamingLineReader") { } SUBCASE("Direct read_plain method") { + fs::path test_file = + make_unique_test_path("test_streaming_line_reader.txt"); { std::ofstream ofs(test_file); ofs << "Direct line 1\n"; @@ -119,6 +122,8 @@ TEST_SUITE("StreamingLineReader") { } SUBCASE("Direct read_plain with line range") { + fs::path test_file = + make_unique_test_path("test_streaming_line_reader.txt"); { std::ofstream ofs(test_file); for (int i = 1; i <= 5; ++i) { @@ -146,6 +151,11 @@ TEST_SUITE("StreamingLineReader") { TEST_CASE("StreamingLineReader - Format Detection") { SUBCASE("Detect .gz extension without index") { + fs::path test_dir = + make_unique_test_path("test_streaming_line_reader_gz_dir"); + fs::create_directories(test_dir); + fs::path gz_file = test_dir / "test_streaming_line_reader.gz"; + // Create a file with .gz extension (not actually compressed) { std::ofstream ofs(gz_file); @@ -153,9 +163,10 @@ TEST_SUITE("StreamingLineReader") { } // Ensure no index file exists - std::string idx_path = gz_file.string() + ".idx"; - if (fs::exists(idx_path)) { - fs::remove(idx_path); + std::string index_path = dftracer::utils::utilities::composites:: + dft::internal::determine_index_path(gz_file.string(), ""); + if (fs::exists(index_path)) { + fs::remove_all(index_path); } auto config = @@ -170,15 +181,26 @@ TEST_SUITE("StreamingLineReader") { CHECK(std::string(line.content) == "Fake gz content"); } - fs::remove(gz_file); + fs::remove_all(test_dir); } SUBCASE("Detect .tar.gz extension") { + fs::path test_dir = + make_unique_test_path("test_streaming_line_reader_targz_dir"); + fs::create_directories(test_dir); + fs::path tar_gz_file = test_dir / "test_archive.tar.gz"; + { std::ofstream ofs(tar_gz_file); ofs << "Fake tar.gz content\n"; } + std::string index_path = dftracer::utils::utilities::composites:: + dft::internal::determine_index_path(tar_gz_file.string(), ""); + if (fs::exists(index_path)) { + fs::remove_all(index_path); + } + auto config = 
StreamingLineReaderConfig().with_file(tar_gz_file.string()); @@ -191,15 +213,26 @@ TEST_SUITE("StreamingLineReader") { CHECK(std::string(line.content) == "Fake tar.gz content"); } - fs::remove(tar_gz_file); + fs::remove_all(test_dir); } SUBCASE("Detect .tgz extension") { + fs::path test_dir = + make_unique_test_path("test_streaming_line_reader_tgz_dir"); + fs::create_directories(test_dir); + fs::path tgz_file = test_dir / "test_archive.tgz"; + { std::ofstream ofs(tgz_file); ofs << "Fake tgz content\n"; } + std::string index_path = dftracer::utils::utilities::composites:: + dft::internal::determine_index_path(tgz_file.string(), ""); + if (fs::exists(index_path)) { + fs::remove_all(index_path); + } + auto config = StreamingLineReaderConfig().with_file(tgz_file.string()); @@ -211,7 +244,7 @@ TEST_SUITE("StreamingLineReader") { CHECK(std::string(line.content) == "Fake tgz content"); } - fs::remove(tgz_file); + fs::remove_all(test_dir); } SUBCASE("Auto-detect index file with real compressed file") { @@ -224,10 +257,11 @@ TEST_SUITE("StreamingLineReader") { REQUIRE(indexer != nullptr); indexer->build(); // Actually build the index - std::string idx_path = gz_path + ".idx"; + std::string index_path = dftracer::utils::utilities::composites:: + dft::internal::determine_index_path(gz_path, ""); // Verify index file was created - CHECK(fs::exists(idx_path)); + CHECK(fs::exists(index_path)); auto config = StreamingLineReaderConfig().with_file(gz_path); @@ -262,14 +296,16 @@ TEST_SUITE("StreamingLineReader") { REQUIRE(indexer != nullptr); indexer->build(); - std::string idx_path = gz_path + ".idx"; + std::string index_path = dftracer::utils::utilities::composites:: + dft::internal::determine_index_path(gz_path, ""); + CHECK(fs::exists(index_path)); // Test with explicit index path auto config = StreamingLineReaderConfig().with_file(gz_path).with_index( - idx_path); + index_path); - CHECK(config.index_path() == idx_path); + CHECK(config.index_path() == index_path); auto range = 
StreamingLineReader::read(config); @@ -284,6 +320,8 @@ TEST_SUITE("StreamingLineReader") { TEST_CASE("StreamingLineReader - Configuration API") { SUBCASE("Fluent configuration API") { + fs::path test_file = + make_unique_test_path("test_streaming_line_reader.txt"); { std::ofstream ofs(test_file); for (int i = 1; i <= 10; ++i) { @@ -340,6 +378,8 @@ TEST_SUITE("StreamingLineReader") { TEST_CASE("StreamingLineReader - Special Cases") { SUBCASE("File with no trailing newline") { + fs::path test_file = + make_unique_test_path("test_streaming_line_reader.txt"); { std::ofstream ofs(test_file); ofs << "Line 1\n"; @@ -365,6 +405,8 @@ TEST_SUITE("StreamingLineReader") { } SUBCASE("File with empty lines") { + fs::path test_file = + make_unique_test_path("test_streaming_line_reader.txt"); { std::ofstream ofs(test_file); ofs << "Line 1\n"; @@ -396,6 +438,8 @@ TEST_SUITE("StreamingLineReader") { } SUBCASE("Very long lines") { + fs::path test_file = + make_unique_test_path("test_streaming_line_reader.txt"); { std::ofstream ofs(test_file); std::string long_line(10000, 'A'); @@ -437,6 +481,8 @@ TEST_SUITE("StreamingLineReader") { } SUBCASE("Line range beyond file") { + fs::path test_file = + make_unique_test_path("test_streaming_line_reader.txt"); { std::ofstream ofs(test_file); ofs << "Line 1\n"; @@ -464,6 +510,8 @@ TEST_SUITE("StreamingLineReader") { } SUBCASE("Line range starting beyond file") { + fs::path test_file = + make_unique_test_path("test_streaming_line_reader.txt"); { std::ofstream ofs(test_file); ofs << "Line 1\n"; @@ -484,6 +532,8 @@ TEST_SUITE("StreamingLineReader") { TEST_CASE("StreamingLineReader - Large Files") { SUBCASE("Many lines") { + fs::path test_file = + make_unique_test_path("test_streaming_line_reader.txt"); { std::ofstream ofs(test_file); for (int i = 1; i <= 1000; ++i) { @@ -517,6 +567,8 @@ TEST_SUITE("StreamingLineReader") { TEST_CASE("StreamingLineReader - Real World Scenarios") { SUBCASE("CSV file processing") { + fs::path test_file = + 
make_unique_test_path("test_streaming_line_reader.txt"); { std::ofstream ofs(test_file); ofs << "Name,Age,City\n"; @@ -545,6 +597,8 @@ TEST_SUITE("StreamingLineReader") { } SUBCASE("Log file processing") { + fs::path test_file = + make_unique_test_path("test_streaming_line_reader.txt"); { std::ofstream ofs(test_file); ofs << "2024-01-01 INFO: Application started\n"; @@ -576,6 +630,8 @@ TEST_SUITE("StreamingLineReader") { } SUBCASE("JSONL file processing") { + fs::path test_file = + make_unique_test_path("test_streaming_line_reader.txt"); { std::ofstream ofs(test_file); ofs << R"({"id": 1, "name": "Item 1"})" << "\n"; @@ -614,7 +670,9 @@ TEST_SUITE("StreamingLineReader") { REQUIRE(indexer != nullptr); indexer->build(); - std::string idx_path = gz_path + ".idx"; + std::string index_path = dftracer::utils::utilities::composites:: + dft::internal::determine_index_path(gz_path, ""); + CHECK(fs::exists(index_path)); auto config = StreamingLineReaderConfig().with_file(gz_path).with_line_range( diff --git a/tests/utilities/indexer/test_index_builder.cpp b/tests/utilities/indexer/test_index_builder.cpp index 4a066748..ce0f4432 100644 --- a/tests/utilities/indexer/test_index_builder.cpp +++ b/tests/utilities/indexer/test_index_builder.cpp @@ -53,7 +53,7 @@ TEST_SUITE("IndexBuilder") { CHECK(result.success); CHECK_FALSE(result.was_skipped); - CHECK(fs::exists(result.idx_path)); + CHECK(fs::exists(result.index_path)); } TEST_CASE("BloomVisitor direct test") { @@ -105,9 +105,9 @@ TEST_SUITE("IndexBuilder") { }); REQUIRE(result.success); - REQUIRE(fs::exists(result.idx_path)); + REQUIRE(fs::exists(result.index_path)); - IndexDatabase db(result.idx_path); + IndexDatabase db(result.index_path); int fid = db.get_file_info_id(internal::get_logical_path(result.file_path)); REQUIRE(fid >= 0); @@ -134,9 +134,9 @@ TEST_SUITE("IndexBuilder") { }); REQUIRE(result.success); - REQUIRE(fs::exists(result.idx_path)); + REQUIRE(fs::exists(result.index_path)); - IndexDatabase 
db(result.idx_path); + IndexDatabase db(result.index_path); int fid = db.get_file_info_id(internal::get_logical_path(result.file_path)); REQUIRE(fid >= 0); @@ -163,9 +163,9 @@ TEST_SUITE("IndexBuilder") { }); REQUIRE(result.success); - REQUIRE(fs::exists(result.idx_path)); + REQUIRE(fs::exists(result.index_path)); - IndexDatabase db(result.idx_path); + IndexDatabase db(result.index_path); int fid = db.get_file_info_id(internal::get_logical_path(result.file_path)); REQUIRE(fid >= 0); @@ -297,7 +297,7 @@ TEST_SUITE("IndexBuilder") { // Verify no bloom data yet { - IndexDatabase db(r1.idx_path); + IndexDatabase db(r1.index_path); int fid = db.get_file_info_id(internal::get_logical_path(gz_file)); CHECK(fid >= 0); CHECK_FALSE(db.has_bloom_data(fid)); @@ -323,7 +323,7 @@ TEST_SUITE("IndexBuilder") { // Verify bloom data now exists { - IndexDatabase db(r2.idx_path); + IndexDatabase db(r2.index_path); int fid = db.get_file_info_id(internal::get_logical_path(gz_file)); CHECK(fid >= 0); CHECK(db.has_bloom_data(fid)); @@ -352,7 +352,7 @@ TEST_SUITE("IndexBuilder") { REQUIRE(r1.success); { - IndexDatabase db(r1.idx_path); + IndexDatabase db(r1.index_path); int fid = db.get_file_info_id(internal::get_logical_path(gz_file)); CHECK(db.has_bloom_data(fid)); CHECK_FALSE(db.has_manifest_data(fid)); @@ -378,7 +378,7 @@ TEST_SUITE("IndexBuilder") { // Verify both bloom and manifest exist { - IndexDatabase db(r2.idx_path); + IndexDatabase db(r2.index_path); int fid = db.get_file_info_id(internal::get_logical_path(gz_file)); CHECK(db.has_bloom_data(fid)); CHECK(db.has_manifest_data(fid)); diff --git a/tests/utilities/indexer/test_index_database.cpp b/tests/utilities/indexer/test_index_database.cpp index 914ce5f7..b8345791 100644 --- a/tests/utilities/indexer/test_index_database.cpp +++ b/tests/utilities/indexer/test_index_database.cpp @@ -1,620 +1,98 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN -#include +#include #include #include -#include #include -#include -#include -#include 
#include -#include -#include #include namespace fs = std::filesystem; using dftracer::utils::utilities::indexer::IndexDatabase; -namespace { - -bool table_exists(sqlite3* db, const char* name) { - sqlite3_stmt* stmt = nullptr; - sqlite3_prepare_v2( - db, "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?;", -1, - &stmt, nullptr); - sqlite3_bind_text(stmt, 1, name, -1, SQLITE_STATIC); - bool found = sqlite3_step(stmt) == SQLITE_ROW; - sqlite3_finalize(stmt); - return found; -} - -int row_count(sqlite3* db, const char* table) { - std::string sql = std::string("SELECT count(*) FROM ") + table + ";"; - sqlite3_stmt* stmt = nullptr; - sqlite3_prepare_v2(db, sql.c_str(), -1, &stmt, nullptr); - int count = 0; - if (sqlite3_step(stmt) == SQLITE_ROW) count = sqlite3_column_int(stmt, 0); - sqlite3_finalize(stmt); - return count; -} - -} // namespace - TEST_SUITE("IndexDatabase") { - TEST_CASE("Create and open database") { - auto path = dft_utils_test::make_unique_test_path("idx_create"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - CHECK(db.db() != nullptr); - } - - CHECK(fs::exists(db_path)); - fs::remove(db_path); - } - - TEST_CASE("init_base_schema creates tables") { - auto path = dft_utils_test::make_unique_test_path("idx_base"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - CHECK_NOTHROW(db.init_base_schema()); - - CHECK(table_exists(db.db(), "files")); - CHECK(table_exists(db.db(), "checkpoints")); - CHECK(table_exists(db.db(), "metadata")); - } - - fs::remove(db_path); - } - - TEST_CASE("init_bloom_schema creates tables") { - auto path = dft_utils_test::make_unique_test_path("idx_bloom"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - CHECK_NOTHROW(db.init_bloom_schema()); - - CHECK(table_exists(db.db(), "chunk_bloom_filters")); - CHECK(table_exists(db.db(), "file_bloom_filters")); - CHECK(table_exists(db.db(), "chunk_statistics")); - 
CHECK(table_exists(db.db(), "hash_resolutions")); - CHECK(table_exists(db.db(), "index_dimensions")); - } - - fs::remove(db_path); - } - - TEST_CASE("init_manifest_schema creates tables") { - auto path = dft_utils_test::make_unique_test_path("idx_manifest"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - CHECK_NOTHROW(db.init_manifest_schema()); - - CHECK(table_exists(db.db(), "checkpoint_event_ranges")); - CHECK(table_exists(db.db(), "checkpoint_metadata_lines")); - } - - fs::remove(db_path); - } - - TEST_CASE("Additive schema — bloom without base") { - auto path = dft_utils_test::make_unique_test_path("idx_bloom_only"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - CHECK_NOTHROW(db.init_bloom_schema()); - CHECK(table_exists(db.db(), "chunk_bloom_filters")); - CHECK(table_exists(db.db(), "file_bloom_filters")); - } - - fs::remove(db_path); - } - - TEST_CASE("get_or_create_file_info") { - auto path = dft_utils_test::make_unique_test_path("idx_file_info"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - - SUBCASE("Insert returns a positive id") { - int id = - db.get_or_create_file_info("/trace/foo.pfw.gz", 0xDEAD); - CHECK(id > 0); - } - - SUBCASE("Same path and hash returns same id") { - int id1 = - db.get_or_create_file_info("/trace/bar.pfw.gz", 0xBEEF); - int id2 = - db.get_or_create_file_info("/trace/bar.pfw.gz", 0xBEEF); - CHECK(id1 == id2); - } - - SUBCASE("Hash mismatch re-inserts") { - int id1 = - db.get_or_create_file_info("/trace/baz.pfw.gz", 0x1111); - int id2 = - db.get_or_create_file_info("/trace/baz.pfw.gz", 0x2222); - CHECK(id2 > 0); - // id may or may not equal id1 — SQLite can reuse rowids - (void)id1; - } - } - - fs::remove(db_path); - } - - TEST_CASE("get_file_info_id returns -1 for unknown path") { - auto path = dft_utils_test::make_unique_test_path("idx_unknown"); - auto db_path = path.string() + ".idx"; 
- - { - IndexDatabase db(db_path); - db.init_base_schema(); - - CHECK(db.get_file_info_id("/nonexistent/path.pfw.gz") == -1); - } - - fs::remove(db_path); - } - - TEST_CASE("get_file_info_id returns correct id after insert") { - auto path = dft_utils_test::make_unique_test_path("idx_lookup"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - - int inserted = - db.get_or_create_file_info("/trace/lookup.pfw.gz", 0xABCD); - int looked_up = db.get_file_info_id("/trace/lookup.pfw.gz"); - CHECK(inserted == looked_up); - } - - fs::remove(db_path); - } - - TEST_CASE("has_bloom_data returns false when no data") { - auto path = dft_utils_test::make_unique_test_path("idx_bloom_empty"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - db.init_bloom_schema(); - - CHECK_FALSE(db.has_bloom_data(1)); - } - - fs::remove(db_path); - } - - TEST_CASE("has_bloom_data returns true after insert") { - auto path = dft_utils_test::make_unique_test_path("idx_bloom_data"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - db.init_bloom_schema(); - - int file_id = - db.get_or_create_file_info("/trace/bloom.pfw.gz", 0x1234); - - const char* sql = - "INSERT INTO chunk_bloom_filters" - "(file_info_id, checkpoint_idx, dimension, bloom_data," - " num_entries)" - " VALUES(?, 0, 'name', X'DEADBEEF', 1);"; - sqlite3_stmt* stmt = nullptr; - sqlite3_prepare_v2(db.db(), sql, -1, &stmt, nullptr); - sqlite3_bind_int(stmt, 1, file_id); - REQUIRE(sqlite3_step(stmt) == SQLITE_DONE); - sqlite3_finalize(stmt); - - CHECK(db.has_bloom_data(file_id)); - CHECK_FALSE(db.has_bloom_data(file_id + 999)); - } - - fs::remove(db_path); - } - - TEST_CASE("has_manifest_data returns false when no data") { - auto path = dft_utils_test::make_unique_test_path("idx_manifest_empty"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - 
db.init_base_schema(); - db.init_manifest_schema(); - - CHECK_FALSE(db.has_manifest_data(1)); - } - - fs::remove(db_path); - } - - TEST_CASE("has_manifest_data returns true after insert") { - auto path = dft_utils_test::make_unique_test_path("idx_manifest_data"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - db.init_manifest_schema(); - - int file_id = - db.get_or_create_file_info("/trace/manifest.pfw.gz", 0x5678); - - const char* sql = - "INSERT INTO checkpoint_event_ranges" - "(file_info_id, checkpoint_idx, cat, name," - " line_numbers, event_count)" - " VALUES(?, 0, 'cat', 'ev', X'01', 1);"; - sqlite3_stmt* stmt = nullptr; - sqlite3_prepare_v2(db.db(), sql, -1, &stmt, nullptr); - sqlite3_bind_int(stmt, 1, file_id); - REQUIRE(sqlite3_step(stmt) == SQLITE_DONE); - sqlite3_finalize(stmt); - - CHECK(db.has_manifest_data(file_id)); - CHECK_FALSE(db.has_manifest_data(file_id + 999)); - } - - fs::remove(db_path); - } - - TEST_CASE("Transaction commit") { - auto path = dft_utils_test::make_unique_test_path("idx_txn"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - - CHECK_NOTHROW(db.begin_transaction()); - db.get_or_create_file_info("/trace/txn_a.pfw.gz", 0xAAAA); - db.get_or_create_file_info("/trace/txn_b.pfw.gz", 0xBBBB); - CHECK_NOTHROW(db.commit_transaction()); - - CHECK(row_count(db.db(), "files") == 2); - CHECK(db.get_file_info_id("/trace/txn_a.pfw.gz") > 0); - CHECK(db.get_file_info_id("/trace/txn_b.pfw.gz") > 0); - } - - fs::remove(db_path); - } + TEST_CASE("normalizes legacy .idx-style input to root-local .dftindex") { + auto root = dft_utils_test::make_unique_test_path("idx_root"); + fs::create_directories(root); + auto legacy_like = (root / "trace.pfw.gz.idx").string(); - TEST_CASE("Move semantics") { - auto path = dft_utils_test::make_unique_test_path("idx_move"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase original(db_path); - 
original.init_base_schema(); - int id = - original.get_or_create_file_info("/trace/move.pfw.gz", 0x9999); - - IndexDatabase moved(std::move(original)); - - REQUIRE(moved.db() != nullptr); - CHECK(moved.get_file_info_id("/trace/move.pfw.gz") == id); - } - - fs::remove(db_path); - } -} - -TEST_SUITE("IndexDatabase - Bloom wrapper methods") { - TEST_CASE("insert and query chunk bloom filter (span overload)") { - auto path = dft_utils_test::make_unique_test_path("idx_bloom_wrap"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - db.init_bloom_schema(); - - int fid = db.get_or_create_file_info("/trace/wrap.pfw.gz", 0x1234); - - std::vector blob = {0xDE, 0xAD, 0xBE, 0xEF}; - db.insert_chunk_bloom_filter(fid, 0, "name", std::span(blob), 42); - - auto results = db.query_chunk_bloom_filters(fid, "name"); - REQUIRE(results.size() == 1); - CHECK(results[0].checkpoint_idx == 0); - CHECK(results[0].bloom_data == blob); - CHECK(results[0].num_entries == 42); - } - - fs::remove(db_path); - } - - TEST_CASE("insert and query chunk bloom filter (void* overload)") { - auto path = dft_utils_test::make_unique_test_path("idx_bloom_wrap_raw"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - db.init_bloom_schema(); - - int fid = db.get_or_create_file_info("/trace/raw.pfw.gz", 0x5678); - - std::vector blob = {0xCA, 0xFE}; - db.insert_chunk_bloom_filter(fid, 1, "fhash", blob.data(), - static_cast(blob.size()), 10); - - auto results = db.query_chunk_bloom_filters(fid, "fhash"); - REQUIRE(results.size() == 1); - CHECK(results[0].checkpoint_idx == 1); - CHECK(results[0].bloom_data == blob); - CHECK(results[0].num_entries == 10); - } - - fs::remove(db_path); + IndexDatabase db(legacy_like); + CHECK(fs::exists(root / ".dftindex")); } - TEST_CASE("insert and query file bloom filter") { - auto path = - dft_utils_test::make_unique_test_path("idx_file_bloom_wrap"); - auto db_path = 
path.string() + ".idx"; + TEST_CASE("file registry is shared within one .dftindex root") { + auto root = dft_utils_test::make_unique_test_path("idx_shared"); + fs::create_directories(root); - { - IndexDatabase db(db_path); - db.init_base_schema(); - db.init_bloom_schema(); + IndexDatabase db1((root / ".dftindex").string()); + IndexDatabase db2((root / "other-name.idx").string()); - int fid = - db.get_or_create_file_info("/trace/fbloom.pfw.gz", 0xABCD); + db1.init_base_schema(); + db2.init_base_schema(); - std::vector blob = {0x11, 0x22, 0x33}; - db.insert_file_bloom_filter(fid, "name", std::span(blob), 99); + int id1 = db1.get_or_create_file_info("a.pfw.gz", 0x1111); + int id2 = db2.get_file_info_id("a.pfw.gz"); - auto result = db.query_file_bloom_filter(fid, "name"); - REQUIRE(result.has_value()); - CHECK(result->bloom_data == blob); - CHECK(result->num_entries == 99); - } - - fs::remove(db_path); - } - - TEST_CASE("insert and query hash resolution") { - auto path = dft_utils_test::make_unique_test_path("idx_hash_res_wrap"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - db.init_bloom_schema(); - - int fid = db.get_or_create_file_info("/trace/hres.pfw.gz", 0xFACE); - - db.insert_hash_resolution(fid, "fhash", "abc123", "/path/to/file"); - - auto resolved = db.query_resolved_by_hash("fhash", "abc123"); - REQUIRE(resolved.has_value()); - CHECK(resolved.value() == "/path/to/file"); - - auto not_found = db.query_resolved_by_hash("fhash", "nonexistent"); - CHECK_FALSE(not_found.has_value()); - } - - fs::remove(db_path); + CHECK(id1 > 0); + CHECK(id1 == id2); } - TEST_CASE("insert and query index dimensions") { - auto path = dft_utils_test::make_unique_test_path("idx_dim_wrap"); - auto db_path = path.string() + ".idx"; + TEST_CASE("rebuild clears per-file bloom and manifest data before reuse") { + auto root = dft_utils_test::make_unique_test_path("idx_rebuild"); + fs::create_directories(root); - { - IndexDatabase 
db(db_path); - db.init_base_schema(); - db.init_bloom_schema(); + IndexDatabase db((root / ".dftindex").string()); + db.init_base_schema(); + db.init_bloom_schema(); + db.init_manifest_schema(); - int fid = db.get_or_create_file_info("/trace/dim.pfw.gz", 0xBBBB); + const int file_id = db.get_or_create_file_info("trace.pfw.gz", 0xAAAA); - db.insert_index_dimension(fid, "name"); - db.insert_index_dimension(fid, "fhash"); + std::vector blob = {0xDE, 0xAD, 0xBE, 0xEF}; + db.insert_chunk_bloom_filter(file_id, 0, "name", std::span(blob), 4); + db.insert_file_bloom_filter(file_id, "name", std::span(blob), 4); + db.insert_index_dimension(file_id, "name"); + db.insert_hash_resolution(file_id, "fhash", "hashA", "resolvedA"); + db.insert_event_range(file_id, 0, "POSIX", "read", + std::vector{1, 2, 3}); + db.insert_metadata_lines(file_id, 0, "HH", + std::vector{0, 4}); - auto dims = db.query_index_dimensions(fid); - CHECK(dims.size() == 2); + CHECK(db.has_bloom_data(file_id)); + CHECK(db.has_manifest_data(file_id)); + CHECK(db.query_file_bloom_filter(file_id, "name").has_value()); + CHECK(db.query_resolved_by_hash("fhash", "hashA").has_value()); - CHECK(db.has_index_dimension(fid, "name")); - CHECK(db.has_index_dimension(fid, "fhash")); - CHECK_FALSE(db.has_index_dimension(fid, "nonexistent")); - } + const int rebuilt_id = + db.get_or_create_file_info("trace.pfw.gz", 0xBBBB); + CHECK(rebuilt_id == file_id); - fs::remove(db_path); + CHECK_FALSE(db.has_bloom_data(file_id)); + CHECK_FALSE(db.has_manifest_data(file_id)); + CHECK_FALSE(db.query_file_bloom_filter(file_id, "name").has_value()); + CHECK(db.query_chunk_bloom_filters(file_id, "name").empty()); + CHECK(db.query_event_ranges(file_id).empty()); + CHECK(db.query_metadata_lines(file_id).empty()); + CHECK_FALSE(db.query_resolved_by_hash("fhash", "hashA").has_value()); } - TEST_CASE("insert and query chunk statistics") { - auto path = dft_utils_test::make_unique_test_path("idx_stats_wrap"); - auto db_path = path.string() + 
".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - db.init_bloom_schema(); - - int fid = db.get_or_create_file_info("/trace/stats.pfw.gz", 0xCCCC); - - using ChunkStatistics = dftracer::utils::utilities::composites:: - dft::indexing::ChunkStatistics; - ChunkStatistics stats; - stats.total_events = 100; - stats.min_timestamp_us = 1000; - stats.max_timestamp_us = 5000; - - db.insert_chunk_statistics(fid, 0, stats); - - auto results = db.query_chunk_statistics(fid); - REQUIRE(results.size() == 1); - CHECK(results[0].checkpoint_idx == 0); - CHECK(results[0].stats.total_events == 100); - } - - fs::remove(db_path); - } - - TEST_CASE("delete operations") { - auto path = dft_utils_test::make_unique_test_path("idx_delete_wrap"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - db.init_bloom_schema(); - - int fid = db.get_or_create_file_info("/trace/del.pfw.gz", 0xDDDD); - - std::vector blob = {0x01}; - db.insert_chunk_bloom_filter(fid, 0, "name", std::span(blob), 1); - db.insert_file_bloom_filter(fid, "name", std::span(blob), 1); - db.insert_hash_resolution(fid, "name", "h1", "v1"); - - db.delete_chunk_bloom_filters(fid, "name"); - CHECK(db.query_chunk_bloom_filters(fid, "name").empty()); - - db.delete_file_bloom_filter(fid, "name"); - CHECK_FALSE(db.query_file_bloom_filter(fid, "name").has_value()); - - db.delete_hash_resolutions(fid); - CHECK_FALSE(db.query_resolved_by_hash("name", "h1").has_value()); - } - - fs::remove(db_path); - } - - TEST_CASE("string_view accepts std::string and const char*") { - auto path = dft_utils_test::make_unique_test_path("idx_sv_compat"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - db.init_bloom_schema(); - - int fid = db.get_or_create_file_info("/trace/sv.pfw.gz", 0xEEEE); - - // const char* - std::vector blob = {0x01}; - db.insert_chunk_bloom_filter(fid, 0, "name", std::span(blob), 1); - - // std::string - 
std::string dim = "fhash"; - db.insert_chunk_bloom_filter(fid, 1, dim, std::span(blob), 2); - - // std::string_view - std::string_view sv_dim = "hhash"; - db.insert_chunk_bloom_filter(fid, 2, sv_dim, std::span(blob), 3); - - CHECK(db.query_chunk_bloom_filters(fid, "name").size() == 1); - CHECK(db.query_chunk_bloom_filters(fid, dim).size() == 1); - CHECK(db.query_chunk_bloom_filters(fid, sv_dim).size() == 1); - } - - fs::remove(db_path); - } -} - -TEST_SUITE("IndexDatabase - Manifest wrapper methods") { - TEST_CASE("insert and query event ranges") { - auto path = - dft_utils_test::make_unique_test_path("idx_event_range_wrap"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - db.init_manifest_schema(); - - int fid = db.get_or_create_file_info("/trace/ev.pfw.gz", 0x1111); - - std::vector lines = {10, 20, 30}; - - // vector overload - db.insert_event_range(fid, 0, "cat1", "event1", lines); - - // span overload - std::vector lines2 = {40, 50}; - db.insert_event_range(fid, 1, "cat2", "event2", std::span(lines2)); - - auto results = db.query_event_ranges(fid); - CHECK(results.size() == 2); - - auto ckpt0 = db.query_event_ranges_for_checkpoint(fid, 0); - REQUIRE(ckpt0.size() == 1); - CHECK(ckpt0[0].cat == "cat1"); - CHECK(ckpt0[0].name == "event1"); - } - - fs::remove(db_path); - } - - TEST_CASE("insert and query metadata lines") { - auto path = - dft_utils_test::make_unique_test_path("idx_meta_lines_wrap"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - db.init_manifest_schema(); - - int fid = db.get_or_create_file_info("/trace/meta.pfw.gz", 0x2222); - - std::vector lines = {5, 15, 25}; - db.insert_metadata_lines(fid, 0, "traceEvents", lines); - - auto results = db.query_metadata_lines(fid); - REQUIRE(results.size() == 1); - CHECK(results[0].meta_type == "traceEvents"); - - auto ckpt0 = db.query_metadata_lines_for_checkpoint(fid, 0); - CHECK(ckpt0.size() == 1); - 
} - - fs::remove(db_path); - } - - TEST_CASE("delete event ranges and metadata lines") { - auto path = - dft_utils_test::make_unique_test_path("idx_manifest_del_wrap"); - auto db_path = path.string() + ".idx"; - - { - IndexDatabase db(db_path); - db.init_base_schema(); - db.init_manifest_schema(); - - int fid = db.get_or_create_file_info("/trace/mdel.pfw.gz", 0x3333); + TEST_CASE("rollback discards transactional writes") { + auto root = dft_utils_test::make_unique_test_path("idx_rollback"); + fs::create_directories(root); - std::vector lines = {1, 2, 3}; - db.insert_event_range(fid, 0, "cat", "name", lines); - db.insert_metadata_lines(fid, 0, "meta", lines); + IndexDatabase db((root / ".dftindex").string()); + db.init_base_schema(); + db.init_bloom_schema(); - db.delete_event_ranges(fid); - CHECK(db.query_event_ranges(fid).empty()); + const int file_id = db.get_or_create_file_info("trace.pfw.gz", 0xAAAA); + std::vector blob = {0xAB, 0xCD}; - db.delete_metadata_lines(fid); - CHECK(db.query_metadata_lines(fid).empty()); - } + db.begin_transaction(); + db.insert_file_bloom_filter(file_id, "name", std::span(blob), 2); + db.insert_hash_resolution(file_id, "fhash", "hashA", "resolvedA"); + db.rollback_transaction(); - fs::remove(db_path); + CHECK_FALSE(db.query_file_bloom_filter(file_id, "name").has_value()); + CHECK_FALSE(db.query_resolved_by_hash("fhash", "hashA").has_value()); } } diff --git a/tests/utilities/indexer/test_provenance_database.cpp b/tests/utilities/indexer/test_provenance_database.cpp index d5ca015f..5686b01d 100644 --- a/tests/utilities/indexer/test_provenance_database.cpp +++ b/tests/utilities/indexer/test_provenance_database.cpp @@ -1,136 +1,164 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include -#include #include #include -#include #include -#include - +namespace fs = std::filesystem; using namespace dftracer::utils::utilities::indexer; -using dftracer::utils::sqlite::SqliteStmt; - -static bool table_exists(sqlite3* db, const std::string& 
table_name) { - SqliteStmt stmt( - db, "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?;"); - stmt.bind_text(1, table_name); - return sqlite3_step(stmt) == SQLITE_ROW; -} TEST_SUITE("ProvenanceDatabase") { - TEST_CASE("Create and open database") { - auto path = - dft_utils_test::make_unique_test_path("provdb_create").string() + - ".pidx"; - CHECK_NOTHROW(ProvenanceDatabase db(path)); - CHECK(fs::exists(path)); - fs::remove(path); - } + TEST_CASE("uses the same root-local .dftindex path") { + auto root = dft_utils_test::make_unique_test_path("prov_root"); + fs::create_directories(root); - TEST_CASE("init_schema creates tables") { - auto path = - dft_utils_test::make_unique_test_path("provdb_schema").string() + - ".pidx"; - ProvenanceDatabase db(path); - CHECK_NOTHROW(db.init_schema()); - - sqlite3* raw = db.db().get(); - CHECK(table_exists(raw, "file_info")); - CHECK(table_exists(raw, "provenance_info")); - CHECK(table_exists(raw, "provenance_sources")); - CHECK(table_exists(raw, "provenance_group")); - CHECK(table_exists(raw, "provenance_segments")); - - fs::remove(path); + auto resolved = + determine_provenance_index_path((root / "trace.pfw.gz").string()); + CHECK(resolved == (root / ".dftindex").string()); + + ProvenanceDatabase db(resolved); + CHECK(fs::exists(root / ".dftindex")); } - TEST_CASE("get_or_create_file_info") { - auto path = - dft_utils_test::make_unique_test_path("provdb_file_info").string() + - ".pidx"; - ProvenanceDatabase db(path); + TEST_CASE("stores and queries provenance records in shared DB") { + auto root = dft_utils_test::make_unique_test_path("prov_records"); + fs::create_directories(root); + + ProvenanceDatabase db((root / ".dftindex").string()); db.init_schema(); - SUBCASE("insert returns valid id") { - int id = db.get_or_create_file_info("/data/trace.pfw.gz", 0xDEAD); - CHECK(id >= 1); - } - - SUBCASE("same path and hash returns same id") { - int id1 = db.get_or_create_file_info("/data/trace.pfw.gz", 0xBEEF); - int id2 = 
db.get_or_create_file_info("/data/trace.pfw.gz", 0xBEEF); - CHECK(id1 == id2); - } - - SUBCASE("same path different hash replaces row") { - int id1 = db.get_or_create_file_info("/data/other.pfw.gz", 0xAAAA); - int id2 = db.get_or_create_file_info("/data/other.pfw.gz", 0xBBBB); - CHECK(id2 >= 1); - (void)id1; - } - - SUBCASE("distinct paths get distinct ids") { - int id1 = db.get_or_create_file_info("/data/a.pfw.gz", 0x1111); - int id2 = db.get_or_create_file_info("/data/b.pfw.gz", 0x2222); - CHECK(id1 != id2); - } - - fs::remove(path); + int file_id = + db.get_or_create_file_info((root / "out.pfw.gz").string(), 0xCAFE); + CHECK(file_id > 0); + CHECK(db.get_file_info_id((root / "out.pfw.gz").string()) == file_id); + + db.insert_info(file_id, "tool", "dftracer_organize"); + db.insert_group(file_id, "group0", "cat == POSIX"); + db.insert_source(file_id, 7, "/src/a.pfw.gz", 12, "hash7"); + db.insert_segment(file_id, 7, 3, 100, 140, 9); + + auto sources = db.query_sources(file_id); + REQUIRE(sources.size() == 1); + CHECK(sources[0].source_idx == 7); + CHECK(sources[0].path == "/src/a.pfw.gz"); + CHECK(sources[0].num_checkpoints == 12); + CHECK(sources[0].event_hash == "hash7"); + + auto segments = db.query_segments(file_id, 7); + REQUIRE(segments.size() == 1); + CHECK(segments[0].source_checkpoint == 3); + CHECK(segments[0].output_line_start == 100); + CHECK(segments[0].output_line_end == 140); + CHECK(segments[0].event_count == 9); + + CHECK(db.query_info(file_id, "tool") == "dftracer_organize"); + CHECK(db.query_group_name(file_id) == "group0"); + CHECK(db.query_group_predicate(file_id) == "cat == POSIX"); } - TEST_CASE("get_file_info_id returns -1 for unknown") { - auto path = - dft_utils_test::make_unique_test_path("provdb_unknown").string() + - ".pidx"; - ProvenanceDatabase db(path); - db.init_schema(); + TEST_CASE("keeps provenance for multiple outputs in one shared root") { + auto root = dft_utils_test::make_unique_test_path("prov_multi"); + 
fs::create_directories(root); - CHECK(db.get_file_info_id("/nonexistent/path.pfw.gz") == -1); + ProvenanceDatabase db((root / ".dftindex").string()); + db.init_schema(); - fs::remove(path); + const auto out_a = (root / "io.pfw.gz").string(); + const auto out_b = (root / "compute.pfw.gz").string(); + + const int file_a = db.get_or_create_file_info(out_a, 0xA001); + const int file_b = db.get_or_create_file_info(out_b, 0xB002); + CHECK(file_a > 0); + CHECK(file_b > 0); + CHECK(file_a != file_b); + + db.begin_transaction(); + db.insert_group(file_a, "io", R"(cat == "POSIX")"); + db.insert_source(file_a, 0, "/src/trace0.pfw.gz", 3, "ha"); + db.insert_segment(file_a, 0, 1, 0, 5, 3); + + db.insert_group(file_b, "compute", R"(cat == "APP")"); + db.insert_source(file_b, 1, "/src/trace1.pfw.gz", 2, "hb"); + db.insert_segment(file_b, 1, 0, 0, 3, 1); + db.commit_transaction(); + + CHECK(db.get_file_info_id(out_a) == file_a); + CHECK(db.get_file_info_id(out_b) == file_b); + + CHECK(db.query_group_name(file_a) == "io"); + CHECK(db.query_group_name(file_b) == "compute"); + + auto segments_a = db.query_all_segments(file_a); + auto segments_b = db.query_all_segments(file_b); + REQUIRE(segments_a.size() == 1); + REQUIRE(segments_b.size() == 1); + CHECK(segments_a[0].event_count == 3); + CHECK(segments_b[0].event_count == 1); } - TEST_CASE("Transaction commit") { - auto path = - dft_utils_test::make_unique_test_path("provdb_txn").string() + - ".pidx"; - ProvenanceDatabase db(path); - db.init_schema(); - - CHECK_NOTHROW(db.begin_transaction()); - int id = db.get_or_create_file_info("/data/txn.pfw.gz", 0xCAFE); - CHECK_NOTHROW(db.commit_transaction()); + TEST_CASE("rebuild-style writes overwrite provenance for the same output") { + auto root = dft_utils_test::make_unique_test_path("prov_rebuild"); + fs::create_directories(root); - CHECK(id >= 1); - CHECK(db.get_file_info_id("/data/txn.pfw.gz") == id); + ProvenanceDatabase db((root / ".dftindex").string()); + db.init_schema(); - 
fs::remove(path); + const auto out = (root / "group.pfw.gz").string(); + + const int original_id = db.get_or_create_file_info(out, 0x1111); + db.begin_transaction(); + db.insert_info(original_id, "tool", "dftracer_organize"); + db.insert_group(original_id, "io", R"(cat == "POSIX")"); + db.insert_source(original_id, 0, "/src/trace0.pfw.gz", 4, "old"); + db.insert_segment(original_id, 0, 0, 0, 4, 2); + db.commit_transaction(); + + const int rebuilt_id = db.get_or_create_file_info(out, 0x2222); + CHECK(rebuilt_id == original_id); + + db.begin_transaction(); + db.insert_info(rebuilt_id, "tool", "dftracer_organize_v2"); + db.insert_group(rebuilt_id, "io", R"(cat == "MPI")"); + db.insert_source(rebuilt_id, 0, "/src/trace0.pfw.gz", 8, "new"); + db.insert_segment(rebuilt_id, 0, 0, 10, 18, 5); + db.commit_transaction(); + + CHECK(db.query_info(rebuilt_id, "tool") == "dftracer_organize_v2"); + CHECK(db.query_group_predicate(rebuilt_id) == R"(cat == "MPI")"); + + auto sources = db.query_sources(rebuilt_id); + REQUIRE(sources.size() == 1); + CHECK(sources[0].num_checkpoints == 8); + CHECK(sources[0].event_hash == "new"); + + auto segments = db.query_segments(rebuilt_id, 0); + REQUIRE(segments.size() == 1); + CHECK(segments[0].output_line_start == 10); + CHECK(segments[0].output_line_end == 18); + CHECK(segments[0].event_count == 5); } - TEST_CASE("determine_provenance_index_path - empty index_dir") { - SUBCASE("plain path gets .pidx suffix") { - auto result = determine_provenance_index_path("/data/trace.pfw.gz"); - CHECK(result == "/data/trace.pfw.gz.pidx"); - } + TEST_CASE("rollback discards provenance writes") { + auto root = dft_utils_test::make_unique_test_path("prov_rollback"); + fs::create_directories(root); - SUBCASE("path without extension gets .pidx suffix") { - auto result = determine_provenance_index_path("/data/trace"); - CHECK(result == "/data/trace.pidx"); - } - } + ProvenanceDatabase db((root / ".dftindex").string()); + db.init_schema(); - 
TEST_CASE("determine_provenance_index_path - with index_dir") { - SUBCASE("places filename.pidx under index_dir") { - auto result = - determine_provenance_index_path("/data/trace.pfw.gz", "/idx"); - CHECK(result == "/idx/trace.pfw.gz.pidx"); - } - - SUBCASE("nested source path uses only filename") { - auto result = determine_provenance_index_path( - "/deep/nested/dir/run.pfw.gz", "/scratch/indices"); - CHECK(result == "/scratch/indices/run.pfw.gz.pidx"); - } + const int file_id = + db.get_or_create_file_info((root / "out.pfw.gz").string(), 0xCAFE); + + db.begin_transaction(); + db.insert_info(file_id, "tool", "dftracer_organize"); + db.insert_group(file_id, "group0", "cat == POSIX"); + db.insert_source(file_id, 7, "/src/a.pfw.gz", 12, "hash7"); + db.insert_segment(file_id, 7, 3, 100, 140, 9); + db.rollback_transaction(); + + CHECK(db.query_info(file_id, "tool").empty()); + CHECK(db.query_group_name(file_id).empty()); + CHECK(db.query_group_predicate(file_id).empty()); + CHECK(db.query_sources(file_id).empty()); + CHECK(db.query_segments(file_id, 7).empty()); } } diff --git a/tests/utilities/indexer/test_rocksdb_storage.cpp b/tests/utilities/indexer/test_rocksdb_storage.cpp new file mode 100644 index 00000000..48dfeb97 --- /dev/null +++ b/tests/utilities/indexer/test_rocksdb_storage.cpp @@ -0,0 +1,224 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace fs = std::filesystem; +using dftracer::utils::rocksdb::KeyBuilder; +using dftracer::utils::rocksdb::KeyCodec; +using dftracer::utils::rocksdb::RocksDatabase; +using dftracer::utils::rocksdb::RocksDBManager; + +TEST_SUITE("RocksDBStorage") { + TEST_CASE("key codec round-trips big-endian integers") { + const std::uint32_t v32 = 0x10203040U; + const std::uint64_t v64 = 0x0102030405060708ULL; + + CHECK(KeyCodec::decode_be32(KeyCodec::encode_be32(v32)) == v32); + 
CHECK(KeyCodec::decode_be64(KeyCodec::encode_be64(v64)) == v64);
+
+        // The expected size mirrors the append order below: the 2-character
+        // tag "f|", a be32, a separator and a be64.
+        KeyBuilder builder;
+        builder.append_tag("f|").append_be32(17).append_separator().append_be64(
+            9);
+        auto built = builder.build();
+        CHECK(built.size() == 2 + 4 + 1 + 8);
+    }
+
+    // Writes one key to the default column family and one to "provenance",
+    // then reads both back through the same handle.
+    TEST_CASE("basic put/get works across column families") {
+        auto root = dft_utils_test::make_unique_test_path("rocksdb_put_get");
+        fs::create_directories(root);
+
+        RocksDatabase db((root / ".dftindex").string());
+
+        CHECK(db.is_open());
+        CHECK(db.put("hello", "world").ok());
+        CHECK(db.put("k1", "v1", "provenance").ok());
+
+        std::string value;
+        CHECK(db.get("hello", &value).ok());
+        CHECK(value == "world");
+
+        CHECK(db.get("k1", &value, "provenance").ok());
+        CHECK(value == "v1");
+    }
+
+    // Repeated get_or_open() calls for one path must hand back the same
+    // handle; a ReadOnly request against a live ReadWrite handle returns that
+    // writable handle rather than a second instance.
+    TEST_CASE("manager reuses one live instance per db path") {
+        auto root = dft_utils_test::make_unique_test_path("rocksdb_manager");
+        fs::create_directories(root);
+
+        auto path = (root / ".dftindex").string();
+        auto& manager = RocksDBManager::instance();
+
+        auto rw = manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite);
+        REQUIRE(rw != nullptr);
+        REQUIRE(rw->is_open());
+        CHECK_FALSE(rw->is_read_only());
+
+        auto rw_again =
+            manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite);
+        CHECK(rw_again == rw);
+
+        auto ro = manager.get_or_open(path, RocksDatabase::OpenMode::ReadOnly);
+        CHECK(ro == rw);
+        CHECK_FALSE(ro->is_read_only());
+    }
+
+    // After reset(path) the manager must no longer keep the instance alive,
+    // and the path must be openable again.
+    TEST_CASE("manager reset drops the cached instance for one path") {
+        auto root =
+            dft_utils_test::make_unique_test_path("rocksdb_manager_reset");
+        fs::create_directories(root);
+
+        auto path = (root / ".dftindex").string();
+        auto& manager = RocksDBManager::instance();
+
+        auto first =
+            manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite);
+        REQUIRE(first != nullptr);
+        // Observe the instance's lifetime instead of remembering its address:
+        // once the object is destroyed the allocator may hand the same
+        // address to the next instance, so an address inequality check can
+        // fail even though a fresh instance was created.
+        const std::weak_ptr first_observer(first);
+
+        manager.reset(path);
+        first.reset();
+        // Expires only if the manager released its own reference as well.
+        CHECK(first_observer.expired());
+
+        auto second =
+            manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite);
+        REQUIRE(second != nullptr);
+        CHECK(second->is_open());
+    }
+
+    // Same contract as the reset test, but via the manager-wide shutdown().
+    TEST_CASE("manager shutdown clears cached instances") {
+        auto root =
+            dft_utils_test::make_unique_test_path("rocksdb_manager_shutdown");
+        fs::create_directories(root);
+
+        auto path = (root / ".dftindex").string();
+        auto& manager = RocksDBManager::instance();
+
+        auto first =
+            manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite);
+        REQUIRE(first != nullptr);
+        // See the reset test: lifetime observation is deterministic, address
+        // comparison against a destroyed object is not.
+        const std::weak_ptr first_observer(first);
+
+        manager.shutdown();
+        first.reset();
+        CHECK(first_observer.expired());
+
+        auto second =
+            manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite);
+        REQUIRE(second != nullptr);
+        CHECK(second->is_open());
+    }
+
+    // While a manager-issued ReadOnly handle is still referenced, asking the
+    // manager for ReadWrite on the same path must throw std::runtime_error
+    // mentioning "still in use".
+    TEST_CASE("manager rejects read-only upgrade while handle is alive") {
+        auto root =
+            dft_utils_test::make_unique_test_path("rocksdb_manager_upgrade");
+        fs::create_directories(root);
+
+        auto path = (root / ".dftindex").string();
+        // Opened directly (not through the manager); presumably this creates
+        // the on-disk database so the ReadOnly open below has something to
+        // open.
+        RocksDatabase seed(path, RocksDatabase::OpenMode::ReadWrite);
+        REQUIRE(seed.is_open());
+
+        auto& manager = RocksDBManager::instance();
+        auto ro = manager.get_or_open(path, RocksDatabase::OpenMode::ReadOnly);
+        REQUIRE(ro != nullptr);
+        CHECK(ro->is_read_only());
+        CHECK_THROWS_WITH_AS(
+            manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite),
+            doctest::Contains("still in use"), std::runtime_error);
+    }
+
+    // Variant of the previous test with a second owner of the ReadOnly handle.
+    TEST_CASE("manager rejects read-only upgrade while handle is shared") {
+        auto root = dft_utils_test::make_unique_test_path(
+            "rocksdb_manager_upgrade_shared");
+        fs::create_directories(root);
+
+        auto path = (root / ".dftindex").string();
+        RocksDatabase seed(path, RocksDatabase::OpenMode::ReadWrite);
+        REQUIRE(seed.is_open());
+
+        auto& manager = RocksDBManager::instance();
+        auto ro = manager.get_or_open(path, RocksDatabase::OpenMode::ReadOnly);
+        REQUIRE(ro != nullptr);
+        CHECK(ro->is_read_only());
+
+        // Extra owner kept alive for the duration of the upgrade attempt.
+        auto ro_shared = ro;
+        CHECK_THROWS_WITH_AS(
+            manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite),
+            doctest::Contains("still in use"), std::runtime_error);
+    }
+
+    
TEST_CASE("custom filesystem supports async read polling") {
+        // Writes a 16-byte payload through the custom FileSystem, submits an
+        // asynchronous 4-byte read at offset 2 and checks that Poll() delivers
+        // the completion callback with the expected bytes.
+        auto root = dft_utils_test::make_unique_test_path("rocksdb_async_read");
+        fs::create_directories(root);
+
+        auto file_system =
+            dftracer::utils::rocksdb::make_dftracer_file_system();
+        auto test_file = (root / "async-read.bin").string();
+
+        {
+            std::unique_ptr<::rocksdb::FSWritableFile> writable;
+            REQUIRE(file_system
+                        ->NewWritableFile(test_file, ::rocksdb::FileOptions(),
+                                          &writable, nullptr)
+                        .ok());
+            const std::string payload = "abcdefghijklmnop";
+            REQUIRE(writable
+                        ->Append(::rocksdb::Slice(payload),
+                                 ::rocksdb::IOOptions(), nullptr)
+                        .ok());
+            REQUIRE(writable->Close(::rocksdb::IOOptions(), nullptr).ok());
+        }
+
+        std::unique_ptr<::rocksdb::FSRandomAccessFile> random;
+        REQUIRE(file_system
+                    ->NewRandomAccessFile(test_file, ::rocksdb::FileOptions(),
+                                          &random, nullptr)
+                    .ok());
+
+        int64_t supported_ops = 0;
+        file_system->SupportedOps(supported_ops);
+        CHECK((supported_ops & (1LL << ::rocksdb::FSSupportedOps::kAsyncIO)) !=
+              0);
+
+        // Payload-sized destination buffer; the request below asks for only
+        // 4 bytes of it.
+        std::array<char, 16> scratch{};
+        ::rocksdb::FSReadRequest request;
+        request.offset = 2;
+        request.len = 4;
+        request.scratch = scratch.data();
+
+        bool callback_called = false;
+        bool callback_status_ok = false;
+        std::string callback_result;
+        void* io_handle = nullptr;
+        ::rocksdb::IOHandleDeleter deleter;
+
+        // Releases the async handle on every exit path. A failing REQUIRE
+        // leaves the test body early, which would otherwise skip the release.
+        struct HandleReleaser {
+            void*& handle;
+            ::rocksdb::IOHandleDeleter& release;
+            ~HandleReleaser() {
+                if (handle != nullptr && release) {
+                    release(handle);
+                }
+            }
+        } handle_releaser{io_handle, deleter};
+
+        REQUIRE(
+            random
+                ->ReadAsync(
+                    request, ::rocksdb::IOOptions(),
+                    [&callback_called, &callback_status_ok, &callback_result](
+                        ::rocksdb::FSReadRequest& completed, void*) {
+                        callback_called = true;
+                        callback_status_ok = completed.status.ok();
+                        callback_result = completed.result.ToString();
+                    },
+                    nullptr, &io_handle, &deleter, nullptr)
+                .ok());
+        REQUIRE(io_handle != nullptr);
+        // A handle without a deleter could never be released.
+        REQUIRE(static_cast<bool>(deleter));
+
+        std::vector<void*> io_handles{io_handle};
+        REQUIRE(file_system->Poll(io_handles, 1).ok());
+        CHECK(callback_called);
+        CHECK(callback_status_ok);
+        // Offset 2, length 4 of "abcdefghijklmnop".
+        CHECK(callback_result == "cdef");
+    }
+}
diff --git a/tests/utilities/indexer/test_scan_prefix.cpp
b/tests/utilities/indexer/test_scan_prefix.cpp new file mode 100644 index 00000000..c5ebf575 --- /dev/null +++ b/tests/utilities/indexer/test_scan_prefix.cpp @@ -0,0 +1,123 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +using dftracer::utils::utilities::indexer::internal::IndexerError; +using dftracer::utils::utilities::indexer::internal::scan_prefix_iterator; + +namespace {
+
+/**
+ * In-memory ::rocksdb::Iterator used to drive scan_prefix_iterator() without
+ * a real database.
+ *
+ * Iterates over a fixed list of (key, value) pairs and reports a fixed
+ * status. Positioning uses linear scans that assume the entries are sorted
+ * ascending by key. The position entries_.size() means "not positioned".
+ */
+class FakeIterator final : public ::rocksdb::Iterator {
+   public:
+    /**
+     * @param entries Key/value pairs to expose, in ascending key order.
+     * @param status  Status returned by status(); a non-OK status also makes
+     *                Valid() return false.
+     */
+    explicit FakeIterator(
+        std::vector<std::pair<std::string, std::string>> entries,
+        ::rocksdb::Status status = ::rocksdb::Status::OK())
+        : entries_(std::move(entries)), status_(std::move(status)) {}
+
+    /** True when positioned on an entry and the configured status is OK. */
+    bool Valid() const override {
+        return index_ < entries_.size() && status_.ok();
+    }
+
+    void SeekToFirst() override {
+        index_ = entries_.empty() ? entries_.size() : 0;
+    }
+
+    void SeekToLast() override {
+        index_ = entries_.empty() ? entries_.size() : entries_.size() - 1;
+    }
+
+    /** Positions on the first entry whose key is >= target, if any. */
+    void Seek(const ::rocksdb::Slice& target) override {
+        const auto key = target.ToString();
+        index_ = 0;
+        while (index_ < entries_.size() && entries_[index_].first < key) {
+            ++index_;
+        }
+    }
+
+    /** Positions on the last entry whose key is <= target, if any. */
+    void SeekForPrev(const ::rocksdb::Slice& target) override {
+        const auto key = target.ToString();
+        // Walk back over every entry greater than the target; afterwards
+        // `index_` is the number of entries with key <= target.
+        index_ = entries_.size();
+        while (index_ > 0 && entries_[index_ - 1].first > key) {
+            --index_;
+        }
+        // No such entry: become invalid rather than staying on entry 0,
+        // whose key is greater than the target.
+        index_ = (index_ > 0) ? index_ - 1 : entries_.size();
+    }
+
+    void Next() override {
+        if (index_ < entries_.size()) {
+            ++index_;
+        }
+    }
+
+    /** Steps back one entry; stepping before the first entry invalidates. */
+    void Prev() override {
+        if (index_ == 0 || entries_.empty()) {
+            index_ = entries_.size();
+            return;
+        }
+        --index_;
+    }
+
+    // key()/value() perform no bounds check; call them only while Valid().
+    ::rocksdb::Slice key() const override { return entries_[index_].first; }
+
+    ::rocksdb::Slice value() const override { return entries_[index_].second; }
+
+    ::rocksdb::Status status() const override { return status_; }
+
+   private:
+    std::vector<std::pair<std::string, std::string>> entries_;
+    std::size_t index_ = 0;
+    ::rocksdb::Status status_;
+};
+
+}  // namespace
+
+TEST_SUITE("ScanPrefix") {
+    // Entries before the prefix are skipped, matching entries are visited in
+    // order, and the scan ends at the first key that no longer matches.
+    TEST_CASE("iterates matching prefix entries and stops at the first miss") {
+        std::vector<std::string> seen;
+        scan_prefix_iterator(
+            "scan failed", "ab|",
+            [] {
+                return std::make_unique<FakeIterator>(
+                    std::vector<std::pair<std::string, std::string>>{
+                        {"aa|0", "skip"},
+                        {"ab|0", "v0"},
+                        {"ab|1", "v1"},
+                        {"ac|0", "stop"},
+                    });
+            },
+            [&](::rocksdb::Iterator& it) {
+                seen.push_back(it.key().ToString());
+            });
+
+        REQUIRE(seen.size() == 2);
+        CHECK(seen[0] == "ab|0");
+        CHECK(seen[1] == "ab|1");
+    }
+
+    // An iterator that reports a non-OK status must surface as IndexerError.
+    TEST_CASE("throws IndexerError when iterator status is non-ok") {
+        CHECK_THROWS_AS(
+            scan_prefix_iterator(
+                "scan failed", "ab|",
+                [] {
+                    return std::make_unique<FakeIterator>(
+                        std::vector<std::pair<std::string, std::string>>{
+                            {"ab|0", "v0"},
+                        },
+                        ::rocksdb::Status::IOError(
+                            "synthetic iterator failure"));
+                },
+                [](::rocksdb::Iterator&) {}),
+            IndexerError);
+    }
+}
diff --git a/tests/utilities/reader/test_trace_reader.cpp b/tests/utilities/reader/test_trace_reader.cpp index 90e9f58b..e1a1ed0d 100644 --- a/tests/utilities/reader/test_trace_reader.cpp +++ b/tests/utilities/reader/test_trace_reader.cpp @@ -1,6 +1,7 @@ #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include #include +#include #include #include #include @@ -16,6 +17,7 @@ using namespace dftracer::utils::utilities::reader; using namespace dftracer::utils::utilities::indexer::internal; using namespace dftracer::utils::coro; +using namespace dftracer::utils::utilities::composites::dft::internal; using namespace dft_utils_test; namespace { @@ -88,13 +90,13 @@ TEST_SUITE("TraceReader") { TestEnvironment env(100); std::string gz_file = env.create_dft_test_gzip_file(100); std::string index_dir = env.get_dir(); - std::string idx_path = env.get_index_path(gz_file); + std::string index_path = env.get_index_path(gz_file); - auto indexer = - IndexerFactory::create(gz_file, idx_path, 32 * 1024 * 1024, false); + auto indexer = IndexerFactory::create(gz_file, index_path, + 32 * 1024 * 1024, false); REQUIRE(indexer != nullptr); indexer->build(); - REQUIRE(fs::exists(idx_path)); + 
REQUIRE(fs::exists(determine_index_path(gz_file, index_dir))); TraceReader reader({.file_path = gz_file, .index_dir = index_dir}); @@ -106,7 +108,7 @@ TEST_SUITE("TraceReader") { SUBCASE("Indexed and unindexed counts match") { // Remove the index and re-read to compare. - fs::remove(idx_path); + fs::remove_all(determine_index_path(gz_file, index_dir)); TraceReader plain_reader({.file_path = gz_file}); CHECK_FALSE(plain_reader.has_index()); auto n_plain = count_lines(plain_reader.read_lines()).get(); @@ -208,13 +210,13 @@ TEST_SUITE("TraceReader") { TestEnvironment env(100); std::string gz_file = env.create_dft_test_gzip_file(100); std::string index_dir = env.get_dir(); - std::string idx_path = env.get_index_path(gz_file); + std::string index_path = env.get_index_path(gz_file); - auto indexer = - IndexerFactory::create(gz_file, idx_path, 32 * 1024 * 1024, false); + auto indexer = IndexerFactory::create(gz_file, index_path, + 32 * 1024 * 1024, false); REQUIRE(indexer != nullptr); indexer->build(); - REQUIRE(fs::exists(idx_path)); + REQUIRE(fs::exists(determine_index_path(gz_file, index_dir))); TraceReader reader({.file_path = gz_file, .index_dir = index_dir}); CHECK(reader.has_index()); @@ -289,7 +291,7 @@ TEST_SUITE("TraceReader") { TestEnvironment env(100); std::string gz_file = env.create_dft_test_gzip_file(100); std::string index_dir = env.get_dir(); - std::string idx_path = env.get_index_path(gz_file); + std::string index_path = env.get_index_path(gz_file); TraceReader plain_reader({.file_path = gz_file}); CHECK_FALSE(plain_reader.has_index()); @@ -299,8 +301,8 @@ TEST_SUITE("TraceReader") { auto plain_chunks = count_raw_chunks(plain_reader.read_raw(single_line)).get(); - auto indexer = - IndexerFactory::create(gz_file, idx_path, 32 * 1024 * 1024, false); + auto indexer = IndexerFactory::create(gz_file, index_path, + 32 * 1024 * 1024, false); REQUIRE(indexer != nullptr); indexer->build(); @@ -438,13 +440,13 @@ TEST_SUITE("TraceReader") { TestEnvironment 
env(100); std::string gz_file = env.create_dft_test_gzip_file(100); std::string index_dir = env.get_dir(); - std::string idx_path = env.get_index_path(gz_file); + std::string index_path = env.get_index_path(gz_file); - auto indexer = - IndexerFactory::create(gz_file, idx_path, 32 * 1024 * 1024, false); + auto indexer = IndexerFactory::create(gz_file, index_path, + 32 * 1024 * 1024, false); REQUIRE(indexer != nullptr); indexer->build(); - REQUIRE(fs::exists(idx_path)); + REQUIRE(fs::exists(determine_index_path(gz_file, index_dir))); TraceReader reader({.file_path = gz_file, .index_dir = index_dir}); CHECK(reader.has_index());