From 53f0308f898ef14d075f4097e7bf13c91049c2a6 Mon Sep 17 00:00:00 2001 From: xiaoheng Date: Fri, 27 Feb 2026 16:24:33 +0800 Subject: [PATCH 1/6] feat: support fixed length range bitmap file index read and write --- src/paimon/CMakeLists.txt | 1 + src/paimon/common/file_index/CMakeLists.txt | 10 +- .../file_index/file_index_format_test.cpp | 49 +- .../rangebitmap/bit_slice_index_bitmap.cpp | 276 ++++++++++ .../rangebitmap/bit_slice_index_bitmap.h | 82 +++ .../file_index/rangebitmap/dictionary/chunk.h | 88 ++++ .../dictionary/chunked_dictionary.cpp | 239 +++++++++ .../dictionary/chunked_dictionary.h | 96 ++++ .../rangebitmap/dictionary/dictionary.h | 45 ++ .../dictionary/fixed_length_chunk.cpp | 127 +++++ .../dictionary/fixed_length_chunk.h | 92 ++++ .../rangebitmap/dictionary/key_factory.cpp | 122 +++++ .../rangebitmap/dictionary/key_factory.h | 179 +++++++ .../file_index/rangebitmap/range_bitmap.cpp | 302 +++++++++++ .../file_index/rangebitmap/range_bitmap.h | 102 ++++ .../rangebitmap/range_bitmap_file_index.cpp | 226 ++++++++ .../rangebitmap/range_bitmap_file_index.h | 107 ++++ .../range_bitmap_file_index_factory.cpp | 32 ++ .../range_bitmap_file_index_factory.h | 41 ++ .../range_bitmap_file_index_test.cpp | 482 ++++++++++++++++++ .../utils/literal_serialization_utils.cpp | 195 +++++++ .../utils/literal_serialization_utils.h | 48 ++ src/paimon/common/io/data_input_stream.cpp | 1 + test/inte/read_inte_with_index_test.cpp | 379 ++++++++++++++ .../append_with_rangebitmap/README | 16 + ...5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc | Bin 0 -> 1024 bytes ...3d-17fc-4031-b5bb-5e22b02fdb3b-0.orc.index | Bin 0 -> 1288 bytes ...est-642b5c6e-a8d6-46d0-bd3b-686478c89f6b-0 | Bin 0 -> 2175 bytes ...ist-64353b80-fb7c-470e-972a-07d0717af717-0 | Bin 0 -> 1006 bytes ...ist-64353b80-fb7c-470e-972a-07d0717af717-1 | Bin 0 -> 1113 bytes .../append_with_rangebitmap/schema/schema-0 | 39 ++ .../append_with_rangebitmap/snapshot/EARLIEST | 1 + .../append_with_rangebitmap/snapshot/LATEST | 1 
+ .../snapshot/snapshot-1 | 16 + .../README | 17 + ...b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc | Bin 0 -> 1024 bytes ...1e-e3d2-4f08-9a36-726a96cde1be-0.orc.index | Bin 0 -> 1413 bytes ...est-71437504-f8ad-4b7d-be04-28203480227d-0 | Bin 0 -> 2176 bytes ...ist-72b69947-13fc-4b20-8db2-ffa7d607b38f-0 | Bin 0 -> 1006 bytes ...ist-72b69947-13fc-4b20-8db2-ffa7d607b38f-1 | Bin 0 -> 1106 bytes .../schema/schema-0 | 44 ++ .../snapshot/EARLIEST | 1 + .../snapshot/LATEST | 1 + .../snapshot/snapshot-1 | 16 + .../append_with_rangebitmap/README | 16 + ...52e2-e4b5-4807-bf92-04401ed10560-0.parquet | Bin 0 -> 1397 bytes ...4b5-4807-bf92-04401ed10560-0.parquet.index | Bin 0 -> 1288 bytes ...est-75f07296-b729-48db-aadd-17826f0aadf9-0 | Bin 0 -> 2180 bytes ...ist-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-0 | Bin 0 -> 1006 bytes ...ist-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-1 | Bin 0 -> 1108 bytes .../append_with_rangebitmap/schema/schema-0 | 38 ++ .../append_with_rangebitmap/snapshot/EARLIEST | 1 + .../append_with_rangebitmap/snapshot/LATEST | 1 + .../snapshot/snapshot-1 | 16 + .../README | 17 + ...3af1-afcb-4b84-b69a-ae472ba517f2-0.parquet | Bin 0 -> 1397 bytes ...fcb-4b84-b69a-ae472ba517f2-0.parquet.index | Bin 0 -> 1413 bytes ...est-90122746-0b4c-4328-8a04-576ee6b4cb83-0 | Bin 0 -> 2180 bytes ...ist-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-0 | Bin 0 -> 1006 bytes ...ist-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-1 | Bin 0 -> 1108 bytes .../schema/schema-0 | 43 ++ .../snapshot/EARLIEST | 1 + .../snapshot/LATEST | 1 + .../snapshot/snapshot-1 | 16 + 64 files changed, 3620 insertions(+), 3 deletions(-) create mode 100644 src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp create mode 100644 src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h create mode 100644 src/paimon/common/file_index/rangebitmap/dictionary/chunk.h create mode 100644 src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.cpp create mode 100644 
src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h create mode 100644 src/paimon/common/file_index/rangebitmap/dictionary/dictionary.h create mode 100644 src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.cpp create mode 100644 src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h create mode 100644 src/paimon/common/file_index/rangebitmap/dictionary/key_factory.cpp create mode 100644 src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h create mode 100644 src/paimon/common/file_index/rangebitmap/range_bitmap.cpp create mode 100644 src/paimon/common/file_index/rangebitmap/range_bitmap.h create mode 100644 src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp create mode 100644 src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h create mode 100644 src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.cpp create mode 100644 src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h create mode 100644 src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_test.cpp create mode 100644 src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.cpp create mode 100644 src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h create mode 100644 test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/README create mode 100644 test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc create mode 100644 test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc.index create mode 100644 test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-642b5c6e-a8d6-46d0-bd3b-686478c89f6b-0 create mode 100644 
test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-0 create mode 100644 test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-1 create mode 100644 test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 create mode 100644 test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST create mode 100644 test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST create mode 100644 test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 create mode 100644 test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README create mode 100644 test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc create mode 100644 test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc.index create mode 100644 test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-71437504-f8ad-4b7d-be04-28203480227d-0 create mode 100644 test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-0 create mode 100644 test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-1 create mode 100644 test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 create mode 100644 test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST create mode 100644 
test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST create mode 100644 test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 create mode 100644 test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/README create mode 100644 test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet create mode 100644 test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet.index create mode 100644 test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-75f07296-b729-48db-aadd-17826f0aadf9-0 create mode 100644 test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-0 create mode 100644 test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-1 create mode 100644 test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 create mode 100644 test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST create mode 100644 test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST create mode 100644 test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 create mode 100644 test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README create mode 100644 test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet create mode 100644 
test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet.index create mode 100644 test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-90122746-0b4c-4328-8a04-576ee6b4cb83-0 create mode 100644 test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-0 create mode 100644 test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-1 create mode 100644 test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 create mode 100644 test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST create mode 100644 test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST create mode 100644 test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 diff --git a/src/paimon/CMakeLists.txt b/src/paimon/CMakeLists.txt index ae61f4ae..08bee077 100644 --- a/src/paimon/CMakeLists.txt +++ b/src/paimon/CMakeLists.txt @@ -356,6 +356,7 @@ if(PAIMON_BUILD_TESTS) common/file_index/bsi/bit_slice_index_roaring_bitmap_test.cpp common/file_index/bloomfilter/bloom_filter_file_index_test.cpp common/file_index/bloomfilter/fast_hash_test.cpp + common/file_index/rangebitmap/range_bitmap_file_index_test.cpp common/global_index/complete_index_score_batch_reader_test.cpp common/global_index/global_index_result_test.cpp common/global_index/global_indexer_factory_test.cpp diff --git a/src/paimon/common/file_index/CMakeLists.txt b/src/paimon/common/file_index/CMakeLists.txt index 085b2e45..2daf7195 100644 --- 
a/src/paimon/common/file_index/CMakeLists.txt +++ b/src/paimon/common/file_index/CMakeLists.txt @@ -23,7 +23,15 @@ set(PAIMON_FILE_INDEX_SRC bsi/bit_slice_index_roaring_bitmap.cpp bloomfilter/bloom_filter_file_index.cpp bloomfilter/bloom_filter_file_index_factory.cpp - bloomfilter/fast_hash.cpp) + bloomfilter/fast_hash.cpp + rangebitmap/range_bitmap_file_index.cpp + rangebitmap/range_bitmap_file_index_factory.cpp + rangebitmap/range_bitmap.cpp + rangebitmap/bit_slice_index_bitmap.cpp + rangebitmap/dictionary/chunked_dictionary.cpp + rangebitmap/dictionary/fixed_length_chunk.cpp + rangebitmap/dictionary/key_factory.cpp + rangebitmap/utils/literal_serialization_utils.cpp) add_paimon_lib(paimon_file_index SOURCES diff --git a/src/paimon/common/file_index/file_index_format_test.cpp b/src/paimon/common/file_index/file_index_format_test.cpp index 2e000324..3eeca392 100644 --- a/src/paimon/common/file_index/file_index_format_test.cpp +++ b/src/paimon/common/file_index/file_index_format_test.cpp @@ -22,9 +22,11 @@ #include "paimon/common/file_index/bloomfilter/bloom_filter_file_index.h" #include "paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index.h" #include "paimon/common/file_index/empty/empty_file_index_reader.h" +#include "paimon/common/file_index/rangebitmap/range_bitmap.h" +#include "paimon/common/file_index/rangebitmap/range_bitmap_file_index.h" #include "paimon/data/timestamp.h" -#include "paimon/defs.h" #include "paimon/file_index/file_index_result.h" +#include "paimon/file_index/file_indexer_factory.h" #include "paimon/fs/local/local_file_system.h" #include "paimon/io/byte_array_input_stream.h" #include "paimon/memory/memory_pool.h" @@ -149,6 +151,50 @@ TEST_F(FileIndexFormatTest, TestSimple) { } } +// index file generated by paimon Java implementation +// type: int32 +// data: 17,3,5,7,9,null,null,10 +TEST_F(FileIndexFormatTest, TestRangeBitmapCompatibleWithJava) { + const auto schema = arrow::schema({arrow::field("data", arrow::int32())}); + const 
auto index_file_bytes = + std::make_unique>(std::initializer_list{ + 0, 5, 78, 78, 208, 26, 53, 174, 0, 0, 0, 1, 0, 0, 0, 56, 0, 0, 0, + 1, 0, 4, 100, 97, 116, 97, 0, 0, 0, 1, 0, 12, 114, 97, 110, 103, 101, 45, + 98, 105, 116, 109, 97, 112, 0, 0, 0, 56, 0, 0, 0, 210, 0, 0, 0, 0, 0, + 0, 0, 21, 1, 0, 0, 0, 8, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, + 17, 0, 0, 0, 66, 0, 0, 0, 13, 1, 0, 0, 0, 1, 0, 0, 0, 4, 0, + 0, 0, 25, 0, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 5, 0, 0, 0, 20, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, + 0, 7, 0, 0, 0, 9, 0, 0, 0, 10, 0, 0, 0, 17, 0, 0, 0, 34, 1, + 3, 0, 0, 0, 19, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 22, 0, 0, + 0, 22, 0, 0, 0, 20, 0, 0, 0, 42, 0, 0, 0, 20, 59, 48, 0, 0, 1, + 0, 0, 5, 0, 2, 0, 0, 0, 4, 0, 7, 0, 0, 0, 58, 48, 0, 0, 1, + 0, 0, 0, 0, 0, 2, 0, 16, 0, 0, 0, 0, 0, 2, 0, 4, 0, 58, 48, + 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 16, 0, 0, 0, 3, 0, 4, 0, 58, + 48, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 16, 0, 0, 0, 0, 0, 7, 0, + }); + const auto input_stream = std::make_shared( + reinterpret_cast(index_file_bytes->data()), index_file_bytes->size()); + ASSERT_OK_AND_ASSIGN(const auto reader, FileIndexFormat::CreateReader(input_stream, pool_)); + ASSERT_OK_AND_ASSIGN(const auto index_file_readers, + reader->ReadColumnIndex("data", CreateArrowSchema(schema).get())); + ASSERT_EQ(1, index_file_readers.size()); + auto* range_bitmap_reader = + dynamic_cast(index_file_readers[0].get()); + ASSERT_TRUE(range_bitmap_reader); + + ASSERT_OK_AND_ASSIGN(const auto eq_result, range_bitmap_reader->VisitEqual(Literal(3))); + ASSERT_TRUE(eq_result); + ASSERT_EQ(eq_result->ToString(), "{1}"); + + ASSERT_OK_AND_ASSIGN(const auto lt_result, range_bitmap_reader->VisitLessThan(Literal(10))); + ASSERT_TRUE(lt_result); + ASSERT_EQ(lt_result->ToString(), "{1,2,3,4}"); + + ASSERT_OK_AND_ASSIGN(const auto gt_result, range_bitmap_reader->VisitIsNull()); + ASSERT_EQ(gt_result->ToString(), "{5,6}"); +} + // NOLINTNEXTLINE(google-readability-function-size) 
TEST_F(FileIndexFormatTest, TestBitmapIndexWithTimestamp) { auto schema = arrow::schema({ @@ -816,5 +862,4 @@ TEST_F(FileIndexFormatTest, TestBitmapIndexWithTimestamp) { check_nano("ts_nano"); check_nano("ts_tz_nano"); } - } // namespace paimon::test diff --git a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp new file mode 100644 index 00000000..3005100b --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp @@ -0,0 +1,276 @@ +/* + * Copyright 2024-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h" + +#include + +#include +#include +#include + +#include "paimon/common/io/memory_segment_output_stream.h" +#include "paimon/common/memory/memory_segment_utils.h" +#include "paimon/io/data_input_stream.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon { + +Result> BitSliceIndexBitmap::Create( + const std::shared_ptr& pool, const std::shared_ptr& input_stream, + const int32_t offset) { + const auto data_in = std::make_unique(input_stream); + PAIMON_RETURN_NOT_OK(data_in->Seek(offset)); + PAIMON_ASSIGN_OR_RAISE(const auto header_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(const auto version, data_in->ReadValue()); + if (version != CURRENT_VERSION) { + return Status::Invalid("Unknown BitSliceBitmap Version"); + } + PAIMON_ASSIGN_OR_RAISE(const auto slices_size, data_in->ReadValue()); + auto slices = std::vector>(); + slices.resize(slices_size); + PAIMON_ASSIGN_OR_RAISE(const auto ebm_size, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(const auto indexes_length, data_in->ReadValue()); + auto indexes = Bytes::AllocateBytes(indexes_length, pool.get()); + PAIMON_RETURN_NOT_OK(data_in->Read(indexes->data(), indexes_length)); + auto body_offset = offset + sizeof(int32_t) + header_length; + return std::make_unique(pool, indexes_length, std::move(indexes), ebm_size, + slices_size, input_stream, body_offset); +} + +static int32_t NumberOfLeadingZeros(const int64_t value) { + if (value == 0) { + return 64; + } + return __builtin_clzll(static_cast(value)); +} + +static int32_t NumberOfTrailingZeros(const int64_t value) { + if (value == 0) { + return 64; + } + return __builtin_ctzll(static_cast(value)); +} + +BitSliceIndexBitmap::BitSliceIndexBitmap(const std::shared_ptr& pool, + const int32_t indexes_length, + PAIMON_UNIQUE_PTR indexes, const int32_t ebm_length, + const int32_t slices_size, + const std::shared_ptr& input_stream, + const int32_t body_offset) 
+ : pool_(pool), + initialized_(false), + bit_slices_(std::vector>(slices_size, {std::nullopt})), + ebm({std::nullopt}), + input_stream_(input_stream), + body_offset_(body_offset), + indexes_(std::move(indexes)), + ebm_length_(ebm_length), + indexes_length_(indexes_length) {} + +Result BitSliceIndexBitmap::GetEmptyBitmap() { + if (!ebm.has_value()) { + PAIMON_RETURN_NOT_OK(input_stream_->Seek(body_offset_, FS_SEEK_SET)); + const auto bytes = Bytes::AllocateBytes(ebm_length_, pool_.get()); + PAIMON_RETURN_NOT_OK(input_stream_->Read(bytes->data(), ebm_length_)); + RoaringBitmap32 bitmap; + PAIMON_RETURN_NOT_OK(bitmap.Deserialize(bytes->data(), ebm_length_)); + ebm = bitmap; + } + return &ebm.value(); +} + +Result BitSliceIndexBitmap::GetSliceBitmap(const int32_t idx) { + if (!bit_slices_[idx].has_value()) { + const auto data_in = std::make_unique( + std::make_shared(indexes_->data(), indexes_length_)); + const int position = static_cast(2 * sizeof(int32_t) * idx); + PAIMON_RETURN_NOT_OK(data_in->Seek(position)); + PAIMON_ASSIGN_OR_RAISE(const auto offset, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(const auto length, data_in->ReadValue()); + PAIMON_RETURN_NOT_OK(input_stream_->Seek(body_offset_ + ebm_length_ + offset, FS_SEEK_SET)); + RoaringBitmap32 bitmap; + const auto bytes = Bytes::AllocateBytes(length, pool_.get()); + PAIMON_RETURN_NOT_OK(input_stream_->Read(bytes->data(), length)); + PAIMON_RETURN_NOT_OK(bitmap.Deserialize(bytes->data(), length)); + bit_slices_[idx] = bitmap; + } + return &bit_slices_[idx].value(); +} + +/// Batch load slices from start to end to avoid unnecessary IO +Status BitSliceIndexBitmap::LoadSlices(const int32_t start, const int32_t end) { + if (initialized_) { + return Status::OK(); + } + auto indexes_stream = std::make_shared(indexes_->data(), indexes_length_); + const auto data_in = std::make_unique(indexes_stream); + const auto position = static_cast(2 * sizeof(int32_t) * start); + PAIMON_RETURN_NOT_OK(data_in->Seek(position)); 
+ PAIMON_ASSIGN_OR_RAISE(const auto offset, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(auto length, data_in->ReadValue()); + std::vector lengths(end); + lengths[start] = length; + + for (int32_t i = start + 1; i < end; ++i) { + PAIMON_RETURN_NOT_OK(data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(const auto slice_length, data_in->ReadValue()); + lengths[i] = slice_length; + length += slice_length; + } + PAIMON_RETURN_NOT_OK(input_stream_->Seek(body_offset_ + ebm_length_ + offset, FS_SEEK_SET)); + const auto bytes = Bytes::AllocateBytes(length, pool_.get()); + PAIMON_RETURN_NOT_OK(input_stream_->Read(bytes->data(), length)); + int32_t byte_position = 0; + for (int32_t i = start; i < end; ++i) { + const int32_t slice_length = lengths[i]; + RoaringBitmap32 bitmap; + PAIMON_RETURN_NOT_OK(bitmap.Deserialize(bytes->data() + byte_position, slice_length)); + bit_slices_[i] = std::move(bitmap); + byte_position += slice_length; + } + initialized_ = true; + return Status::OK(); +} + +Result BitSliceIndexBitmap::Eq(const int32_t code) { + PAIMON_ASSIGN_OR_RAISE(const auto empty_bitmap, GetEmptyBitmap()); + auto equal = RoaringBitmap32(*empty_bitmap); + for (int32_t i = static_cast(bit_slices_.size()) - 1; i >= 0; --i) { + PAIMON_ASSIGN_OR_RAISE(const auto slice_bitmap, GetSliceBitmap(i)); + if ((code >> i & 1) == 1) { + equal &= *slice_bitmap; + } else { + equal -= *slice_bitmap; + } + } + return equal; +} + +Result BitSliceIndexBitmap::Gt(const int32_t code) { + if (code < 0) { + return IsNotNull({}); + } + PAIMON_ASSIGN_OR_RAISE(const auto found_set, IsNotNull({})); + if (found_set.IsEmpty()) { + return RoaringBitmap32(); + } + auto state = RoaringBitmap32{}; + auto state_inited = false; + const auto start = NumberOfTrailingZeros(~code); + PAIMON_RETURN_NOT_OK(LoadSlices(start, static_cast(bit_slices_.size()))); + for (int i = start; i < static_cast(bit_slices_.size()); ++i) { + if (!state_inited) { + PAIMON_ASSIGN_OR_RAISE(const auto slice_ptr, GetSliceBitmap(i)); + 
state = *slice_ptr; + state_inited = true; + continue; + } + const auto bit = code >> i & 1; + PAIMON_ASSIGN_OR_RAISE(const auto slice_ptr, GetSliceBitmap(i)); + if (bit == 1) { + state &= *slice_ptr; + } else { + state |= *slice_ptr; + } + } + if (!state_inited) { + return RoaringBitmap32(); + } + return state &= found_set; +} + +Result BitSliceIndexBitmap::Gte(const int32_t code) { + return Gt(code - 1); +} + +Result BitSliceIndexBitmap::IsNotNull(const RoaringBitmap32& found_set) { + if (!ebm.has_value()) { + PAIMON_RETURN_NOT_OK(input_stream_->Seek(body_offset_, FS_SEEK_SET)); + const auto bytes = Bytes::AllocateBytes(ebm_length_, pool_.get()); + PAIMON_RETURN_NOT_OK(input_stream_->Read(bytes->data(), ebm_length_)); + RoaringBitmap32 bitmap; + PAIMON_RETURN_NOT_OK(bitmap.Deserialize(bytes->data(), ebm_length_)); + ebm = bitmap; + } + return found_set.IsEmpty() ? ebm.value() : RoaringBitmap32::And(ebm.value(), found_set); +} + +BitSliceIndexBitmap::Appender::Appender(const std::shared_ptr& pool, const int32_t min, + const int32_t max) + : pool_(pool), min_(min), max_(max) { + ebm_ = RoaringBitmap32{}; + const auto slices_size = std::max(64 - NumberOfLeadingZeros(max), 1); + slices_.resize(slices_size); +} + +Status BitSliceIndexBitmap::Appender::Append(const int32_t key, const int32_t value) { + if (key < 0) { + return Status::Invalid(fmt::format("Invalid key: {}", key)); + } + if (value < min_ || value > max_) { + return Status::Invalid(fmt::format("value not in range [{}, {}]", min_, max_)); + } + int bits = value; + while (bits != 0) { + slices_[NumberOfTrailingZeros(bits)].Add(key); + bits &= (bits - 1); + } + ebm_.Add(key); + return Status::OK(); +} + +Result> BitSliceIndexBitmap::Appender::Serialize() const { + const auto indexes_length = static_cast(2 * sizeof(int32_t) * slices_.size()); + const auto ebm_bytes = ebm_.Serialize(pool_.get()); + const auto ebm_length = static_cast(ebm_bytes->size()); + int32_t header_size = 0; + header_size += 
sizeof(int8_t); // version + header_size += sizeof(int8_t); // slices size + header_size += sizeof(int32_t); // ebm length + header_size += sizeof(int32_t); // indexes length + header_size += indexes_length; + int32_t offset = 0; + const auto data_output_stream = std::make_unique( + MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); + auto slices_bytes_vector = std::vector>{}; + auto indexes_vector = std::vector>{}; + for (const auto& slice : slices_) { + auto slice_bytes = slice.Serialize(pool_.get()); + const auto length = static_cast(slice_bytes->size()); + indexes_vector.emplace_back(offset, length); + offset += length; + slices_bytes_vector.emplace_back(std::move(slice_bytes)); + } + data_output_stream->WriteValue(header_size); + data_output_stream->WriteValue(CURRENT_VERSION); + data_output_stream->WriteValue(static_cast(slices_.size())); + data_output_stream->WriteValue(ebm_length); + data_output_stream->WriteValue(indexes_length); + for (const auto& [slice_offset, length] : indexes_vector) { + data_output_stream->WriteValue(slice_offset); + data_output_stream->WriteValue(length); + } + data_output_stream->Write(ebm_bytes->data(), ebm_length); + for (const auto& slice_bytes : slices_bytes_vector) { + data_output_stream->Write(slice_bytes->data(), slice_bytes->size()); + } + return MemorySegmentUtils::CopyToBytes(data_output_stream->Segments(), 0, + static_cast(data_output_stream->CurrentSize()), + pool_.get()); +} +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h new file mode 100644 index 00000000..e14679b5 --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h @@ -0,0 +1,82 @@ +/* + * Copyright 2024-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "paimon/io/byte_array_input_stream.h" +#include "paimon/memory/bytes.h" +#include "paimon/result.h" +#include "paimon/status.h" +#include "paimon/utils/roaring_bitmap32.h" + +namespace paimon { + +class BitSliceIndexBitmap { + public: + static constexpr int CURRENT_VERSION = 1; + static Result> Create( + const std::shared_ptr& pool, const std::shared_ptr& input_stream, + int32_t offset); + + BitSliceIndexBitmap(const std::shared_ptr& pool, int32_t indexes_length, + PAIMON_UNIQUE_PTR indexes, int32_t ebm_length, int32_t slices_size, + const std::shared_ptr& input_stream, int32_t body_offset); + + Result GetEmptyBitmap(); + + Result GetSliceBitmap(int32_t idx); + + Status LoadSlices(int32_t start, int32_t end); + + Result Eq(int32_t code); + + Result Gt(int32_t code); + + Result Gte(int32_t code); + + Result IsNotNull(const RoaringBitmap32& found_set); + + class Appender { + public: + Appender(const std::shared_ptr& pool, int32_t min, int32_t max); + Status Append(int32_t key, int32_t value); + Result> Serialize() const; + + private: + std::shared_ptr pool_; + int32_t min_; + int32_t max_; + RoaringBitmap32 ebm_; + std::vector slices_; + }; + + private: + std::shared_ptr pool_; + bool initialized_; + std::vector> bit_slices_; + std::optional ebm; + std::shared_ptr input_stream_; + int32_t body_offset_; + PAIMON_UNIQUE_PTR indexes_; + int32_t ebm_length_; + int32_t indexes_length_; +}; + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/chunk.h 
b/src/paimon/common/file_index/rangebitmap/dictionary/chunk.h
new file mode 100644
index 00000000..c3d61803
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/dictionary/chunk.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2024-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+// NOTE(review): the two include targets below are empty in this copy of the patch.
+#include 
+
+#include 
+
+#include "paimon/memory/bytes.h"
+#include "paimon/predicate/literal.h"
+#include "paimon/result.h"
+
+namespace paimon {
+
+/// One chunk of a chunked dictionary.
+///
+/// A chunk is identified by a representative key (Key()) whose code is Code(). The
+/// remaining keys are reachable through GetKey(i) for i in [0, Size()); as both Find()
+/// overloads below show, the key at index i carries the code Code() + 1 + i.
+class Chunk {
+ public:
+    virtual ~Chunk() = default;
+
+    /// Write path: tries to append `key` to this chunk. The boolean in the Result tells
+    /// the caller whether the key was accepted.
+    virtual Result TryAdd(const Literal& key) = 0;
+
+    /// Looks up the code of `key` inside this chunk.
+    ///
+    /// Returns Code() when `key` equals the representative key, otherwise binary-searches
+    /// the keys exposed by GetKey(). When the key is absent the result is negative:
+    /// -(c + 1), where c is the code the key would occupy at its insertion point.
+    /// Comparison or key-loading errors are propagated.
+    virtual Result Find(const Literal& key) {
+        PAIMON_ASSIGN_OR_RAISE(const auto cmp_with_key, Key().CompareTo(key));
+        if (cmp_with_key == 0) {
+            return Code();
+        }
+        int32_t low = 0;
+        int32_t high = Size() - 1;
+        // Code of the key stored at index 0; the representative key owns Code() itself.
+        const int32_t base = Code() + 1;
+        while (low <= high) {
+            // Overflow-safe midpoint.
+            const int32_t mid = low + (high - low) / 2;
+            PAIMON_ASSIGN_OR_RAISE(auto key_at_mid, GetKey(mid));
+            PAIMON_ASSIGN_OR_RAISE(const auto cmp, key_at_mid.CompareTo(key));
+            if (cmp < 0) {
+                low = mid + 1;
+            } else if (cmp > 0) {
+                high = mid - 1;
+            } else {
+                return base + mid;
+            }
+        }
+        // Not found: encode the insertion point as a negative number.
+        return -(base + low + 1);
+    }
+
+    /// Reverse lookup: returns the key that owns `code`.
+    ///
+    /// Code() maps to the representative key; Code() + 1 + i maps to GetKey(i). Any other
+    /// code yields Status::Invalid.
+    virtual Result Find(const int32_t code) {
+        const auto current = Code();
+        if (current == code) {
+            return Key();
+        }
+        const auto index = code - current - 1;
+        if (index < 0 || index >= Size()) {
+            return Status::Invalid(fmt::format("Invalid Code {}", code));
+        }
+        return GetKey(index);
+    }
+
+    /// Representative key of this chunk.
+    virtual const Literal& Key() const = 0;
+
+    /// Code of the representative key.
+    virtual int32_t Code() const = 0;
+
+    /// Offset value previously stored with SetOffset().
+    virtual int32_t Offset() const = 0;
+
+    virtual void SetOffset(int32_t offset) = 0;
+
+    /// Number of keys reachable through GetKey(); the representative key is not included
+    /// (see how Find() indexes them).
+    virtual int32_t Size() const = 0;
+
+    /// Serialized chunk metadata.
+    virtual Result> SerializeChunk() const = 0;
+
+    /// Serialized key payload of this chunk.
+    virtual Result> SerializeKeys() const = 0;
+
+ protected:
+    /// Key at position `index`, 0 <= index < Size().
+    virtual Result GetKey(int32_t index) = 0;
+};
+
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.cpp b/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.cpp
new file mode 100644
index 00000000..c881218b
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.cpp
@@ -0,0 +1,239 @@
+/*
+ * Copyright 2024-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h" + +#include + +#include "fmt/format.h" +#include "paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h" +#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h" +#include "paimon/common/memory/memory_segment_utils.h" +#include "paimon/fs/file_system.h" +#include "paimon/io/byte_array_input_stream.h" +#include "paimon/io/data_input_stream.h" +#include "paimon/memory/bytes.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon { + +/// Firstly do a binary search on chunk representative key +/// If found, return, otherwise, do a binary search inside the chunk +Result ChunkedDictionary::Find(const Literal& key) { + int32_t low = 0; + int32_t high = size_ - 1; + while (low <= high) { + const int32_t mid = low + (high - low) / 2; + PAIMON_ASSIGN_OR_RAISE(const auto chunk, GetChunk(mid)); + PAIMON_ASSIGN_OR_RAISE(const int32_t result, chunk->Key().CompareTo(key)); + if (result > 0) { + high = mid - 1; + } else if (result < 0) { + low = mid + 1; + } else { + return chunk->Code(); + } + } + if (low == 0) { + return -(low + 1); + } + PAIMON_ASSIGN_OR_RAISE(const auto prev_chunk, GetChunk(low - 1)); + return prev_chunk->Find(key); +} + +Result ChunkedDictionary::Find(const int32_t code) { + if (code < 0) { + return Status::Invalid("Invalid code: " + std::to_string(code)); + } + int32_t low = 0; + int32_t high = size_ - 1; + + while (low <= high) { + const int32_t mid = low + (high - low) / 2; + PAIMON_ASSIGN_OR_RAISE(const auto chunk, GetChunk(mid)); + + auto const chunk_code = chunk->Code(); + if (chunk_code > code) { + high = mid - 1; + } else if (chunk_code < code) { + low = mid + 1; + } else { + return {chunk->Key()}; + } + } + PAIMON_ASSIGN_OR_RAISE(const auto prev_chunk, GetChunk(low - 1)); + return prev_chunk->Find(code); +} + +/// MMap a chunk and store it into cache, keys in the chunk will be lazy loaded later +Result> 
ChunkedDictionary::GetChunk(int32_t index) { + if (index < 0 || index >= size_) { + return Status::Invalid(fmt::format("Invalid chunk index: {}", index)); + } + if (offsets_bytes_ == nullptr || chunks_bytes_ == nullptr) { + PAIMON_RETURN_NOT_OK(input_stream_->Seek(body_offset_, FS_SEEK_SET)); + auto offsets = Bytes::AllocateBytes(offsets_length_, pool_.get()); + PAIMON_RETURN_NOT_OK(input_stream_->Read(offsets->data(), offsets_length_)); + offsets_bytes_ = std::move(offsets); + auto chunks = Bytes::AllocateBytes(chunks_length_, pool_.get()); + PAIMON_RETURN_NOT_OK(input_stream_->Read(chunks->data(), chunks_length_)); + chunks_bytes_ = std::move(chunks); + } + if (chunks_cache_[index]) { + return chunks_cache_[index]; + } + const auto data_in = std::make_unique( + std::make_shared(offsets_bytes_->data(), offsets_length_)); + PAIMON_RETURN_NOT_OK(data_in->Seek(sizeof(int32_t) * index)); + PAIMON_ASSIGN_OR_RAISE(const auto chunk_offset, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE( + auto chunk, + factory_->MmapChunk(pool_, input_stream_, body_offset_ + offsets_length_ + chunk_offset, + body_offset_ + chunks_length_ + offsets_length_)); + chunks_cache_[index] = std::move(chunk); + return chunks_cache_[index]; +} + +ChunkedDictionary::Appender::Appender(const std::shared_ptr& pool, + const std::shared_ptr& key_factory, + const int32_t chunk_size_bytes) + : pool_(pool), + key_factory_(key_factory), + chunk_size_bytes_(chunk_size_bytes), + chunk_(nullptr), + size_(0), + key_offset_(0), + chunks_offset_(0) { + chunks_output_ = std::make_unique( + MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); + keys_output_ = std::make_unique( + MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); + offsets_output_ = std::make_unique( + MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); +} + +Status ChunkedDictionary::Appender::AppendSorted(const Literal& key, int32_t code) { + if (key.IsNull()) { + return Status::Invalid("key should not be null"); + } + if 
(last_key_.has_value()) { + PAIMON_ASSIGN_OR_RAISE(const auto compare_result, last_key_->CompareTo(key)); + if (compare_result >= 0) { + return Status::Invalid("key must be in sorted order"); + } + } + if (last_code_.has_value() && code <= last_code_) { + return Status::Invalid("code must be in sorted order"); + } + last_key_ = key; + last_code_ = code; + if (chunk_ == nullptr) { + PAIMON_ASSIGN_OR_RAISE(chunk_, + key_factory_->CreateChunk(pool_, key, code, chunk_size_bytes_)); + } else { + PAIMON_ASSIGN_OR_RAISE(const auto success, chunk_->TryAdd(key)); + if (success) return Status::OK(); + PAIMON_RETURN_NOT_OK(Flush()); + PAIMON_ASSIGN_OR_RAISE(chunk_, + key_factory_->CreateChunk(pool_, key, code, chunk_size_bytes_)); + } + return Status::OK(); +} + +Result> ChunkedDictionary::Appender::Serialize() { + if (chunk_ != nullptr) { + PAIMON_RETURN_NOT_OK(Flush()); + } + int32_t header_size = 0; + header_size += sizeof(int8_t); // version + header_size += sizeof(int32_t); // size + header_size += sizeof(int32_t); // offsets length + header_size += sizeof(int32_t); // chunks length + const auto data_out = std::make_unique( + MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); + data_out->WriteValue(header_size); + data_out->WriteValue(CURRENT_VERSION); + data_out->WriteValue(size_); + data_out->WriteValue(static_cast(offsets_output_->CurrentSize())); + data_out->WriteValue(static_cast(chunks_output_->CurrentSize())); + PAIMON_RETURN_NOT_OK(MemorySegmentUtils::CopyToStream( + offsets_output_->Segments(), 0, static_cast(offsets_output_->CurrentSize()), + data_out.get())); + PAIMON_RETURN_NOT_OK(MemorySegmentUtils::CopyToStream( + chunks_output_->Segments(), 0, static_cast(chunks_output_->CurrentSize()), + data_out.get())); + PAIMON_RETURN_NOT_OK(MemorySegmentUtils::CopyToStream( + keys_output_->Segments(), 0, static_cast(keys_output_->CurrentSize()), + data_out.get())); + return MemorySegmentUtils::CopyToBytes( + data_out->Segments(), 0, 
static_cast(data_out->CurrentSize()), pool_.get()); +} + +Status ChunkedDictionary::Appender::Flush() { + chunk_->SetOffset(key_offset_); + PAIMON_ASSIGN_OR_RAISE(const auto chunks_bytes, chunk_->SerializeChunk()); + PAIMON_ASSIGN_OR_RAISE(const auto keys_bytes, chunk_->SerializeKeys()); + offsets_output_->WriteValue(chunks_offset_); + chunks_offset_ += static_cast(chunks_bytes->size()); + key_offset_ += static_cast(keys_bytes->size()); + chunks_output_->Write(chunks_bytes->data(), chunks_bytes->size()); + keys_output_->Write(keys_bytes->data(), keys_bytes->size()); + size_ += 1; + chunk_ = nullptr; + return Status::OK(); +} + +Result> ChunkedDictionary::Create( + const std::shared_ptr& pool, const FieldType field_type, + const std::shared_ptr& input_stream, const int64_t offset) { + const auto data_in = std::make_unique(input_stream); + PAIMON_RETURN_NOT_OK(data_in->Seek(offset)); + PAIMON_ASSIGN_OR_RAISE(const auto header_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(const auto version, data_in->ReadValue()); + if (version != CURRENT_VERSION) { + return Status::Invalid("Unknown version of ChunkedDictionary"); + } + PAIMON_ASSIGN_OR_RAISE(const auto size, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(const auto offsets_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(const auto chunks_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(auto factory, KeyFactory::Create(field_type)); + const auto factory_shared = std::shared_ptr{std::move(factory)}; + auto result = std::make_unique( + pool, input_stream, offset, field_type, factory_shared, size, offsets_length, chunks_length, + offset + header_length + sizeof(int32_t)); + return result; +} + +ChunkedDictionary::ChunkedDictionary(const std::shared_ptr& pool, + const std::shared_ptr& input_stream, + const int64_t start_of_dictionary, const FieldType field_type, + const std::shared_ptr& factory, const int32_t size, + const int32_t offsets_length, const int32_t chunks_length, + const int64_t 
body_offset) + : pool_(pool), + field_type_(field_type), + factory_(factory), + input_stream_(input_stream), + start_of_dictionary_(start_of_dictionary), + size_(size), + offsets_length_(offsets_length), + chunks_length_(chunks_length), + body_offset_(body_offset), + offsets_bytes_(nullptr), + chunks_bytes_(nullptr), + chunks_cache_(std::vector>(size)) {} +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h b/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h new file mode 100644 index 00000000..bfdc0435 --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h @@ -0,0 +1,96 @@ +/* + * Copyright 2024-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+// NOTE(review): the three include targets below are empty in this copy of the patch.
+#include 
+#include 
+#include 
+
+#include "paimon/common/file_index/rangebitmap/dictionary/chunk.h"
+#include "paimon/common/file_index/rangebitmap/dictionary/dictionary.h"
+#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h"
+#include "paimon/fs/file_system.h"
+#include "paimon/result.h"
+
+namespace paimon {
+
+class InputStream;
+class MemoryPool;
+
+/// Dictionary whose keys are grouped into chunks; chunks are materialized on demand by
+/// GetChunk() and kept in `chunks_cache_`.
+class ChunkedDictionary final : public Dictionary {
+ public:
+    /// Version byte written by Appender::Serialize() and checked by Create().
+    static constexpr int8_t CURRENT_VERSION = 1;
+
+    /// Code of `key`; negative when the key is not present.
+    Result Find(const Literal& key) override;
+
+    /// Key that owns `code`; an error Result when no key has this code.
+    Result Find(int32_t code) override;
+
+    /// Chunk at position `index`, 0 <= index < number of chunks. Loaded lazily, then cached.
+    Result> GetChunk(int32_t index);
+
+    /// Write-side builder: accepts sorted (key, code) pairs and serializes them as
+    /// header + offsets + chunks + keys.
+    class Appender final : public Dictionary::Appender {
+     public:
+        /// `chunk_size_bytes` is passed to KeyFactory::CreateChunk() for every new chunk.
+        Appender(const std::shared_ptr& pool,
+                 const std::shared_ptr& key_factory, int32_t chunk_size_bytes);
+        /// Keys and codes must be strictly increasing; null keys are rejected.
+        Status AppendSorted(const Literal& key, int32_t code) override;
+        /// Flushes the pending chunk and returns the serialized dictionary.
+        Result> Serialize() override;
+
+     private:
+        /// Writes the current chunk to the output streams and resets `chunk_`.
+        Status Flush();
+
+        std::shared_ptr pool_;
+        std::shared_ptr key_factory_;
+        int32_t chunk_size_bytes_;
+        std::optional last_key_;   // last appended key, used for the ordering check
+        std::optional last_code_;  // last appended code, used for the ordering check
+        std::unique_ptr chunk_;    // chunk currently being filled; null when none is open
+        int32_t size_;             // number of chunks flushed so far
+        int32_t key_offset_;       // bytes written to keys_output_ so far
+        int32_t chunks_offset_;    // bytes written to chunks_output_ so far
+        std::unique_ptr chunks_output_;
+        std::unique_ptr keys_output_;
+        std::unique_ptr offsets_output_;
+    };
+
+    /// Read-path factory: parses the header found at `offset` of `input_stream`.
+    static Result> Create(
+        const std::shared_ptr& pool, FieldType field_type,
+        const std::shared_ptr& input_stream, int64_t offset);
+
+    explicit ChunkedDictionary(const std::shared_ptr& pool,
+                               const std::shared_ptr& input_stream,
+                               int64_t start_of_dictionary, FieldType field_type,
+                               const std::shared_ptr& factory, int32_t size,
+                               int32_t offsets_length, int32_t chunks_length, int64_t body_offset);
+    // NOTE(review): everything below is still in the public section although the
+    // trailing-underscore names suggest private state -- confirm whether that is intended.
+    std::shared_ptr pool_;
+    FieldType field_type_;
+    std::shared_ptr factory_;
+
+    std::shared_ptr input_stream_;
+    int64_t start_of_dictionary_;
+    int32_t size_;            // number of chunks
+    int32_t offsets_length_;  // byte length of the offsets table
+    int32_t chunks_length_;   // byte length of the chunks region
+    int64_t body_offset_;     // where offsets start
+
+    // for lazy loading
+    PAIMON_UNIQUE_PTR offsets_bytes_;
+    PAIMON_UNIQUE_PTR chunks_bytes_;
+
+    // cache of already materialized chunks, indexed by chunk position
+    std::vector> chunks_cache_;
+};
+
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/dictionary.h b/src/paimon/common/file_index/rangebitmap/dictionary/dictionary.h
new file mode 100644
index 00000000..fb47a0f6
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/dictionary/dictionary.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2024-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include 
+
+#include "paimon/memory/bytes.h"
+#include "paimon/predicate/literal.h"
+#include "paimon/result.h"
+
+namespace paimon {
+
+/// Two-way mapping between keys (Literal) and int32 codes.
+class Dictionary {
+ public:
+    virtual ~Dictionary() = default;
+
+    /// Key -> code lookup.
+    virtual Result Find(const Literal& key) = 0;
+
+    /// Code -> key lookup.
+    virtual Result Find(int32_t code) = 0;
+
+    /// Builder counterpart of a Dictionary.
+    class Appender {
+     public:
+        virtual ~Appender() = default;
+
+        /// Adds one (key, code) pair; as the name says, callers append in sorted order.
+        virtual Status AppendSorted(const Literal& key, int32_t code) = 0;
+
+        /// Returns the serialized dictionary.
+        virtual Result> Serialize() = 0;
+    };
+};
+
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.cpp b/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.cpp
new file mode 100644
index 00000000..570b4c58
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2024-present Alibaba Inc.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h" + +#include +#include + +#include "fmt/format.h" +#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h" +#include "paimon/common/io/memory_segment_output_stream.h" +#include "paimon/common/memory/memory_segment_utils.h" +#include "paimon/common/utils/field_type_utils.h" +#include "paimon/io/byte_array_input_stream.h" +#include "paimon/memory/bytes.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon { + +Result FixedLengthChunk::TryAdd(const Literal& key) { + if (keys_stream_out_ == nullptr) { + keys_stream_out_ = std::make_shared( + MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); + PAIMON_ASSIGN_OR_RAISE(serializer_, factory_->CreateSerializer()); + } + if (fixed_length_ > remaining_keys_size_) { + return false; + } + PAIMON_RETURN_NOT_OK((*serializer_)(keys_stream_out_, key)); + remaining_keys_size_ -= fixed_length_; + size_ += 1; + return true; +} + +Result FixedLengthChunk::GetKey(const int32_t index) { + if (index < 0 || index >= size_) { + return Status::Invalid("Index out of bounds"); + } + if (keys_ == nullptr) { + PAIMON_RETURN_NOT_OK(input_stream_->Seek(keys_base_offset_ + offset_, FS_SEEK_SET)); + keys_ = Bytes::AllocateBytes(keys_length_, pool_.get()); + PAIMON_RETURN_NOT_OK(input_stream_->Read(keys_->data(), keys_length_)); + 
PAIMON_ASSIGN_OR_RAISE(deserializer_, factory_->CreateDeserializer()); + keys_stream_in_ = std::make_shared( + std::make_shared(keys_->data(), keys_length_)); + } + PAIMON_RETURN_NOT_OK(keys_stream_in_->Seek(index * fixed_length_)); + return (*deserializer_)(keys_stream_in_, pool_.get()); +} + +Result> FixedLengthChunk::SerializeChunk() const { + const auto data_out = std::make_shared( + MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); + data_out->WriteValue(CURRENT_VERSION); + PAIMON_RETURN_NOT_OK((*serializer_)(data_out, key_)); + data_out->WriteValue(code_); + data_out->WriteValue(offset_); + data_out->WriteValue(size_); + data_out->WriteValue(static_cast(keys_stream_out_->CurrentSize())); + data_out->WriteValue(fixed_length_); + return MemorySegmentUtils::CopyToBytes( + data_out->Segments(), 0, static_cast(data_out->CurrentSize()), pool_.get()); +} + +Result> FixedLengthChunk::SerializeKeys() const { + return MemorySegmentUtils::CopyToBytes(keys_stream_out_->Segments(), 0, + static_cast(keys_stream_out_->CurrentSize()), + pool_.get()); +} + +// Read path +FixedLengthChunk::FixedLengthChunk(const std::shared_ptr& pool, Literal key, + const int32_t code, const int32_t offset, const int32_t size, + const std::shared_ptr& factory, + const std::shared_ptr& input_stream, + const int32_t keys_base_offset, const int32_t keys_length, + const int32_t fixed_length) + : pool_(pool), + key_(std::move(key)), + code_(code), + offset_(offset), + size_(size), + factory_(factory), + input_stream_(input_stream), + keys_base_offset_(keys_base_offset), + keys_length_(keys_length), + fixed_length_(fixed_length), + deserializer_({std::nullopt}), + keys_stream_in_(nullptr), + keys_(nullptr), + remaining_keys_size_(0) {} + +// Write path +FixedLengthChunk::FixedLengthChunk(const std::shared_ptr& pool, Literal key, + const int32_t code, const int32_t keys_length_limit, + const std::shared_ptr& factory, + const int32_t fixed_length) + : pool_(pool), + key_(std::move(key)), + 
code_(code), + offset_(0), + size_(0), + factory_(factory), + input_stream_(nullptr), + keys_base_offset_(0), + keys_length_(0), + fixed_length_(fixed_length), + deserializer_({std::nullopt}), + keys_stream_in_(nullptr), + keys_(nullptr), + remaining_keys_size_(keys_length_limit) {} + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h b/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h new file mode 100644 index 00000000..d1f75c72 --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h @@ -0,0 +1,92 @@ +/* + * Copyright 2024-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+// NOTE(review): the three include targets below are empty in this copy of the patch.
+#include 
+#include 
+#include 
+
+#include "paimon/common/file_index/rangebitmap/dictionary/chunk.h"
+#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h"
+#include "paimon/fs/file_system.h"
+#include "paimon/predicate/literal.h"
+#include "paimon/result.h"
+#include "paimon/status.h"
+
+namespace paimon {
+
+class InputStream;
+class MemoryPool;
+
+/// Chunk implementation for keys that all serialize to `fixed_length_` bytes.
+///
+/// The same class serves both directions: the write-path constructor starts an empty
+/// chunk that is filled through TryAdd(), the read-path constructor wraps an already
+/// serialized chunk whose keys are fetched from the input stream on demand.
+class FixedLengthChunk final : public Chunk {
+ public:
+    /// Version byte written at the start of SerializeChunk().
+    static constexpr int8_t CURRENT_VERSION = 1;
+
+    /// Write path: appends `key`; the Result holds false when the chunk has no room left.
+    Result TryAdd(const Literal& key) override;
+    /// Read path: key at position `index`. Declared public here although the base class
+    /// declares it protected.
+    Result GetKey(int32_t index) override;
+    const Literal& Key() const override {
+        return key_;
+    }
+    int32_t Code() const override {
+        return code_;
+    }
+    int32_t Offset() const override {
+        return offset_;
+    }
+
+    void SetOffset(const int32_t offset) override {
+        offset_ = offset;
+    }
+
+    int32_t Size() const override {
+        return size_;
+    }
+    /// Chunk header (version, representative key, code, offset, size, keys length,
+    /// fixed length).
+    Result> SerializeChunk() const override;
+    /// Key payload written through TryAdd().
+    Result> SerializeKeys() const override;
+    // For Read Path
+    FixedLengthChunk(const std::shared_ptr& pool, Literal key, int32_t code,
+                     int32_t offset, int32_t size, const std::shared_ptr& factory,
+                     const std::shared_ptr& input_stream, int32_t keys_base_offset,
+                     int32_t keys_length, int32_t fixed_length);
+    // For Write Path
+    FixedLengthChunk(const std::shared_ptr& pool, Literal key, int32_t code,
+                     int32_t keys_length_limit, const std::shared_ptr& factory,
+                     int32_t fixed_length);
+
+    // NOTE(review): the data members below sit in the public section -- confirm whether
+    // that is intended.
+    std::shared_ptr pool_;
+    Literal key_;     // representative key for binary search
+    int32_t code_;    // code of the representative key (first code in this chunk)
+    int32_t offset_;  // offset of this chunk's keys, relative to keys_base_offset_
+    int32_t size_;    // number of keys added via TryAdd(); excludes the representative key
+    std::shared_ptr factory_;  // factory for serialization/deserialization
+
+    // For read path lazy keys loading
+    std::shared_ptr input_stream_;
+    int32_t keys_base_offset_;  // stream position the chunk's offset_ is added to
+    int32_t keys_length_;       // byte length of this chunk's key payload
+    int32_t fixed_length_;      // serialized size of one key
+    std::optional deserializer_;
+    std::shared_ptr keys_stream_in_;
+    PAIMON_UNIQUE_PTR keys_;
+
+    // For write path
+    std::optional serializer_;
+    std::shared_ptr keys_stream_out_;
+    int64_t remaining_keys_size_;  // bytes still available for keys in this chunk
+};
+
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.cpp b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.cpp
new file mode 100644
index 00000000..d79b1b2b
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2024-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h" + +#include "fmt/format.h" +#include "paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h" +#include "paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h" +#include "paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h" +#include "paimon/common/utils/field_type_utils.h" + +namespace paimon { + +Result> KeyFactory::Create(const FieldType field_type) { + // todo: support timestamp + switch (field_type) { + case FieldType::BOOLEAN: + return std::make_unique(); + case FieldType::TINYINT: + return std::make_unique(); + case FieldType::SMALLINT: + return std::make_unique(); + case FieldType::DATE: + return std::make_unique(); + case FieldType::INT: + return std::make_unique(); + case FieldType::BIGINT: + return std::make_unique(); + case FieldType::FLOAT: + return std::make_unique(); + case FieldType::DOUBLE: + return std::make_unique(); + default: + return Status::Invalid(fmt::format("Unsupported field type for KeyFactory: {}", + FieldTypeUtils::FieldTypeToString(field_type))); + } +} + +const std::string& KeyFactory::GetDefaultChunkSize() { + static const std::string kDefaultChunkSize = "16kb"; + return kDefaultChunkSize; +} + +Result> FixedLengthKeyFactory::CreateChunk( + const std::shared_ptr& pool, const Literal& key, const int32_t code, + const int32_t keys_length_limit) { + return std::make_unique(pool, key, code, keys_length_limit, + this->shared_from_this(), this->GetFieldSize()); +} + +Result> FixedLengthKeyFactory::MmapChunk( + const std::shared_ptr& pool, const std::shared_ptr& input_stream, + const int32_t chunk_offest, const int32_t keys_base_offset) { + PAIMON_RETURN_NOT_OK(input_stream->Seek(chunk_offest, FS_SEEK_SET)); + PAIMON_ASSIGN_OR_RAISE(const auto deserializer, this->CreateDeserializer()); + const auto data_in = std::make_shared(input_stream); + PAIMON_ASSIGN_OR_RAISE(const auto version, data_in->ReadValue()); + if 
(version != ChunkedDictionary::CURRENT_VERSION) { + return Status::Invalid(fmt::format("Unsupported version for KeyFactory: {}", version)); + } + PAIMON_ASSIGN_OR_RAISE(const auto key_literal, deserializer(data_in, pool.get())); + PAIMON_ASSIGN_OR_RAISE(const auto code, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(const auto offset, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(const auto size, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(const auto keys_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(const auto fixed_length, data_in->ReadValue()); + return std::make_unique(pool, key_literal, code, offset, size, + this->shared_from_this(), input_stream, + keys_base_offset, keys_length, fixed_length); +} + +Result FixedLengthKeyFactory::CreateSerializer() { + return KeySerializer([this](const std::shared_ptr& out, + const Literal& literal) -> Status { + PAIMON_ASSIGN_OR_RAISE(const auto writer, + LiteralSerializationUtils::CreateValueWriter(GetFieldType(), out)); + return writer(literal); + }); +} + +Result FixedLengthKeyFactory::CreateDeserializer() { + return KeyDeserializer([this](const std::shared_ptr& in, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE( + auto reader, LiteralSerializationUtils::CreateValueReader(GetFieldType(), in, pool)); + return reader(); + }); +} + +Result> VariableLengthKeyFactory::CreateChunk( + const std::shared_ptr& pool, const Literal& key, int32_t code, + int32_t keys_length_limit) { + return Status::NotImplemented("VariableLengthKeyFactory::CreateChunk not implemented"); +} +Result> VariableLengthKeyFactory::MmapChunk( + const std::shared_ptr& pool, const std::shared_ptr& input_stream, + int32_t chunk_offest, int32_t keys_base_offset) { + return Status::NotImplemented("VariableLengthKeyFactory::MmapChunk not implemented"); +} + +Result VariableLengthKeyFactory::CreateSerializer() { + return Status::NotImplemented("VariableLengthKeyFactory::CreateSerializer not implemented"); +} + +Result 
VariableLengthKeyFactory::CreateDeserializer() { + return Status::NotImplemented("VariableLengthKeyFactory::CreateDeserializer not implemented"); +} + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h new file mode 100644 index 00000000..b7df1bed --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h @@ -0,0 +1,179 @@ +/* + * Copyright 2024-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+// NOTE(review): the two include targets below are empty in this copy of the patch.
+#include 
+#include 
+
+#include "paimon/common/file_index/rangebitmap/dictionary/chunk.h"
+#include "paimon/common/io/memory_segment_output_stream.h"
+#include "paimon/defs.h"
+#include "paimon/io/data_input_stream.h"
+#include "paimon/predicate/literal.h"
+#include "paimon/result.h"
+#include "paimon/status.h"
+
+namespace paimon {
+
+class InputStream;
+class MemoryPool;
+
+/// Per-field-type factory for dictionary chunks and for the functors that write/read a
+/// single key.
+///
+/// Derives from std::enable_shared_from_this: the fixed-length implementation hands
+/// shared_from_this() to the chunks it creates, so instances have to be owned by a
+/// std::shared_ptr before CreateChunk()/MmapChunk() are called.
+class KeyFactory : public std::enable_shared_from_this {
+ public:
+    virtual ~KeyFactory() = default;
+
+    /// Field type of the keys handled by this factory.
+    virtual FieldType GetFieldType() const = 0;
+
+    /// Writes one key to an output stream.
+    using KeySerializer =
+        std::function&, const Literal&)>;
+    /// Reads one key from an input stream, allocating from the given pool if needed.
+    using KeyDeserializer =
+        std::function(const std::shared_ptr&, MemoryPool*)>;
+
+    /// For writing new chunk
+    virtual Result> CreateChunk(const std::shared_ptr& pool,
+                                                       const Literal& key, int32_t code,
+                                                       int32_t keys_length_limit) = 0;
+
+    /// For reading existing chunk, lazy loading keys in the dictionary
+    virtual Result> MmapChunk(
+        const std::shared_ptr& pool, const std::shared_ptr& input_stream,
+        int32_t chunk_offest, int32_t keys_base_offset) = 0;
+
+    virtual Result CreateSerializer() = 0;
+
+    virtual Result CreateDeserializer() = 0;
+
+    /// Returns the factory for `field_type`, or an error for unsupported types.
+    static Result> Create(FieldType field_type);
+
+    /// Default chunk size as a size string ("16kb" in the implementation file).
+    static const std::string& GetDefaultChunkSize();
+};
+
+/// Base for key types with a constant serialized size; subclasses only supply the field
+/// type and GetFieldSize().
+class FixedLengthKeyFactory : public KeyFactory {
+ public:
+    Result> CreateChunk(const std::shared_ptr& pool,
+                                               const Literal& key, int32_t code,
+                                               int32_t keys_length_limit) override;
+    Result> MmapChunk(const std::shared_ptr& pool,
+                                             const std::shared_ptr& input_stream,
+                                             int32_t chunk_offest,
+                                             int32_t keys_base_offset) override;
+    Result CreateSerializer() override;
+    Result CreateDeserializer() override;
+    /// Serialized size of one key in bytes; passed to the chunk as its fixed length.
+    virtual size_t GetFieldSize() const = 0;
+};
+
+/// Base for variable-length key types. All four operations currently return
+/// Status::NotImplemented in the implementation file.
+class VariableLengthKeyFactory : public KeyFactory {
+ public:
+    Result> CreateChunk(const std::shared_ptr& pool,
+                                               const Literal& key, int32_t code,
+                                               int32_t keys_length_limit) override;
+    Result> MmapChunk(const std::shared_ptr& pool,
+                                             const std::shared_ptr& input_stream,
+                                             int32_t chunk_offest,
+                                             int32_t keys_base_offset) override;
+    Result CreateSerializer() override;
+    Result CreateDeserializer() override;
+};
+
+/// DATE keys, 4 bytes each.
+class DateKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::DATE;
+    }
+    size_t GetFieldSize() const override {
+        return sizeof(int32_t);
+    }
+};
+
+/// INT keys, 4 bytes each.
+class IntKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::INT;
+    }
+    size_t GetFieldSize() const override {
+        return sizeof(int32_t);
+    }
+};
+
+/// BIGINT keys, 8 bytes each.
+class BigIntKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::BIGINT;
+    }
+    size_t GetFieldSize() const override {
+        return sizeof(int64_t);
+    }
+};
+
+/// BOOLEAN keys.
+/// NOTE(review): sizeof(bool) is implementation-defined; presumably the on-disk format
+/// expects 1 byte -- confirm.
+class BooleanKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::BOOLEAN;
+    }
+    size_t GetFieldSize() const override {
+        return sizeof(bool);
+    }
+};
+
+/// TINYINT keys, 1 byte each.
+class TinyIntKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::TINYINT;
+    }
+    size_t GetFieldSize() const override {
+        return sizeof(int8_t);
+    }
+};
+
+/// SMALLINT keys, 2 bytes each.
+class SmallIntKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::SMALLINT;
+    }
+    size_t GetFieldSize() const override {
+        return sizeof(int16_t);
+    }
+};
+
+/// FLOAT keys, sizeof(float) bytes each.
+class FloatKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::FLOAT;
+    }
+    size_t GetFieldSize() const override {
+        return sizeof(float);
+    }
+};
+
+/// DOUBLE keys, sizeof(double) bytes each.
+class DoubleKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::DOUBLE;
+    }
+    size_t GetFieldSize() const override {
+        return sizeof(double);
+    }
+};
+
+/// STRING keys. Declared for completeness: the variable-length base is not implemented
+/// yet, and KeyFactory::Create() in the implementation file has no STRING case.
+class StringKeyFactory final : public VariableLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::STRING;
+    }
+};
+
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp
new file mode 100644
index 00000000..423c85e0
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp
@@ -0,0 +1,302 @@
+/*
+ * Copyright 2024-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "paimon/common/file_index/rangebitmap/range_bitmap.h" + +#include + +#include "fmt/format.h" +#include "paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h" +#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h" +#include "paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h" +#include "paimon/common/io/data_output_stream.h" +#include "paimon/common/io/memory_segment_output_stream.h" +#include "paimon/common/memory/memory_segment_utils.h" +#include "paimon/common/utils/field_type_utils.h" +#include "paimon/io/data_input_stream.h" +#include "paimon/memory/bytes.h" + +namespace paimon { + +Result> RangeBitmap::Create( + const std::shared_ptr& input_stream, const int64_t offset, + const FieldType field_type, const std::shared_ptr& pool) { + PAIMON_RETURN_NOT_OK(input_stream->Seek(offset, SeekOrigin::FS_SEEK_SET)); + const auto data_in = std::make_shared(input_stream); + PAIMON_ASSIGN_OR_RAISE(const auto header_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int8_t version, data_in->ReadValue()); + if (version != VERSION) { + return Status::Invalid( + fmt::format("RangeBitmap unsupported version {} (expected {})", version, VERSION)); + } + PAIMON_ASSIGN_OR_RAISE(const auto rid, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(const auto cardinality, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(auto key_factory, KeyFactory::Create(field_type)); + const auto shared_key_factory = std::shared_ptr{std::move(key_factory)}; + PAIMON_ASSIGN_OR_RAISE(const auto key_deserializer, shared_key_factory->CreateDeserializer()); + PAIMON_ASSIGN_OR_RAISE(auto min, key_deserializer(data_in, pool.get())); + PAIMON_ASSIGN_OR_RAISE(auto max, key_deserializer(data_in, pool.get())); + PAIMON_ASSIGN_OR_RAISE(const auto dictionary_length, data_in->ReadValue()); + const auto dictionary_offset = static_cast(offset + sizeof(int32_t) + header_length); + const auto bsi_offset = dictionary_offset + dictionary_length; + 
return std::unique_ptr(new RangeBitmap(pool, rid, cardinality, dictionary_offset, + bsi_offset, std::move(min), std::move(max), + shared_key_factory, input_stream)); +} + +Result RangeBitmap::Not(RoaringBitmap32& bitmap) { + bitmap.Flip(0, rid_); + PAIMON_ASSIGN_OR_RAISE(const auto is_not_null, this->IsNotNull()); + return bitmap &= is_not_null; +} + +Result RangeBitmap::Eq(const Literal& key) { + if (cardinality_ <= 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(const auto min_compare, key.CompareTo(min_)); + PAIMON_ASSIGN_OR_RAISE(const auto max_compare, key.CompareTo(max_)); + PAIMON_ASSIGN_OR_RAISE(const auto bit_slice_ptr, this->GetBitSliceIndex()); + if (min_compare == 0 && max_compare == 0) { + return bit_slice_ptr->IsNotNull({}); + } + if (min_compare < 0 || max_compare > 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(const auto dictionary, this->GetDictionary()); + PAIMON_ASSIGN_OR_RAISE(const auto code, dictionary->Find(key)); + if (code < 0) { + return RoaringBitmap32(); + } + return bit_slice_ptr->Eq(code); +} + +Result RangeBitmap::Neq(const Literal& key) { + if (cardinality_ <= 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(auto eq_result, Eq(key)); + return Not(eq_result); +} + +Result RangeBitmap::Lt(const Literal& key) { + if (cardinality_ <= 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(const auto min_compare, key.CompareTo(min_)); + PAIMON_ASSIGN_OR_RAISE(const auto max_compare, key.CompareTo(max_)); + if (max_compare > 0) { + return IsNotNull(); + } + if (min_compare <= 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(auto gte_result, Gte(key)); + return Not(gte_result); +} + +Result RangeBitmap::Lte(const Literal& key) { + if (cardinality_ <= 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(const auto min_compare, key.CompareTo(min_)); + PAIMON_ASSIGN_OR_RAISE(const auto max_compare, key.CompareTo(max_)); + if (max_compare >= 0) { + return IsNotNull(); + } + 
if (min_compare < 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(auto gt_result, Gt(key)); + return Not(gt_result); +} + +Result RangeBitmap::Gt(const Literal& key) { + if (cardinality_ <= 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(const auto max_compare, key.CompareTo(max_)); + if (max_compare >= 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(const auto min_compare, key.CompareTo(min_)); + if (min_compare < 0) { + return IsNotNull(); + } + PAIMON_ASSIGN_OR_RAISE(const auto dictionary, this->GetDictionary()); + PAIMON_ASSIGN_OR_RAISE(const auto code, dictionary->Find(key)); + PAIMON_ASSIGN_OR_RAISE(const auto bit_slice_ptr, this->GetBitSliceIndex()); + if (code >= 0) { + return bit_slice_ptr->Gt(code); + } + return bit_slice_ptr->Gte(-code - 1); +} + +Result RangeBitmap::Gte(const Literal& key) { + PAIMON_ASSIGN_OR_RAISE(auto gt_result, Gt(key)); + PAIMON_ASSIGN_OR_RAISE(const auto eq_result, Eq(key)); + gt_result |= eq_result; + return gt_result; +} + +Result RangeBitmap::In(const std::vector& keys) { + if (cardinality_ <= 0) { + return RoaringBitmap32(); + } + RoaringBitmap32 result{}; + for (const auto& key : keys) { + PAIMON_ASSIGN_OR_RAISE(const auto bitmap, Eq(key)); + result |= bitmap; + } + return result; +} + +Result RangeBitmap::NotIn(const std::vector& keys) { + if (cardinality_ <= 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(auto in_result, In(keys)); + return Not(in_result); +} + +Result RangeBitmap::IsNull() { + if (cardinality_ <= 0) { + if (rid_ > 0) { + RoaringBitmap32 result; + result.AddRange(0, rid_); + return result; + } + return RoaringBitmap32(); + } + + PAIMON_ASSIGN_OR_RAISE(auto non_null_bitmap, IsNotNull()); + non_null_bitmap.Flip(0, rid_); + return non_null_bitmap; +} + +Result RangeBitmap::IsNotNull() { + if (cardinality_ <= 0) { + return RoaringBitmap32(); + } + + PAIMON_ASSIGN_OR_RAISE(const auto bit_slice_ptr, this->GetBitSliceIndex()); + PAIMON_ASSIGN_OR_RAISE(auto 
result, bit_slice_ptr->IsNotNull({})); + return result; +} + +RangeBitmap::RangeBitmap(const std::shared_ptr& pool, const int32_t rid, + const int32_t cardinality, const int32_t dictionary_offset, + const int32_t bsi_offset, Literal&& min, Literal&& max, + const std::shared_ptr& key_factory, + const std::shared_ptr& input_stream) + : pool_(pool), + rid_(rid), + cardinality_(cardinality), + bsi_offset_(bsi_offset), + dictionary_offset_(dictionary_offset), + min_(std::move(min)), + max_(std::move(max)), + key_factory_(key_factory), + input_stream_(input_stream), + bsi_(nullptr), + dictionary_(nullptr) {} + +Result RangeBitmap::GetBitSliceIndex() { + if (bsi_ == nullptr) { + PAIMON_ASSIGN_OR_RAISE(bsi_, + BitSliceIndexBitmap::Create(pool_, input_stream_, bsi_offset_)); + } + return bsi_.get(); +} + +Result RangeBitmap::GetDictionary() { + if (dictionary_ == nullptr) { + PAIMON_ASSIGN_OR_RAISE(dictionary_, + ChunkedDictionary::Create(pool_, key_factory_->GetFieldType(), + input_stream_, dictionary_offset_)); + } + return dictionary_.get(); +} + +RangeBitmap::Appender::Appender(const std::shared_ptr& pool, + const std::shared_ptr& factory, + const int64_t limited_serialized_size_in_bytes) + : pool_(pool), + rid_(0), + factory_(factory), + limited_serialized_size_in_bytes_(limited_serialized_size_in_bytes) {} + +void RangeBitmap::Appender::Append(const Literal& key) { + if (!key.IsNull()) { + bitmaps_[key].Add(rid_); + } + rid_++; +} + +Result> RangeBitmap::Appender::Serialize() const { + int32_t code = 0; + auto bsi = BitSliceIndexBitmap::Appender(pool_, 0, static_cast(bitmaps_.size() - 1)); + auto dictionary = ChunkedDictionary::Appender( + pool_, factory_, static_cast(limited_serialized_size_in_bytes_)); + for (const auto& [key, bitmap] : bitmaps_) { + PAIMON_RETURN_NOT_OK(dictionary.AppendSorted(key, code)); + for (auto it = bitmap.Begin(); it != bitmap.End(); ++it) { + PAIMON_RETURN_NOT_OK(bsi.Append(*it, code)); + } + code++; + } + PAIMON_ASSIGN_OR_RAISE(const auto 
serializer, factory_->CreateSerializer()); + auto min = Literal{factory_->GetFieldType()}; + auto max = Literal{factory_->GetFieldType()}; + if (!bitmaps_.empty()) { + min = bitmaps_.begin()->first; + max = bitmaps_.rbegin()->first; + } + PAIMON_ASSIGN_OR_RAISE(const auto min_size, + LiteralSerializationUtils::GetSerializedSizeInBytes(min)); + PAIMON_ASSIGN_OR_RAISE(const auto max_size, + LiteralSerializationUtils::GetSerializedSizeInBytes(max)); + int32_t header_size = 0; + header_size += sizeof(int8_t); // version + header_size += sizeof(int32_t); // rid + header_size += sizeof(int32_t); // cardinality + header_size += min.IsNull() ? 0 : min_size; // min literal size + header_size += max.IsNull() ? 0 : max_size; // max literal size + header_size += sizeof(int32_t); // dictionary length + PAIMON_ASSIGN_OR_RAISE(const auto dictionary_bytes, dictionary.Serialize()); + const auto dictionary_length = static_cast(dictionary_bytes->size()); + PAIMON_ASSIGN_OR_RAISE(const auto bsi_bytes, bsi.Serialize()); + const auto bsi_length = bsi_bytes->size(); + const auto data_output_stream = std::make_shared( + MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); + data_output_stream->WriteValue(header_size); + data_output_stream->WriteValue(VERSION); + data_output_stream->WriteValue(rid_); + data_output_stream->WriteValue(static_cast(bitmaps_.size())); + if (!min.IsNull()) { + PAIMON_RETURN_NOT_OK(serializer(data_output_stream, min)); + } + if (!max.IsNull()) { + PAIMON_RETURN_NOT_OK(serializer(data_output_stream, max)); + } + data_output_stream->WriteValue(dictionary_length); + data_output_stream->Write(dictionary_bytes->data(), dictionary_length); + data_output_stream->Write(bsi_bytes->data(), bsi_length); + return MemorySegmentUtils::CopyToBytes(data_output_stream->Segments(), 0, + static_cast(data_output_stream->CurrentSize()), + pool_.get()); +} +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap.h 
b/src/paimon/common/file_index/rangebitmap/range_bitmap.h new file mode 100644 index 00000000..6d61c6e8 --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap.h @@ -0,0 +1,102 @@ +/* + * Copyright 2024-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h" +#include "paimon/common/file_index/rangebitmap/dictionary/dictionary.h" +#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h" +#include "paimon/fs/file_system.h" +#include "paimon/predicate/literal.h" +#include "paimon/result.h" +#include "paimon/utils/roaring_bitmap32.h" + +namespace paimon { + +class InputStream; +class MemoryPool; + +class RangeBitmap { + public: + static constexpr int8_t VERSION = 1; + + static Result> Create( + const std::shared_ptr& input_stream, int64_t offset, FieldType field_type, + const std::shared_ptr& pool); + + Result Eq(const Literal& key); + Result Neq(const Literal& key); + Result Lt(const Literal& key); + Result Lte(const Literal& key); + Result Gt(const Literal& key); + Result Gte(const Literal& key); + Result In(const std::vector& keys); + Result NotIn(const std::vector& keys); + Result IsNull(); + Result IsNotNull(); + + private: + Result Not(RoaringBitmap32& bitmap); + + RangeBitmap(const std::shared_ptr& pool, int32_t rid, int32_t cardinality, + int32_t dictionary_offset, int32_t 
bsi_offset, Literal&& min, Literal&& max, + const std::shared_ptr& key_factory, + const std::shared_ptr& input_stream); + Result GetBitSliceIndex(); + Result GetDictionary(); + std::shared_ptr pool_; + int32_t rid_; + int32_t cardinality_; + int32_t bsi_offset_; + int32_t dictionary_offset_; + Literal min_; + Literal max_; + std::shared_ptr key_factory_; + std::shared_ptr input_stream_; + + // For lazy loading + std::unique_ptr bsi_; + std::unique_ptr dictionary_; + + public: + class Appender { + public: + Appender(const std::shared_ptr& pool, + const std::shared_ptr& factory, + int64_t limited_serialized_size_in_bytes); + void Append(const Literal& key); + Result> Serialize() const; + + private: + struct LiteralComparator { + bool operator()(const Literal& lhs, const Literal& rhs) const { + const auto result = lhs.CompareTo(rhs); + return result.ok() && result.value() < 0; + } + }; + std::shared_ptr pool_; + int32_t rid_; + std::map bitmaps_; + std::shared_ptr factory_; + int64_t limited_serialized_size_in_bytes_; + }; +}; + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp new file mode 100644 index 00000000..09cf0f9a --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp @@ -0,0 +1,226 @@ +/* + * Copyright 2024-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "paimon/common/file_index/rangebitmap/range_bitmap_file_index.h" + +#include +#include +#include + +#include + +#include "paimon/common/file_index/rangebitmap/range_bitmap.h" +#include "paimon/common/options/memory_size.h" +#include "paimon/common/predicate/literal_converter.h" +#include "paimon/common/utils/arrow/status_utils.h" +#include "paimon/common/utils/field_type_utils.h" +#include "paimon/file_index/bitmap_index_result.h" +#include "paimon/predicate/literal.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon { + +RangeBitmapFileIndex::RangeBitmapFileIndex(const std::map& options) + : options_(options) {} + +Result> RangeBitmapFileIndex::CreateReader( + ArrowSchema* const arrow_schema, const int32_t start, const int32_t length, + const std::shared_ptr& input_stream, + const std::shared_ptr& pool) const { + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(const auto arrow_schema_ptr, + arrow::ImportSchema(arrow_schema)); + if (arrow_schema_ptr->num_fields() != 1) { + return Status::Invalid( + "invalid schema for RangeBitmapFileIndexReader, supposed to have single field."); + } + const auto arrow_type = arrow_schema_ptr->field(0)->type(); + return RangeBitmapFileIndexReader::Create(arrow_type, start, length, input_stream, pool); +} + +Result> RangeBitmapFileIndex::CreateWriter( + ArrowSchema* arrow_schema, const std::shared_ptr& pool) const { + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(const auto arrow_schema_ptr, + arrow::ImportSchema(arrow_schema)); + if (arrow_schema_ptr->num_fields() != 1) { + return Status::Invalid( + "invalid schema for RangeBitmapFileIndexWriter, supposed to have single field."); + } + const auto arrow_field = arrow_schema_ptr->field(0); + return RangeBitmapFileIndexWriter::Create(arrow_schema_ptr, arrow_field->name(), options_, + pool); +} + +Result> RangeBitmapFileIndexWriter::Create( + const std::shared_ptr& arrow_schema, const std::string& field_name, + const std::map& options, const std::shared_ptr& pool) { + 
const auto field = arrow_schema->GetFieldByName(field_name); + if (!field) { + return Status::Invalid("Field not found in schema: " + field_name); + } + PAIMON_ASSIGN_OR_RAISE(auto field_type, + FieldTypeUtils::ConvertToFieldType(field->type()->id())); + PAIMON_ASSIGN_OR_RAISE(auto key_factory, KeyFactory::Create(field_type)); + const auto shared_key_factory = std::shared_ptr{std::move(key_factory)}; + const auto& chunk_size = KeyFactory::GetDefaultChunkSize(); + PAIMON_ASSIGN_OR_RAISE(auto parsed_chunk_size, MemorySize::ParseBytes(chunk_size)); + if (const auto chunk_size_it = options.find(RangeBitmapFileIndex::CHUNK_SIZE); + chunk_size_it != options.end()) { + PAIMON_ASSIGN_OR_RAISE(parsed_chunk_size, MemorySize::ParseBytes(chunk_size_it->second)); + } + if (parsed_chunk_size > std::numeric_limits::max()) { + return Status::Invalid("Chunk size must be less than 4GB"); + } + auto appender_ptr = + std::make_unique(pool, shared_key_factory, parsed_chunk_size); + return std::make_shared(field->type(), field_type, options, pool, + parsed_chunk_size, shared_key_factory, + std::move(appender_ptr)); +} + +Status RangeBitmapFileIndexWriter::AddBatch(::ArrowArray* batch) { + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(const auto array, arrow::ImportArray(batch, arrow_type_)); + PAIMON_ASSIGN_OR_RAISE(const auto array_values, + LiteralConverter::ConvertLiteralsFromArray(*array, true)); + for (const auto& literal : array_values) { + appender_->Append(literal); + } + return Status::OK(); +} + +Result> RangeBitmapFileIndexWriter::SerializedBytes() const { + return appender_->Serialize(); +} + +RangeBitmapFileIndexWriter::RangeBitmapFileIndexWriter( + const std::shared_ptr& arrow_type, const FieldType field_type, + const std::map& options, const std::shared_ptr& pool, + const int64_t chunk_size, const std::shared_ptr& key_factory, + std::unique_ptr appender) + : arrow_type_(arrow_type), + field_type_(field_type), + options_(options), + pool_(pool), + key_factory_(key_factory), + 
chunk_size_(chunk_size), + appender_(std::move(appender)) {} + +Result> RangeBitmapFileIndexReader::Create( + const std::shared_ptr& arrow_type, const int32_t start, const int32_t length, + const std::shared_ptr& input_stream, const std::shared_ptr& pool) { + if (!arrow_type || !input_stream || !pool) { + return Status::Invalid("RangeBitmapFileIndexReader::Create: null argument"); + } + PAIMON_ASSIGN_OR_RAISE(const FieldType field_type, + FieldTypeUtils::ConvertToFieldType(arrow_type->id())); + PAIMON_ASSIGN_OR_RAISE(auto range_bitmap, + RangeBitmap::Create(input_stream, start, field_type, pool)); + return std::shared_ptr( + new RangeBitmapFileIndexReader(std::move(range_bitmap))); +} + +RangeBitmapFileIndexReader::RangeBitmapFileIndexReader(std::unique_ptr range_bitmap) + : range_bitmap_(std::move(range_bitmap)) {} + +Result> RangeBitmapFileIndexReader::VisitEqual( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->Eq(literal); + }); +} + +Result> RangeBitmapFileIndexReader::VisitNotEqual( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->Neq(literal); + }); +} + +Result> RangeBitmapFileIndexReader::VisitIn( + const std::vector& literals) { + return std::make_shared( + [self = shared_from_this(), literals]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->In(literals); + }); +} + +Result> RangeBitmapFileIndexReader::VisitNotIn( + const std::vector& literals) { + return std::make_shared( + [self = shared_from_this(), literals]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->NotIn(literals); + }); +} + +Result> RangeBitmapFileIndexReader::VisitIsNull() { + return std::make_shared( + [self = 
shared_from_this()]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->IsNull(); + }); +} + +Result> RangeBitmapFileIndexReader::VisitIsNotNull() { + return std::make_shared( + [self = shared_from_this()]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->IsNotNull(); + }); +} + +Result> RangeBitmapFileIndexReader::VisitGreaterThan( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->Gt(literal); + }); +} + +Result> RangeBitmapFileIndexReader::VisitLessThan( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->Lt(literal); + }); +} + +Result> RangeBitmapFileIndexReader::VisitGreaterOrEqual( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->Gte(literal); + }); +} + +Result> RangeBitmapFileIndexReader::VisitLessOrEqual( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->Lte(literal); + }); +} + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h new file mode 100644 index 00000000..21c8d412 --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h @@ -0,0 +1,107 @@ +/* + * Copyright 2024-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "paimon/common/file_index/bitmap/bitmap_file_index.h" +#include "paimon/common/file_index/rangebitmap/range_bitmap.h" +#include "paimon/file_index/file_index_reader.h" +#include "paimon/file_index/file_index_writer.h" +#include "paimon/file_index/file_indexer.h" +#include "paimon/predicate/literal.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon { + +class RangeBitmapFileIndexWriter; +class RangeBitmapFileIndexReader; + +class RangeBitmapFileIndex final : public FileIndexer { + public: + explicit RangeBitmapFileIndex(const std::map& options); + + ~RangeBitmapFileIndex() override = default; + + Result> CreateReader( + ArrowSchema* arrow_schema, int32_t start, int32_t length, + const std::shared_ptr& input_stream, + const std::shared_ptr& pool) const override; + + Result> CreateWriter( + ArrowSchema* arrow_schema, const std::shared_ptr& pool) const override; + + static constexpr char CHUNK_SIZE[] = "chunk-size"; + + private: + std::map options_; +}; + +class RangeBitmapFileIndexWriter final : public FileIndexWriter { + public: + static Result> Create( + const std::shared_ptr& arrow_schema, const std::string& field_name, + const std::map& options, const std::shared_ptr& pool); + + Status AddBatch(ArrowArray* batch) override; + Result> SerializedBytes() const override; + + RangeBitmapFileIndexWriter(const std::shared_ptr& arrow_type, + FieldType field_type, + const std::map& options, + const std::shared_ptr& pool, int64_t chunk_size, + const std::shared_ptr& 
key_factory, + std::unique_ptr appender); + std::shared_ptr arrow_type_; + FieldType field_type_; + std::map options_; + std::shared_ptr pool_; + std::shared_ptr key_factory_; + int64_t chunk_size_; + std::unique_ptr appender_; +}; + +class RangeBitmapFileIndexReader final + : public FileIndexReader, + public std::enable_shared_from_this { + public: + static Result> Create( + const std::shared_ptr& arrow_type, int32_t start, int32_t length, + const std::shared_ptr& input_stream, const std::shared_ptr& pool); + + private: + explicit RangeBitmapFileIndexReader(std::unique_ptr range_bitmap); + + Result> VisitEqual(const Literal& literal) override; + Result> VisitNotEqual(const Literal& literal) override; + Result> VisitIn(const std::vector& literals) override; + Result> VisitNotIn( + const std::vector& literals) override; + Result> VisitIsNull() override; + Result> VisitIsNotNull() override; + Result> VisitGreaterThan(const Literal& literal) override; + Result> VisitLessThan(const Literal& literal) override; + Result> VisitGreaterOrEqual(const Literal& literal) override; + Result> VisitLessOrEqual(const Literal& literal) override; + + std::unique_ptr range_bitmap_; +}; + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.cpp new file mode 100644 index 00000000..6a10bc24 --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.cpp @@ -0,0 +1,32 @@ +/* + * Copyright 2024-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h" + +#include "paimon/common/file_index/rangebitmap/range_bitmap_file_index.h" + +namespace paimon { + +RangeBitmapFileIndexFactory::~RangeBitmapFileIndexFactory() = default; + +Result> RangeBitmapFileIndexFactory::Create( + const std::map& options) const { + return std::make_unique(options); +} + +REGISTER_PAIMON_FACTORY(RangeBitmapFileIndexFactory); + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h new file mode 100644 index 00000000..402a927e --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h @@ -0,0 +1,41 @@ +/* + * Copyright 2024-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include "paimon/file_index/file_indexer.h" +#include "paimon/file_index/file_indexer_factory.h" +#include "paimon/result.h" + +namespace paimon { + +class PAIMON_EXPORT RangeBitmapFileIndexFactory final : public FileIndexerFactory { + public: + const char* Identifier() const override { + return "range-bitmap"; + } + + ~RangeBitmapFileIndexFactory() override; + + Result> Create( + const std::map& options) const override; +}; + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_test.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_test.cpp new file mode 100644 index 00000000..d84ebd8f --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_test.cpp @@ -0,0 +1,482 @@ +/* + * Copyright 2024-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "paimon/common/file_index/rangebitmap/range_bitmap_file_index.h" + +#include + +#include + +#include "arrow/api.h" +#include "arrow/c/bridge.h" +#include "paimon/file_index/bitmap_index_result.h" +#include "paimon/file_index/file_index_format.h" +#include "paimon/file_index/file_indexer_factory.h" +#include "paimon/fs/file_system.h" +#include "paimon/fs/local/local_file_system.h" +#include "paimon/io/byte_array_input_stream.h" +#include "paimon/memory/memory_pool.h" +#include "paimon/predicate/literal.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { + +class RangeBitmapFileIndexTest : public ::testing::Test { + public: + void SetUp() override { + pool_ = GetDefaultPool(); + fs_ = std::make_shared(); + } + + void TearDown() override { + index_buffer_.reset(); + pool_.reset(); + fs_.reset(); + } + + static void CheckResult(const std::shared_ptr& result, + const std::vector& expected) { + const auto typed_result = std::dynamic_pointer_cast(result); + ASSERT_TRUE(typed_result); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap32* bitmap, typed_result->GetBitmap()); + ASSERT_TRUE(bitmap); + const RoaringBitmap32 expected_bitmap = RoaringBitmap32::From(expected); + ASSERT_EQ(*bitmap, expected_bitmap) + << "result=" << bitmap->ToString() << ", expected=" << expected_bitmap.ToString(); + } + + protected: + std::shared_ptr pool_; + + private: + std::shared_ptr fs_; + std::shared_ptr index_buffer_; +}; + +// Helper function to create writer, serialize, and create reader +template +Result> CreateReaderForTest( + RangeBitmapFileIndexTest* test, const std::shared_ptr& arrow_type, + const std::vector& test_data, PAIMON_UNIQUE_PTR* serialized_bytes_out) { + static const std::map kEmptyOptions; + return CreateReaderForTest(test, arrow_type, test_data, kEmptyOptions, + serialized_bytes_out); +} + +// Overload with options to exercise writer configuration such as chunk size. 
+// Round-trip helper: builds an Arrow array from `test_data`, exports it with
+// arrow::ExportArray, feeds it to a RangeBitmapFileIndexWriter created for the single field
+// "test_field", and opens a RangeBitmapFileIndexReader over the serialized bytes.
+// The serialized buffer is handed back through `serialized_bytes_out`; the reader's input
+// stream is constructed from that buffer's data()/size().
+// NOTE(review): presumably the stream does not copy the bytes, so the buffer should outlive
+// the reader — confirm. Every test below keeps it in a local for that purpose.
+// Returns Status::Invalid when an Arrow step fails or the writer produces no bytes; errors
+// from the writer/reader factories are propagated unchanged.
+template
+Result> CreateReaderForTest(
+    RangeBitmapFileIndexTest* test, const std::shared_ptr& arrow_type,
+    const std::vector& test_data, const std::map& options,
+    PAIMON_UNIQUE_PTR* serialized_bytes_out) {
+    // Create Arrow array from test data
+    auto builder = std::make_shared();
+    auto status = builder->AppendValues(test_data);
+    if (!status.ok()) {
+        return Status::Invalid("Failed to append values: " + status.ToString());
+    }
+    std::shared_ptr arrow_array;
+    status = builder->Finish(&arrow_array);
+    if (!status.ok()) {
+        return Status::Invalid("Failed to finish builder: " + status.ToString());
+    }
+    const auto c_array = std::make_unique<::ArrowArray>();
+    status = arrow::ExportArray(*arrow_array, c_array.get());
+    if (!status.ok()) {
+        return Status::Invalid("Failed to export array: " + status.ToString());
+    }
+    // Create schema for the field
+    const auto schema = arrow::schema({arrow::field("test_field", arrow_type)});
+    // Create writer
+    PAIMON_ASSIGN_OR_RAISE(const auto writer, RangeBitmapFileIndexWriter::Create(
+                                                  schema, "test_field", options, test->pool_));
+    // Add the batch
+    PAIMON_RETURN_NOT_OK(writer->AddBatch(c_array.get()));
+    // Get serialized payload
+    PAIMON_ASSIGN_OR_RAISE(auto serialized_bytes, writer->SerializedBytes());
+    if (!serialized_bytes || serialized_bytes->size() == 0) {
+        return Status::Invalid("Serialized bytes is empty");
+    }
+    *serialized_bytes_out = std::move(serialized_bytes);
+    const auto input_stream = std::make_shared(
+        (*serialized_bytes_out)->data(), (*serialized_bytes_out)->size());
+    // The index occupies the whole buffer: start offset 0, length = buffer size.
+    PAIMON_ASSIGN_OR_RAISE(auto reader,
+                           RangeBitmapFileIndexReader::Create(
+                               arrow_type, 0, static_cast((*serialized_bytes_out)->size()),
+                               input_stream, test->pool_));
+    return reader;
+}
+
+TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexMultiChunk) {
+    // Use many distinct values and a very small chunk size to force multiple
+    // dictionary chunks when writing the range bitmap index.
+    std::vector test_data;
+    test_data.reserve(100);
+    for (int32_t i = 0; i < 100; ++i) {
+        test_data.push_back(i);
+    }
+
+    const auto& arrow_type = arrow::int32();
+    std::map options;
+    // Configure a very small chunk size in bytes so that the dictionary must
+    // spill into multiple chunks.
+    options[RangeBitmapFileIndex::CHUNK_SIZE] = "99b";
+
+    PAIMON_UNIQUE_PTR serialized_bytes;
+    ASSERT_OK_AND_ASSIGN(auto reader,
+                         (CreateReaderForTest(
+                             this, arrow_type, test_data, options, &serialized_bytes)));
+    // Row i holds value i, so every expected position equals the matching value.
+    ASSERT_OK_AND_ASSIGN(auto eq_0_result, reader->VisitEqual(Literal(static_cast(0))));
+    CheckResult(eq_0_result, {0});
+
+    ASSERT_OK_AND_ASSIGN(auto eq_50_result, reader->VisitEqual(Literal(static_cast(50))));
+    CheckResult(eq_50_result, {50});
+    ASSERT_OK_AND_ASSIGN(auto eq_51_result, reader->VisitEqual(Literal(static_cast(51))));
+    CheckResult(eq_51_result, {51});
+    ASSERT_OK_AND_ASSIGN(auto eq_99_result, reader->VisitEqual(Literal(static_cast(99))));
+    CheckResult(eq_99_result, {99});
+
+    ASSERT_OK_AND_ASSIGN(auto gt_49_result,
+                         reader->VisitGreaterThan(Literal(static_cast(49))));
+    // Positions 50..99
+    std::vector expected_gt_49;
+    expected_gt_49.reserve(50);
+    for (int32_t i = 50; i < 100; ++i) {
+        expected_gt_49.push_back(i);
+    }
+    CheckResult(gt_49_result, expected_gt_49);
+
+    ASSERT_OK_AND_ASSIGN(auto lt_10_result,
+                         reader->VisitLessThan(Literal(static_cast(10))));
+    // Positions 0..9
+    std::vector expected_lt_10;
+    expected_lt_10.reserve(10);
+    for (int32_t i = 0; i < 10; ++i) {
+        expected_lt_10.push_back(i);
+    }
+    CheckResult(lt_10_result, expected_lt_10);
+
+    // is_not_null should cover all rows.
+    std::vector all_positions(100);
+    for (int32_t i = 0; i < 100; ++i) {
+        all_positions[i] = i;
+    }
+    ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull());
+    CheckResult(is_not_null_result, all_positions);
+}
+
+// Round-trips an int64 column and checks equality, range, IN / NOT IN and null predicates.
+TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexBigInt) {
+    std::vector test_data = {10, 20, 10, 30, 20, 40, 50};
+    const auto& arrow_type = arrow::int64();
+    PAIMON_UNIQUE_PTR serialized_bytes;
+    ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest(
+                                          this, arrow_type, test_data, &serialized_bytes)));
+
+    // Test equality queries
+    ASSERT_OK_AND_ASSIGN(auto eq_10_result, reader->VisitEqual(Literal(static_cast(10))));
+    CheckResult(eq_10_result, {0, 2});  // positions 0 and 2 have value 10
+    ASSERT_OK_AND_ASSIGN(auto eq_20_result, reader->VisitEqual(Literal(static_cast(20))));
+    CheckResult(eq_20_result, {1, 4});  // positions 1 and 4 have value 20
+    ASSERT_OK_AND_ASSIGN(auto eq_30_result, reader->VisitEqual(Literal(static_cast(30))));
+    CheckResult(eq_30_result, {3});  // position 3 has value 30
+    ASSERT_OK_AND_ASSIGN(auto eq_40_result, reader->VisitEqual(Literal(static_cast(40))));
+    CheckResult(eq_40_result, {5});  // position 5 has value 40
+    ASSERT_OK_AND_ASSIGN(auto eq_50_result, reader->VisitEqual(Literal(static_cast(50))));
+    CheckResult(eq_50_result, {6});  // position 6 has value 50
+
+    // Test range queries
+    ASSERT_OK_AND_ASSIGN(auto gt_25_result,
+                         reader->VisitGreaterThan(Literal(static_cast(25))));
+    CheckResult(gt_25_result, {3, 5, 6});  // values > 25: 30, 40, 50
+
+    ASSERT_OK_AND_ASSIGN(auto lt_35_result,
+                         reader->VisitLessThan(Literal(static_cast(35))));
+    CheckResult(lt_35_result, {0, 1, 2, 3, 4});  // values < 35: 10, 20, 10, 30, 20
+    ASSERT_OK_AND_ASSIGN(auto gte_20_result,
+                         reader->VisitGreaterOrEqual(Literal(static_cast(20))));
+    CheckResult(gte_20_result, {1, 3, 4, 5, 6});  // values >= 20: 20, 30, 20, 40, 50
+    ASSERT_OK_AND_ASSIGN(auto lte_40_result,
+                         reader->VisitLessOrEqual(Literal(static_cast(40))));
+    CheckResult(lte_40_result, {0, 1, 2, 3, 4, 5});  // values <= 40: 10, 20, 10, 30, 20, 40
+
+    // Test IN queries
+    std::vector in_values = {Literal(static_cast(10)),
+                             Literal(static_cast(30))};
+    ASSERT_OK_AND_ASSIGN(auto in_result, reader->VisitIn(in_values));
+    CheckResult(in_result, {0, 2, 3});  // positions with values 10 or 30
+    ASSERT_OK_AND_ASSIGN(auto not_in_result, reader->VisitNotIn(in_values));
+    CheckResult(not_in_result, {1, 4, 5, 6});  // positions with values NOT 10 or 30
+    ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull());
+    CheckResult(is_null_result, {});  // no null values
+    std::vector all_positions = {0, 1, 2, 3, 4, 5, 6};
+    ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull());
+    CheckResult(is_not_null_result, all_positions);  // all positions are not null
+}
+
+// Round-trips an int32 column; besides the basic predicates this also covers look-ups of
+// values that are absent from the data, NotEqual, and NotIn.
+TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexInt) {
+    std::vector test_data = {10, 20, 10, 30, 20, 40, 50};
+    const auto& arrow_type = arrow::int32();
+    PAIMON_UNIQUE_PTR serialized_bytes;
+    ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest(
+                                          this, arrow_type, test_data, &serialized_bytes)));
+
+    // Test equality queries
+    ASSERT_OK_AND_ASSIGN(auto eq_10_result, reader->VisitEqual(Literal(static_cast(10))));
+    CheckResult(eq_10_result, {0, 2});
+    ASSERT_OK_AND_ASSIGN(auto eq_20_result, reader->VisitEqual(Literal(static_cast(20))));
+    CheckResult(eq_20_result, {1, 4});
+    ASSERT_OK_AND_ASSIGN(auto eq_30_result, reader->VisitEqual(Literal(static_cast(30))));
+    CheckResult(eq_30_result, {3});
+
+    // Test range queries
+    ASSERT_OK_AND_ASSIGN(auto gt_25_result,
+                         reader->VisitGreaterThan(Literal(static_cast(25))));
+    CheckResult(gt_25_result, {3, 5, 6});  // values > 25: 30, 40, 50
+    ASSERT_OK_AND_ASSIGN(auto lt_35_result,
+                         reader->VisitLessThan(Literal(static_cast(35))));
+    CheckResult(lt_35_result, {0, 1, 2, 3, 4});  // values < 35
+    ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull());
+    CheckResult(is_null_result, {});
+    std::vector all_positions = {0, 1, 2, 3, 4, 5, 6};
+    ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull());
+    CheckResult(is_not_null_result, all_positions);
+    ASSERT_OK_AND_ASSIGN(auto gte_20_result,
+                         reader->VisitGreaterOrEqual(Literal(static_cast(20))));
+    CheckResult(gte_20_result, {1, 3, 4, 5, 6});
+    // The terminating ';' was missing on this statement; every other use in the file has one.
+    ASSERT_OK_AND_ASSIGN(auto lte_40_result,
+                         reader->VisitLessOrEqual(Literal(static_cast(40))));
+    CheckResult(lte_40_result, {0, 1, 2, 3, 4, 5});
+
+    // Test empty result cases for INT values that don't exist
+    ASSERT_OK_AND_ASSIGN(auto eq_nonexistent_int_result,
+                         reader->VisitEqual(Literal(static_cast(25))));
+    CheckResult(eq_nonexistent_int_result, {});  // 25 doesn't exist in data {10,20,30,40,50}
+
+    ASSERT_OK_AND_ASSIGN(auto eq_out_of_range_high_int_result,
+                         reader->VisitEqual(Literal(static_cast(100))));
+    CheckResult(eq_out_of_range_high_int_result, {});  // Value above maximum (50)
+
+    ASSERT_OK_AND_ASSIGN(auto eq_out_of_range_low_int_result,
+                         reader->VisitEqual(Literal(static_cast(5))));
+    CheckResult(eq_out_of_range_low_int_result, {});  // Value below minimum (10)
+
+    // Test NotEqual operations
+    ASSERT_OK_AND_ASSIGN(auto ne_10_result,
+                         reader->VisitNotEqual(Literal(static_cast(10))));
+    CheckResult(ne_10_result, {1, 3, 4, 5, 6});  // All positions except {0, 2} where 10 appears
+
+    ASSERT_OK_AND_ASSIGN(auto ne_nonexistent_result,
+                         reader->VisitNotEqual(Literal(static_cast(99))));
+    CheckResult(ne_nonexistent_result, {0, 1, 2, 3, 4, 5, 6});  // All positions (non-empty result)
+
+    // Test NotIn operations
+    ASSERT_OK_AND_ASSIGN(auto not_in_single_result,
+                         reader->VisitNotIn({Literal(static_cast(10))}));
+    CheckResult(not_in_single_result, {1, 3, 4, 5, 6});  // All positions except where 10 appears
+
+    ASSERT_OK_AND_ASSIGN(
+        auto not_in_multiple_result,
+        reader->VisitNotIn({Literal(static_cast(10)), Literal(static_cast(20))}));
+    CheckResult(not_in_multiple_result, {3, 5, 6});  // Positions not containing 10 or 20
+
+    ASSERT_OK_AND_ASSIGN(auto not_in_nonexistent_result,
+                         reader->VisitNotIn({Literal(static_cast(99))}));
+    CheckResult(not_in_nonexistent_result,
+                {0, 1, 2, 3, 4, 5, 6});  // All positions (non-empty result)
+
+    // Test NotIn with empty result - excluding every value present in the data leaves no rows
+    std::vector all_values = {
+        Literal(static_cast(10)), Literal(static_cast(20)),
+        Literal(static_cast(30)), Literal(static_cast(40)),
+        Literal(static_cast(50))};
+    ASSERT_OK_AND_ASSIGN(auto not_in_all_result, reader->VisitNotIn(all_values));
+    CheckResult(not_in_all_result,
+                {});  // Empty result - no positions left when excluding all existing values
+}
+
+// Round-trips an int16 column and checks equality, range and null predicates.
+TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexSmallInt) {
+    std::vector test_data = {10, 20, 10, 30, 20, 40, 50};
+    const auto& arrow_type = arrow::int16();
+    PAIMON_UNIQUE_PTR serialized_bytes;
+    ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest(
+                                          this, arrow_type, test_data, &serialized_bytes)));
+    ASSERT_OK_AND_ASSIGN(auto eq_10_result, reader->VisitEqual(Literal(static_cast(10))));
+    CheckResult(eq_10_result, {0, 2});
+    ASSERT_OK_AND_ASSIGN(auto eq_20_result, reader->VisitEqual(Literal(static_cast(20))));
+    CheckResult(eq_20_result, {1, 4});
+    ASSERT_OK_AND_ASSIGN(auto eq_30_result, reader->VisitEqual(Literal(static_cast(30))));
+    CheckResult(eq_30_result, {3});
+    ASSERT_OK_AND_ASSIGN(auto gt_25_result,
+                         reader->VisitGreaterThan(Literal(static_cast(25))));
+    CheckResult(gt_25_result, {3, 5, 6});  // values > 25: 30, 40, 50
+    ASSERT_OK_AND_ASSIGN(auto lt_35_result,
+                         reader->VisitLessThan(Literal(static_cast(35))));
+    CheckResult(lt_35_result, {0, 1, 2, 3, 4});  // values < 35
+    ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull());
+    CheckResult(is_null_result, {});
+    std::vector all_positions = {0, 1, 2, 3, 4, 5, 6};
+    ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull());
+    CheckResult(is_not_null_result, all_positions);
+}
+
+// Round-trips an int8 column and checks equality, range and null predicates.
+TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexTinyInt) {
+    std::vector test_data = {10, 20, 10, 30, 20, 40, 50};
+    const auto& arrow_type = arrow::int8();
+    PAIMON_UNIQUE_PTR serialized_bytes;
+    ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest(
+                                          this, arrow_type, test_data, &serialized_bytes)));
+    ASSERT_OK_AND_ASSIGN(auto eq_10_result, reader->VisitEqual(Literal(static_cast(10))));
+    CheckResult(eq_10_result, {0, 2});
+    ASSERT_OK_AND_ASSIGN(auto eq_20_result, reader->VisitEqual(Literal(static_cast(20))));
+    CheckResult(eq_20_result, {1, 4});
+    ASSERT_OK_AND_ASSIGN(auto eq_30_result, reader->VisitEqual(Literal(static_cast(30))));
+    CheckResult(eq_30_result, {3});
+    ASSERT_OK_AND_ASSIGN(auto gt_25_result,
+                         reader->VisitGreaterThan(Literal(static_cast(25))));
+    CheckResult(gt_25_result, {3, 5, 6});  // values > 25: 30, 40, 50
+    ASSERT_OK_AND_ASSIGN(auto lt_35_result,
+                         reader->VisitLessThan(Literal(static_cast(35))));
+    CheckResult(lt_35_result, {0, 1, 2, 3, 4});  // values < 35
+    ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull());
+    CheckResult(is_null_result, {});
+    std::vector all_positions = {0, 1, 2, 3, 4, 5, 6};
+    ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull());
+    CheckResult(is_not_null_result, all_positions);
+}
+
+// Round-trips a boolean column and checks equality against true/false plus null predicates.
+TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexBoolean) {
+    std::vector test_data = {true, false, true, true, false, true, false};
+    const auto& arrow_type = arrow::boolean();
+    PAIMON_UNIQUE_PTR serialized_bytes;
+    ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest(
+                                          this, arrow_type, test_data, &serialized_bytes)));
+    ASSERT_OK_AND_ASSIGN(auto eq_true_result, reader->VisitEqual(Literal(true)));
+    CheckResult(eq_true_result, {0, 2, 3, 5});  // positions with value true
+    ASSERT_OK_AND_ASSIGN(auto eq_false_result, reader->VisitEqual(Literal(false)));
+    CheckResult(eq_false_result, {1, 4, 6});  // positions with value false
+    ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull());
+    CheckResult(is_null_result, {});
+    std::vector all_positions = {0, 1, 2, 3, 4, 5, 6};
+    ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull());
+    CheckResult(is_not_null_result, all_positions);
+}
+
+// Round-trips a float32 column; also covers equality look-ups of values absent from the data.
+TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexFloat) {
+    std::vector test_data = {10.5f, 20.3f, 10.5f, 30.7f, 20.3f, 40.1f, 50.9f};
+    const auto& arrow_type = arrow::float32();
+    PAIMON_UNIQUE_PTR serialized_bytes;
+    ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest(
+                                          this, arrow_type, test_data, &serialized_bytes)));
+    ASSERT_OK_AND_ASSIGN(auto eq_10_5_result, reader->VisitEqual(Literal(10.5f)));
+    CheckResult(eq_10_5_result, {0, 2});  // positions with value 10.5
+    ASSERT_OK_AND_ASSIGN(auto eq_20_3_result, reader->VisitEqual(Literal(20.3f)));
+    CheckResult(eq_20_3_result, {1, 4});  // positions with value 20.3
+    ASSERT_OK_AND_ASSIGN(auto eq_30_7_result, reader->VisitEqual(Literal(30.7f)));
+    CheckResult(eq_30_7_result, {3});  // position with value 30.7
+    ASSERT_OK_AND_ASSIGN(auto gt_24_9_result, reader->VisitGreaterThan(Literal(24.9f)));
+    CheckResult(gt_24_9_result, {3, 5, 6});  // values > 24.9: 30.7, 40.1, 50.9
+    ASSERT_OK_AND_ASSIGN(auto lt_35_result, reader->VisitLessThan(Literal(35.0f)));
+    CheckResult(lt_35_result, {0, 1, 2, 3, 4});  // values < 35.0
+
+    // Test empty result cases for float values that don't exist
+    ASSERT_OK_AND_ASSIGN(auto eq_nonexistent_float_result, reader->VisitEqual(Literal(25.0f)));
+    CheckResult(eq_nonexistent_float_result, {});  // 25.0 doesn't exist in data
+
+    ASSERT_OK_AND_ASSIGN(auto eq_out_of_range_high_result, reader->VisitEqual(Literal(100.0f)));
+    CheckResult(eq_out_of_range_high_result, {});  // Value above maximum
+
+    ASSERT_OK_AND_ASSIGN(auto eq_out_of_range_low_result, reader->VisitEqual(Literal(5.0f)));
+    CheckResult(eq_out_of_range_low_result, {});  // Value below minimum
+
+    ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull());
+    CheckResult(is_null_result, {});
+    std::vector all_positions = {0, 1, 2, 3, 4, 5, 6};
+    ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull());
+    CheckResult(is_not_null_result, all_positions);
+}
+
+// Round-trips a float64 column and checks equality, range and null predicates.
+TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexDouble) {
+    std::vector test_data = {10.5, 20.3, 10.5, 30.7, 20.3, 40.1, 50.9};
+    const auto& arrow_type = arrow::float64();
+    PAIMON_UNIQUE_PTR serialized_bytes;
+    ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest(
+                                          this, arrow_type, test_data, &serialized_bytes)));
+    ASSERT_OK_AND_ASSIGN(auto eq_10_5_result, reader->VisitEqual(Literal(10.5)));
+    CheckResult(eq_10_5_result, {0, 2});  // positions with value 10.5
+    ASSERT_OK_AND_ASSIGN(auto eq_20_3_result, reader->VisitEqual(Literal(20.3)));
+    CheckResult(eq_20_3_result, {1, 4});  // positions with value 20.3
+    ASSERT_OK_AND_ASSIGN(auto eq_30_7_result, reader->VisitEqual(Literal(30.7)));
+    CheckResult(eq_30_7_result, {3});  // position with value 30.7
+    ASSERT_OK_AND_ASSIGN(auto gt_24_9_result, reader->VisitGreaterThan(Literal(24.9)));
+    CheckResult(gt_24_9_result, {3, 5, 6});  // values > 24.9: 30.7, 40.1, 50.9
+    ASSERT_OK_AND_ASSIGN(auto lt_35_result, reader->VisitLessThan(Literal(35.0)));
+    CheckResult(lt_35_result, {0, 1, 2, 3, 4});  // values < 35.0
+    ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull());
+    CheckResult(is_null_result, {});
+    std::vector all_positions = {0, 1, 2, 3, 4, 5, 6};
+    ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull());
+    CheckResult(is_not_null_result, all_positions);
+}
+
+// Round-trips a date32 column. Literals are built with FieldType::DATE and the raw integer
+// value stored in the column.
+TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexDate) {
+    std::vector test_data = {42432, 24649, 42432, 38001, 24649, 50000, 12000};
+    const auto& arrow_type = arrow::date32();
+    PAIMON_UNIQUE_PTR serialized_bytes;
+    ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest(
+                                          this, arrow_type, test_data, &serialized_bytes)));
+    ASSERT_OK_AND_ASSIGN(auto eq_42432_result,
+                         reader->VisitEqual(Literal(FieldType::DATE, 42432)));
+    CheckResult(eq_42432_result, {0, 2});
+    ASSERT_OK_AND_ASSIGN(auto eq_24649_result, reader->VisitEqual(Literal(FieldType::DATE, 24649)));
+    CheckResult(eq_24649_result, {1, 4});
+    ASSERT_OK_AND_ASSIGN(auto eq_38001_result, reader->VisitEqual(Literal(FieldType::DATE, 38001)));
+    CheckResult(eq_38001_result, {3});
+    // Despite the variable name this is a greater-or-equal query.
+    ASSERT_OK_AND_ASSIGN(auto gt_result,
+                         reader->VisitGreaterOrEqual(Literal(FieldType::DATE, 30000)));
+    CheckResult(gt_result, {0, 2, 3, 5});  // values >= 30000: 42432, 42432, 38001, 50000
+
+    ASSERT_OK_AND_ASSIGN(auto lt_result, reader->VisitLessThan(Literal(FieldType::DATE, 40000)));
+    CheckResult(lt_result, {1, 3, 4, 6});  // values < 40000: 24649, 38001, 24649, 12000
+
+    // Test empty result cases - values that don't exist in the data
+    // 47432 lies between the existing values 42432 and 50000.
+    ASSERT_OK_AND_ASSIGN(auto eq_nonexistent_low_result,
+                         reader->VisitEqual(Literal(FieldType::DATE, 47432)));
+    CheckResult(eq_nonexistent_low_result, {});
+
+    ASSERT_OK_AND_ASSIGN(auto eq_nonexistent_mid_result,
+                         reader->VisitEqual(Literal(FieldType::DATE, 30000)));
+    CheckResult(eq_nonexistent_mid_result, {});  // Value in middle range but doesn't exist
+
+    ASSERT_OK_AND_ASSIGN(auto eq_nonexistent_high_result,
+                         reader->VisitEqual(Literal(FieldType::DATE, 60000)));
+    CheckResult(eq_nonexistent_high_result, {});  // Value above maximum (50000)
+
+    // Test range queries that should return empty results
+    ASSERT_OK_AND_ASSIGN(auto gt_all_result,
+                         reader->VisitGreaterOrEqual(Literal(FieldType::DATE, 60000)));
+    CheckResult(gt_all_result, {});  // Bound above the maximum (50000) should return empty
+
+    ASSERT_OK_AND_ASSIGN(auto lt_all_result,
+                         reader->VisitLessThan(Literal(FieldType::DATE, 10000)));
+    CheckResult(lt_all_result, {});  // Bound below the minimum (12000) should return empty
+
+    ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull());
+    CheckResult(is_null_result, {});
+    std::vector all_positions = {0, 1, 2, 3, 4, 5, 6};
+    ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull());
+    CheckResult(is_not_null_result, all_positions);
+}
+
+}  // namespace paimon::test
diff --git a/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.cpp b/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.cpp
new file mode 100644
index 00000000..cb8e6bdd
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright 2024-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h" + +#include "fmt/format.h" +#include "paimon/common/utils/field_type_utils.h" +#include "paimon/memory/bytes.h" + +namespace paimon { + +Result> LiteralSerializationUtils::CreateValueWriter( + const FieldType field_type, const std::shared_ptr& output_stream) { + switch (field_type) { + case FieldType::BOOLEAN: + return std::function( + [output_stream](const Literal& literal) -> Status { + output_stream->WriteValue(literal.GetValue()); + return Status::OK(); + }); + case FieldType::TINYINT: + return std::function( + [output_stream](const Literal& literal) -> Status { + output_stream->WriteValue(literal.GetValue()); + return Status::OK(); + }); + case FieldType::SMALLINT: + return std::function( + [output_stream](const Literal& literal) -> Status { + output_stream->WriteValue(literal.GetValue()); + return Status::OK(); + }); + case FieldType::DATE: + case FieldType::INT: + return std::function( + [output_stream](const Literal& literal) -> Status { + output_stream->WriteValue(literal.GetValue()); + return Status::OK(); + }); + case FieldType::BIGINT: + return std::function( + [output_stream](const Literal& literal) -> Status { + output_stream->WriteValue(literal.GetValue()); + return Status::OK(); + }); + case FieldType::FLOAT: + return std::function( + [output_stream](const Literal& literal) -> Status { + output_stream->WriteValue(literal.GetValue()); + return Status::OK(); + }); + case FieldType::DOUBLE: + return std::function( + [output_stream](const Literal& literal) -> Status { + output_stream->WriteValue(literal.GetValue()); + return Status::OK(); + }); + case FieldType::STRING: { + return std::function( + [output_stream](const Literal& literal) -> Status { + const auto value = literal.GetValue(); + output_stream->WriteValue(static_cast(value.size())); + output_stream->Write(value.data(), value.size()); + return Status::OK(); + }); + } + default: + return Status::Invalid( 
+ fmt::format("Unsupported field type for literal serialization: {}", + FieldTypeUtils::FieldTypeToString(field_type))); + } +} + +Result()>> LiteralSerializationUtils::CreateValueReader( + FieldType field_type, const std::shared_ptr& input_stream, MemoryPool* pool) { + switch (field_type) { + case FieldType::BOOLEAN: { + return std::function()>([input_stream]() -> Result { + PAIMON_ASSIGN_OR_RAISE(bool value, input_stream->ReadValue()); + return Literal(value); + }); + } + case FieldType::TINYINT: { + return std::function()>([input_stream]() -> Result { + PAIMON_ASSIGN_OR_RAISE(int8_t value, input_stream->ReadValue()); + return Literal(value); + }); + } + case FieldType::SMALLINT: { + return std::function()>([input_stream]() -> Result { + PAIMON_ASSIGN_OR_RAISE(int16_t value, input_stream->ReadValue()); + return Literal(value); + }); + } + case FieldType::DATE: { + return std::function()>([input_stream]() -> Result { + PAIMON_ASSIGN_OR_RAISE(int32_t value, input_stream->ReadValue()); + return Literal(FieldType::DATE, value); + }); + } + case FieldType::INT: { + return std::function()>([input_stream]() -> Result { + PAIMON_ASSIGN_OR_RAISE(int32_t value, input_stream->ReadValue()); + return Literal(value); + }); + } + case FieldType::BIGINT: { + return std::function()>([input_stream]() -> Result { + PAIMON_ASSIGN_OR_RAISE(int64_t value, input_stream->ReadValue()); + return Literal(value); + }); + } + case FieldType::FLOAT: { + return std::function()>([input_stream]() -> Result { + PAIMON_ASSIGN_OR_RAISE(float value, input_stream->ReadValue()); + return Literal(value); + }); + } + case FieldType::DOUBLE: { + return std::function()>([input_stream]() -> Result { + PAIMON_ASSIGN_OR_RAISE(double value, input_stream->ReadValue()); + return Literal(value); + }); + } + case FieldType::STRING: { + return std::function()>( + [input_stream, field_type, pool]() -> Result { + PAIMON_ASSIGN_OR_RAISE(int32_t length, input_stream->ReadValue()); + auto bytes = 
Bytes::AllocateBytes(length, pool); + PAIMON_RETURN_NOT_OK(input_stream->ReadBytes(bytes.get())); + return Literal(field_type, bytes->data(), bytes->size()); + }); + } + default: + return Status::Invalid( + fmt::format("Unsupported field type for literal deserialization: {}", + FieldTypeUtils::FieldTypeToString(field_type))); + } +} + +Result LiteralSerializationUtils::GetFixedFieldSize(const FieldType& field_type) { + switch (field_type) { + case FieldType::BOOLEAN: + case FieldType::TINYINT: + return sizeof(int8_t); + case FieldType::SMALLINT: + return sizeof(int16_t); + case FieldType::DATE: + case FieldType::INT: + return sizeof(int32_t); + case FieldType::BIGINT: + return sizeof(int64_t); + case FieldType::FLOAT: + return sizeof(float); + case FieldType::DOUBLE: + return sizeof(double); + default: + return Status::Invalid(fmt::format("Unsupported field type for GetFixedFieldSize: {}", + FieldTypeUtils::FieldTypeToString(field_type))); + } +} +Result LiteralSerializationUtils::GetSerializedSizeInBytes(const Literal& literal) { + switch (literal.GetType()) { + case FieldType::BOOLEAN: + case FieldType::TINYINT: + case FieldType::SMALLINT: + case FieldType::DATE: + case FieldType::INT: + case FieldType::BIGINT: + case FieldType::DOUBLE: + case FieldType::FLOAT: + return GetFixedFieldSize(literal.GetType()); + case FieldType::STRING: + return static_cast(sizeof(int32_t) + literal.GetValue().size()); + default: + return Status::Invalid( + fmt::format("Unsupported field type for GetSerializedSizeInBytes: {}", + FieldTypeUtils::FieldTypeToString(literal.GetType()))); + } +} + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h b/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h new file mode 100644 index 00000000..93f247db --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h @@ -0,0 +1,48 @@ +/* + * Copyright 2024-present Alibaba Inc. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+
+#include "paimon/common/io/memory_segment_output_stream.h"
+#include "paimon/io/data_input_stream.h"
+#include "paimon/predicate/literal.h"
+#include "paimon/result.h"
+#include "paimon/status.h"
+
+namespace paimon {
+
+/// Static-only helpers for writing Literal values to, and reading them from, binary
+/// streams. The class cannot be instantiated (constructor and destructor are deleted).
+class LiteralSerializationUtils {
+ public:
+    LiteralSerializationUtils() = delete;
+
+    ~LiteralSerializationUtils() = delete;
+
+    /// Creates a callable that reads one value of `field_type` from `input_stream` on each
+    /// call and returns it as a Literal.
+    /// @param pool handed to the buffer allocation done for STRING values; defaults to
+    ///             nullptr. NOTE(review): whether nullptr is acceptable for STRING depends
+    ///             on the allocator — confirm.
+    /// @return an error status when `field_type` is not supported.
+    static Result()>> CreateValueReader(
+        FieldType field_type, const std::shared_ptr& input_stream,
+        MemoryPool* pool = nullptr);
+
+    /// Creates a callable that writes one Literal of `field_type` to `output_stream`.
+    /// @return an error status when `field_type` is not supported.
+    static Result> CreateValueWriter(
+        FieldType field_type, const std::shared_ptr& output_stream);
+
+    /// Returns the serialized width in bytes of a fixed-width `field_type`; an error status
+    /// for types without a fixed width.
+    static Result GetFixedFieldSize(const FieldType& field_type);
+
+    /// Returns the number of bytes `literal` occupies once serialized.
+    static Result GetSerializedSizeInBytes(const Literal& literal);
+};
+
+}  // namespace paimon
diff --git a/src/paimon/common/io/data_input_stream.cpp b/src/paimon/common/io/data_input_stream.cpp
index 32df0254..44fa2f5c 100644
--- a/src/paimon/common/io/data_input_stream.cpp
+++ b/src/paimon/common/io/data_input_stream.cpp
@@ -122,4 +122,5 @@ template Result DataInputStream::ReadValue() const;
 template Result DataInputStream::ReadValue() const;
 template Result DataInputStream::ReadValue() const;
 template Result DataInputStream::ReadValue() const;
+template Result DataInputStream::ReadValue() const;
 }  // namespace paimon
diff --git a/test/inte/read_inte_with_index_test.cpp b/test/inte/read_inte_with_index_test.cpp
index 
bd26f187..957d62b9 100644 --- a/test/inte/read_inte_with_index_test.cpp +++ b/test/inte/read_inte_with_index_test.cpp @@ -45,6 +45,7 @@ #include "paimon/data/timestamp.h" #include "paimon/defs.h" #include "paimon/factories/factory_creator.h" +#include "paimon/fs/local/local_file_system.h" #include "paimon/memory/bytes.h" #include "paimon/memory/memory_pool.h" #include "paimon/metrics.h" @@ -372,6 +373,295 @@ class ReadInteWithIndexTest : public testing::Test, } } + void CheckResultForRangeBitmap(const std::string& path, + const std::shared_ptr& arrow_data_type, + const std::shared_ptr& split) const { + { + // test with no predicate - return all 8 rows + std::shared_ptr expected_array; + auto array_status = arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, + { + R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 3, 200, 2.2, 22.22, 19725, "row1"] +])", + R"([ +[0, 5, 300, 3.3, 33.33, 19727, "row2"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"] +])", + R"([ +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, null, null, null, null, null, null] +])", + R"([ +[0, null, null, null, null, null, "null_row"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + std::cout << array_status.message() << std::endl; + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, /*predicate=*/nullptr, expected_array); + } + { + // Test equal predicate: f0 = 17 -> row 0 + auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(17)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test less than predicate: f0 < 10 -> rows 1,2,3,4 (values 3,5,7,9) + auto predicate = PredicateBuilder::LessThan(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, 
Literal(10)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 3, 200, 2.2, 22.22, 19725, "row1"], +[0, 5, 300, 3.3, 33.33, 19727, "row2"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test greater than predicate: f0 > 5 -> rows 0,3,4,7 (values 17,7,9,10) + auto predicate = PredicateBuilder::GreaterThan(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(5)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test is null predicate on f0 -> rows 5, 6 + auto predicate = + PredicateBuilder::IsNull(/*field_index=*/0, /*field_name=*/"f0", FieldType::INT); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, null, null, null, null, null, null], +[0, null, null, null, null, null, "null_row"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test is not null predicate on f0 -> rows 0,1,2,3,4,7 + auto predicate = + PredicateBuilder::IsNotNull(/*field_index=*/0, /*field_name=*/"f0", FieldType::INT); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 3, 200, 2.2, 22.22, 19725, "row1"], +[0, 5, 300, 3.3, 33.33, 19727, "row2"], +[0, 7, 400, 4.4, 44.44, 19729, 
"row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test in predicate: f0 in (3, 7) -> rows 1, 3 + auto predicate = PredicateBuilder::In( + /*field_index=*/0, /*field_name=*/"f0", FieldType::INT, {Literal(3), Literal(7)}); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 3, 200, 2.2, 22.22, 19725, "row1"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test not in predicate: f0 not in (3, 7) -> rows 0,2,4,7 (excluding null rows 5,6) + auto predicate = PredicateBuilder::NotIn( + /*field_index=*/0, /*field_name=*/"f0", FieldType::INT, {Literal(3), Literal(7)}); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 5, 300, 3.3, 33.33, 19727, "row2"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] + ])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + + // Test f1 (BIGINT) predicates + { + // Test greater than predicate: f1 > 300 -> rows 3,4,7 (values 400,500,600) + auto predicate = PredicateBuilder::GreaterThan(/*field_index=*/1, /*field_name=*/"f1", + FieldType::BIGINT, Literal(300L)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + + // Test f2 
(FLOAT) predicates + { + // Test less than predicate: f2 < 4.0 -> rows 0,1,2 (values 1.1,2.2,3.3) + auto predicate = PredicateBuilder::LessThan(/*field_index=*/2, /*field_name=*/"f2", + FieldType::FLOAT, Literal(4.0f)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 3, 200, 2.2, 22.22, 19725, "row1"], +[0, 5, 300, 3.3, 33.33, 19727, "row2"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + // Test date type + { + // Test less or equal predicate: f4 <= 19725 -> row 1 (value 19725) + auto predicate = + PredicateBuilder::LessOrEqual(/*field_index=*/4, /*field_name=*/"f4", + FieldType::DATE, Literal(FieldType::DATE, 19725)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 3, 200, 2.2, 22.22, 19725, "row1"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + + // Test f3 (DOUBLE) predicates + { + // Test greater or equal predicate: f3 >= 44.44 -> rows 3,4,7 (values 44.44,55.55,66.66) + auto predicate = PredicateBuilder::GreaterOrEqual( + /*field_index=*/3, /*field_name=*/"f3", FieldType::DOUBLE, Literal(44.44)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + + // Test BETWEEN predicate on f1 (BIGINT) + { + // Test f1 BETWEEN 200 AND 500 -> rows 1,2,3,4 (values 200,300,400,500) + auto predicate = + PredicateBuilder::Between(/*field_index=*/1, /*field_name=*/"f1", FieldType::BIGINT, + 
Literal(200L), Literal(500L)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 3, 200, 2.2, 22.22, 19725, "row1"], +[0, 5, 300, 3.3, 33.33, 19727, "row2"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + + // Test IN predicate on f2 (FLOAT) + { + // Test f2 IN (1.1, 4.4, 6.6) -> rows 0,3,7 (values 1.1,4.4,6.6) + auto predicate = + PredicateBuilder::In(/*field_index=*/2, /*field_name=*/"f2", FieldType::FLOAT, + {Literal(1.1f), Literal(4.4f), Literal(6.6f)}); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test nested composite: (f0 = 3 OR f0 = 17) AND f1 < 200 + // (f0 = 3 OR f0 = 17): matches rows 0,1 + // f1 < 200: matches rows 0 (f1=100) + // Combined AND: matches rows 0 + auto predicate1 = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(3)); + auto predicate2 = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(17)); + ASSERT_OK_AND_ASSIGN(auto or_predicate, PredicateBuilder::Or({predicate1, predicate2})); + + auto predicate3 = PredicateBuilder::LessThan(/*field_index=*/1, /*field_name=*/"f1", + FieldType::BIGINT, Literal(200L)); + ASSERT_OK_AND_ASSIGN(auto and_predicate, + PredicateBuilder::And({or_predicate, predicate3})); + + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"] +])"}, + 
&expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, and_predicate, expected_array); + } + { + // Test AND predicate with mixed types: f0 > 5 AND f1 > 100 + // f0 > 5: matches rows 0,3,4,7 + // f1 > 100: matches rows 1,2,3,4,7 + // Combined AND: matches rows 3,4,7 + auto predicate1 = PredicateBuilder::GreaterThan(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(5)); + auto predicate2 = PredicateBuilder::GreaterThan(/*field_index=*/1, /*field_name=*/"f1", + FieldType::BIGINT, Literal(100L)); + ASSERT_OK_AND_ASSIGN(auto and_predicate, + PredicateBuilder::And({predicate1, predicate2})); + + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, and_predicate, expected_array); + } + } + void CheckResultForBsi(const std::string& path, const std::shared_ptr& arrow_data_type, const std::shared_ptr split) const { @@ -2072,6 +2362,95 @@ TEST_P(ReadInteWithIndexTest, TestWithIndexWithoutRegistered) { } } +TEST_P(ReadInteWithIndexTest, TestRangeBitmapIndex) { + auto [file_format, enable_prefetch] = GetParam(); + std::string path = + GetDataDir() + file_format + "/append_with_rangebitmap.db/append_with_rangebitmap/"; + std::string file_name; + if (file_format == "orc") { + file_name = "data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc"; + } else if (file_format == "parquet") { + file_name = "data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet"; + } + + std::vector read_fields = {SpecialFields::ValueKind(), + DataField(0, arrow::field("f0", arrow::int32())), + DataField(1, arrow::field("f1", arrow::int64())), + DataField(2, arrow::field("f2", arrow::float32())), + DataField(3, arrow::field("f3", arrow::float64())), + DataField(4, arrow::field("f4", 
arrow::date32())), + DataField(5, arrow::field("f5", arrow::utf8()))}; + std::shared_ptr arrow_data_type = + DataField::ConvertDataFieldsToArrowStructType(read_fields); + + auto data_file_meta = std::make_shared( + file_name, /*file_size=*/1288, + /*row_count=*/8, /*min_key=*/BinaryRow::EmptyRow(), + /*max_key=*/BinaryRow::EmptyRow(), /*key_stats=*/SimpleStats::EmptyStats(), + /*value_stats=*/SimpleStats::EmptyStats(), /*min_sequence_number=*/0, + /*max_sequence_number=*/7, /*schema_id=*/0, + /*level=*/0, + /*extra_files=*/ + std::vector>({file_name + ".index"}), + /*creation_time=*/Timestamp(0ll, 0), /*delete_row_count=*/0, + /*embedded_index=*/nullptr, FileSource::Append(), + /*value_stats_cols=*/std::nullopt, + /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); + + DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, + /*bucket_path=*/path + "bucket-0/", {data_file_meta}); + ASSERT_OK_AND_ASSIGN(auto split, + builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); + + // Run comprehensive range bitmap index tests + CheckResultForRangeBitmap(path, arrow_data_type, split); +} + +TEST_P(ReadInteWithIndexTest, TestRangeBitmapIndexMultiChunk) { + auto [file_format, enable_prefetch] = GetParam(); + std::string path = + GetDataDir() + file_format + + "/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/"; + std::string file_name; + if (file_format == "orc") { + file_name = "data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc"; + } else if (file_format == "parquet") { + file_name = "data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet"; + } + + std::vector read_fields = {SpecialFields::ValueKind(), + DataField(0, arrow::field("f0", arrow::int32())), + DataField(1, arrow::field("f1", arrow::int64())), + DataField(2, arrow::field("f2", arrow::float32())), + DataField(3, arrow::field("f3", arrow::float64())), + DataField(4, arrow::field("f4", arrow::date32())), + DataField(5, 
arrow::field("f5", arrow::utf8()))}; + std::shared_ptr arrow_data_type = + DataField::ConvertDataFieldsToArrowStructType(read_fields); + + auto data_file_meta = std::make_shared( + file_name, /*file_size=*/1413, + /*row_count=*/8, /*min_key=*/BinaryRow::EmptyRow(), + /*max_key=*/BinaryRow::EmptyRow(), /*key_stats=*/SimpleStats::EmptyStats(), + /*value_stats=*/SimpleStats::EmptyStats(), /*min_sequence_number=*/0, + /*max_sequence_number=*/7, /*schema_id=*/0, + /*level=*/0, + /*extra_files=*/ + std::vector>({file_name + ".index"}), + /*creation_time=*/Timestamp(0ll, 0), /*delete_row_count=*/0, + /*embedded_index=*/nullptr, FileSource::Append(), + /*value_stats_cols=*/std::nullopt, + /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); + + DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, + /*bucket_path=*/path + "bucket-0/", {data_file_meta}); + ASSERT_OK_AND_ASSIGN(auto split, + builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); + + // Run range bitmap index tests with multi-chunk test data + CheckResultForRangeBitmap(path, arrow_data_type, split); +} + TEST_P(ReadInteWithIndexTest, TestWithIOException) { auto [file_format, enable_prefetch] = GetParam(); std::string path = GetDataDir() + "/" + file_format + diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/README b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/README new file mode 100644 index 00000000..52eb2755 --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/README @@ -0,0 +1,16 @@ +f0:int (nullable), f1:long (nullable), f2:float (nullable), f3:double (nullable), f4:date (nullable), f5:string (nullable) +no partition key +no bucket key +bucket count: -1 +range-bitmap index: f0,f1,f2,f3,f4 + +Rows (snapshot-1): +Add: [17, 100L, 1.1f, 11.11, date('2024-01-17'), 'row0'] +Add: [3, 200L, 2.2f, 22.22, date('2024-01-03'), 'row1'] +Add: [5, 300L, 
3.3f, 33.33, date('2024-01-05'), 'row2'] +Add: [7, 400L, 4.4f, 44.44, date('2024-01-07'), 'row3'] +Add: [9, 500L, 5.5f, 55.55, date('2024-01-09'), 'row4'] +Add: [null, null, null, null, null, null] +Add: [null, null, null, null, null, 'null_row'] +Add: [10, 600L, 6.6f, 66.66, date('2024-01-10'), 'row7'] +NoCompact \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc new file mode 100644 index 0000000000000000000000000000000000000000..fe2d4db19ca7ffda1fe7e110c3c30baaa73cb10f GIT binary patch literal 1024 zcmeYdau#G@;9?VE;ot~hFa|Qkx!4&XK!}HfO^A(yO+ZN^Er8JyC?|!G;|I!da-3ik z*uW|ALLq=L8Yr#J#Rk=>$iXHh%?aczm^t&KJ zv`}*pj!n?BD?5yE&wRz%g&AcmPD7 zECWLv>nyWbQnTb{Da}%wr2#VSw<7}s&}{}lYzV|gKx_=eCNP%++2-5~3`{I;j4gK< z7&Nx(|5ezd%gj*J$}ec(`zlslOH$S zW=e91yTCRfF+7sVlaYaIAtM(!ZEyjD9FZC#1h@pzQa}VBmomXUl-n}ZVcG;;D*|?1D`V& zYUyyl%eI~}<<(yIjmFR0zTZ=Ms62I5#rsSq<6jO_AMBgPr#JKTLDtE(CUsog!rKnl z>Rz{;=yPMfhgw^JN79qq!5!zN8h#%NZEMk_&_%XqtQg8#@c%!OlXKK7lMK zvG5`6*;@JR_-N+f zJ7y?Lx7R;Ig@+){4d;?De&C?Z@jS0FVdci3@ z(XdKwbrWpn2BHw}i1~+PD6JeHd3G2(ImjPAw3x(r>(rU^4myVy)_+6 z%pvoqV+YpeS1>OYo8j8k^NV^9_*10JkP+s_nnyEKS!C`8fntSwA#9a4b<~)lnpoQT zyc~N_+qbLLn-Bh5DKo4`Hg%_GsEW3VGF8;3)wdoiq*(PfbsXQrFegMfiat_i2rnWc Yie?CV4}jeZv2`J~Ojl`5NB`TyA2(TMPyhe` literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-642b5c6e-a8d6-46d0-bd3b-686478c89f6b-0 b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-642b5c6e-a8d6-46d0-bd3b-686478c89f6b-0 new file mode 100644 index 0000000000000000000000000000000000000000..854ee21bfff8f949e412df3669b98889678e1559 GIT binary patch literal 2175 zcmeZI%3@>@ODrqO*DFrWNX<>$CtIylQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD 
zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3(&d-}QHmJM(W3h@j-~-QA$ZoODxSP zQL+N*tc|TjPK;nj!2JU;5^gm1Tmnf6@ep6awGRED0L5IeYhb9WpEEEuhx$6X1|iZI zW-0_GG2&GRJA1hLI>vhnLR!|}ip(=`GfG;mWO86TfZAZ3n!Xpl3a z)PZXUg)lzNNO=R83w;p54l@BBX1IbMpXtE(bPe(Y7SjQaAs(21gBGCpOmOoI3Jw8> zGE%yNxs7lThX;8ABZ*2L1C}bWwIbm9F*!daHCd<%m|XHw5{pt8ly1pHC^aU0`fJcy zcjBP$Ri+ip8e8@MGI+$YF&yI3G)(ZlEMXwgvZ!fGQoESagW9X%S2n8sll?pO%d!u% zZ#Ef=F1q_p)qKtT7j2j08kqxc2K|4p&wAqI(N7!0H|z?v5&O-y|E$oPWA=eN_!d?P zB<|qY$rB=`Y5r(|)5BeyzNc^0y%N8%a6!-l1s@9`w|~+$9$xF1Wj+|ozJLGyw`Je4 ztA+k8A(v*kPIMDAQ(QXP$-|8^?k3AYt;JgmmUHUztoAb4KU;u-XU&OX)kd2GLIwgx dVw^lpX^eRZCKFT*cxSNr7$@ZLr7)s<695o+)kXjS literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-0 b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-0 new file mode 100644 index 0000000000000000000000000000000000000000..4786f9eaf474fc77099c83a27d8d3cb2c80bf9ae GIT binary patch literal 1006 zcmbVLO-sWt7*_D?LBy*N@HToAVYj+QET*lj?fe)eWNo%p+H|CuFh~Ext}6Zl{Z$@4 zdGl&B=UUgHbGN2>KJq+oVs&mdxAw>_Pn6W})0n(%kAWBsD5wKY{e%ag1{jGe>vTYT z`7_}?h2-CTstiaNCrp`&^(dp92rB8_kVZiY>h1Bj5^lk!ty^ZkIr9pc09ma7oahK* zQ#D`#Nd*sCuZptu>k=wT7BHr3U{%wghNY%m=q*KR5J6`J?77->bSIC1ZAWzq)j!I^ zxJq17kD;5mr6RX{+|EeQ^hbI_y<7ohx(MNh<(j2awgDSP5y<&{$4t zwhyKqd6`!O0qv4uBtTs*L@G|!r|Vk4SShB=k=VN4TJfX>nR?@DL@`>xq@ z#O7Qp&NwI=OTatRg2>TrxP%m(#o1^b<(i)=s3c#(IKGW^!+-{oHQi;sdyxjTp!Wdm zq2BjQPgp>%r+dq){|FDEb>cno*rtO9y5(1o8%{Va{%~*WA0L2ne9J=Jp6^snc?7c4 zBmpAtdHk3$JQ2~voTjo<^L;SOC@6ylAasaN6An6(Sq5dbS#I|L+B!0gzl19-z4g2A zp|=1nRAlqlBr2Cd8r_&HbS*f5)@HJsQiW>2_n{ie!tGtAtBJ1I{XcS0ZqHs_loJ{~ zJ}WrR2xdgtd2xYmrhCu6>bKLM->-t&bnE5i&W&)>^OIQTxqM3)S7*=QZW27&S%AyCGNKuJ@JgRjFSvT4M$iXHh%?aczm^t&KJ zv`}*pj!n?BD?5yE&wRz%g&AcmPD7 zECWLv>nyWbQnTb{Da}%wr2#VSw<7}s&}{}lYzV|gKx_=eCNP%++2-5~3`{I;j4gK< z7&Nx(|5ezd%gj*J$}ec(`zlslOH$S 
zW=e91yTCRfF+7sVlaYaIAtM(!ZEyjD9FZC#1h@pzQa}VBmomXUl-n}ZVcG;;D*|?1D`V& zYUyyl%eI~}<<(yIjmFR0zTZ=Ms62I5#rsSq<6jO_AMBgPr#JKTLDtE(CUsog!rKnl z>Rz{;=yPMfhgw^JN79qq!5!zN8h#%NZEMk5~m6k2h!j#xJkHj<>Vlaet|fU zNF4l#Ui|>iGc(&=cGE=Cq;K-fJNtfrbSk`Z`D0}9Q3xRca!)wbS^b?qt(+Vl%DalJ zib9sGmEfdT!{oH(--he7tr>+x?rdTd4y1p_V@4U<`r75&yZZF`GitJRTA8Z#8ExIV zCbq8f2)ph3is7HMQ=3|iwyq|Zx9S(;&ua5wwRZQ#Ut*K3ht{#S8;g}{-GqWNv30d% z(@ODrqO*DFrWNX<>$CtIylQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3(&d-}QHmJM(W3h@j-~-QA$ZoODxSP zQL+N*tc|TjPK;nj!2JU;5^gm1Tmnf6@ep6awGRED0L5IeYhb9WpEEEuhx$6X1|iZI zW-0_GG2&GRJA1hLI>vhnLR!|}ip(=`GfG;mWO86TfZAZ3n!Xpl3a z)PZXUg)lzNNO=R83w;p54l@BBX1IbMpXtE(bPe(Y7SjQaAs(21gBGCpOmOoI3Jw8> zGE%yNxs7lThX;8ABZ*2L1C}bWwIbm9F*!daHCd<%m|XHw5{pt8QhNl{!=xtf_^$oR z`mo5N5~fwm8e8@MGI*r2F&y&JG)(ZlEMcJHVS0gw*_($+IzB=2o+oG<& zn}kJ8Zr=9R-6dJXN={WOaqkH{QD0syBM)Zz_;I^i*L} zf#4*gf`vzR2(XKDU-!(2I`%Dm{{dds=}uo49Gd9x>HLSlBqP}e4f8)%)bGE4zR+Y- zU$sKS#ub-*qJq6drW8hPXi`(txEPVx`%1;I^I)f8Cd=pDoB|9yYfcoaHrgBzG7vBl c_deU|+-z>`kz1Z9so$qDdD|WXF&a=%2b}r|4?qnt5?9vg zfcWxf!g~tIzxz}fkT6b|G8OAlMmZ5w(z_vzf)v!-<8LM0LQPw<%zSg^6*2*`S^+rG zbc9W4paPN#9r;Cj)#Qh)H%XX))&QAjB@7$$=uz>Jj@ODrqO*DFrWNX<=r#aFFVQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3)jdHT4<`#Ji$B9s)D6lLb6W2y@Fj6zbClaHpxFVr{Q(Z$8pB_3pEFv2{n zDqUQCTtkRZ8{ile;u!+;LVR$DV+g`j0)ZDFkEsz$@c4TA#fLfigd#gPsj?)s7{&L# zjuAwt@(cCxiFfu7^+R%Hv{GJaPL2{VyrIFLSX7i)2@IOdlGI#KOhM(9z+n_y8>^#~ zlA4xSnp2`=1=3j?TZVXi*pSxc@~pshjv;mAP(_bnt$(L)r?yQFE3trY?1y5#(n)MTNm;*!L?l*FPG z29|uq>ZFgsdY3(>uPj~OG?{4;qsCVKzYHF(Obkga{NfX~=JqeJIlP?Hp=|q{Bi`$k z#b%wh-zc)}{qKFX;^Offb5vdT_!jTTsN{+anlkO2-f874PZqFwTrrwZEaJ$-aNvP4 Ig9y4E0MFQN3;+NC literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 
b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 new file mode 100644 index 00000000..096f0300 --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 @@ -0,0 +1,44 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "f0", + "type" : "INT" + }, { + "id" : 1, + "name" : "f1", + "type" : "BIGINT" + }, { + "id" : 2, + "name" : "f2", + "type" : "FLOAT" + }, { + "id" : 3, + "name" : "f3", + "type" : "DOUBLE" + }, { + "id" : 4, + "name" : "f4", + "type" : "DATE" + }, { + "id" : 5, + "name" : "f5", + "type" : "STRING" + } ], + "highestFieldId" : 5, + "partitionKeys" : [ ], + "primaryKeys" : [ ], + "options" : { + "file-index.range-bitmap.columns" : "f0,f1,f2,f3,f4", + "owner" : "xiaoheng", + "file-index.range-bitmap.f1.chunk-size" : "16B", + "file-index.range-bitmap.f2.chunk-size" : "16B", + "file-index.range-bitmap.f3.chunk-size" : "16B", + "file-index.in-manifest-threshold" : "1B", + "file-index.range-bitmap.f4.chunk-size" : "16B", + "file.format" : "orc", + "file-index.range-bitmap.f0.chunk-size" : "16B" + }, + "timeMillis" : 1772188734852 +} \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST new file mode 100644 index 00000000..56a6051c --- /dev/null 
+++ b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 new file mode 100644 index 00000000..1e8a9f72 --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-1", + "deltaManifestListSize" : 1106, + "commitUser" : "162120aa-5242-438d-bb0e-96ee933b3313", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1772188737678, + "totalRecordCount" : 8, + "deltaRecordCount" : 8, + "nextRowId" : 0 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/README b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/README new file mode 100644 index 00000000..52eb2755 --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/README @@ -0,0 +1,16 @@ +f0:int (nullable), f1:long (nullable), f2:float (nullable), f3:double (nullable), f4:date (nullable), f5:string (nullable) +no partition key +no bucket key +bucket count: -1 +range-bitmap index: f0,f1,f2,f3,f4 + +Rows (snapshot-1): +Add: [17, 100L, 1.1f, 11.11, date('2024-01-17'), 'row0'] +Add: [3, 200L, 2.2f, 22.22, date('2024-01-03'), 'row1'] +Add: [5, 300L, 3.3f, 33.33, date('2024-01-05'), 'row2'] +Add: [7, 400L, 4.4f, 44.44, date('2024-01-07'), 'row3'] +Add: [9, 500L, 5.5f, 55.55, date('2024-01-09'), 'row4'] 
+Add: [null, null, null, null, null, null] +Add: [null, null, null, null, null, 'null_row'] +Add: [10, 600L, 6.6f, 66.66, date('2024-01-10'), 'row7'] +NoCompact \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..19fabdcaf2da847eadff9b7f2b5a97b2fd7ea90a GIT binary patch literal 1397 zcmZXUO=uHQ5XWaXn{*RXnquE$7nhQQ4H2|vv-zkHbv-DhBE*k^2eGF4z)JdIYO9Ec zQY}I$#Y4eTDjvL);-QBsJ(UU~1=~^-wKp%~MFi1;Ad1f0&%``3yKm;@{r3N6W;c7{ zoJxRX$OZD~?&ka+g@n-c5fLEz?C>Ax{06`Sfah^DyP|o~8qoY`0W=N(ca+H$vN|=f z+f)bG&%ugGD8M=#XMdsu1Lilyp!bpVCWgygF^g zvlf|2dpS1$*H;I2r0qKc(1_NA){M3ntp&}LhQe->A73XQ`s)Z&929sXNgoBwOJ(q8 zOpR*^ECLLSl*>IBu;OV!5cD-}^rRo5ax*VW0xJsMSbYdDQ^5?21@zgFCDG1z)w00S zh`U(ks=}h38{B24C0o5Lu+!l#cF^iwwn$t;scE*vgRD@IjB}J}Q0OE_rtSGG@5bjm zWZ8~y32uDbOBPvNraF7nozBLgTwnj-K+kagYOy~@f*5V%CB78vMlFIdRl9j0qA`tk z3qS(KRO|NH|4p<7lcG{aMI>mfCEFkwP!tT%kgUk%2q^flr0By_BANtgh+d=PTtv*6 z88*{Rg{YBl)5GK88-UM%404|P38lY$q#YPcaA2wdo8#y-7n$TTZdD|wRETwI5w(?rNRoQ7kL0WK@}nb-h~JCr10+V2KSlG|iyRb99-D%z03(-3*e` zAxvz2v=;xH;b%lktee<`D|Smjv8m$V*hx#kw(PCw|Z*U;2 zwyUZ3SUB1{(pN5o6R~(+>ri8PEgdh!kECK+QSDIEX*HKG_LhqId|K@-mQv~gxr|Nq N-w)RD70==a_aEa1`>6l` literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet.index b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet.index new file mode 100644 index 0000000000000000000000000000000000000000..18ea5e60dfcf54e54b3f8b2942f438dc5f5712ae GIT binary patch literal 1288 zcmc&zPb)-G6u_&_%XqtQg8#@c%!OlXKK7lMK zvG5`6*;@JR_-N+f zJ7y?Lx7R;Ig@+){4d;?De&C?Z@jS0FVdci3@ 
z(XdKwbrWpn2BHw}i1~+PD6JeHd3G2(ImjPAw3x(r>(rU^4myVy)_+6 z%pvoqV+YpeS1>OYo8j8k^NV^9_*10JkP+s_nnyEKS!C`8fntSwA#9a4b<~)lnpoQT zyc~N_+qbLLn-Bh5DKo4`Hg%_GsEW3VGF8;3)wdoiq*(PfbsXQrFegMfiat_i2rnWc Yie?CV4}jeZv2`J~Ojl`5NB`TyA2(TMPyhe` literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-75f07296-b729-48db-aadd-17826f0aadf9-0 b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-75f07296-b729-48db-aadd-17826f0aadf9-0 new file mode 100644 index 0000000000000000000000000000000000000000..c9299ed9ad03da89fe46695c7ec07c2c1946f320 GIT binary patch literal 2180 zcmeZI%3@>@ODrqO*DFrWNX<>$CtIylQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3(&d-}QHmJM(W3h@j-~-QA$ZoODxSP zQL+N*tc|TjPK;nj!2JU;5^gm1Tmnf6@ep6awGRED0L5IeYhb9WpEEEuhx$6X1|iZI zW-0_GG2&GRJA1hLI>vhnLR!|}ip(=`GfG;mWO86TfZAZ3n!Xpl3a z)PZXUg)lzNNO=R83w;p54l@BBX1IbMpXtE(bPe(Y7SjQaAs(21gBGCpOmOoI3Jw8> zGE%yNxs7lThX;8ABZ*2L1C}bWwIbm9F*!daHCd<%m|XHw5{pt8_|yMKO>$ATaL;7# z=SOP9n<+hM-!27#2ENQmOa8ylcWw84&XWB6LgvPfS9J&Vw=mEC7(UzEcY~6h zjDego)3g&NZ;vph7q1kV+I8&HY5N2Yrff&ADH$3i|Ku$koaP0y{g~hTy=?bg9sv)d zkm5jnpD%Zk7tAwR`6a1_&n#ue>UlkVhm=YuIC&*WmV`f>b+cHSOMt;5qWyUZQ)Po{ gf8R1mY`w@-99H`ydw-WBawqw|q*j#u;DF;QP1e_TN zVoNh&3CTH&(m@^N7}ph4oULFi&%~OpLmf*^r_@`E)FFcI2G|R&?HaC#fE`zJOVvNh z!?;ddQ;%s_xT7JjdfeWW)BHzzQ@h##WqAnUrtMjkQ+7c*j$@ODrqO*DFrWNX<=r#aFFVQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3)jdHT4<`#Ji$B9s)D6lLb6W2y@Fj6zbClaHpxFVr{Q(Z$8pB_3pEFv2{n zDqUQCTtkRZ8{ile;u!+;LVR$DV+g`j0)ZDFkEsz$@c4TA#fLfigd#gPsj?)s7{&L# zjuAwt@(cCxiFfu7^+R%Hv{GJaPL2{VyrIFLSX7i)2@IOdlGI#KOhM(9z+n_y8>^#~ zlA4xSnp2`=1=3j?TZVXi*pSxc@~pshjv;mAP(_bnt$(L)r?yQFE3trY?1y5#(n)MTNm;*!L?l*FPG zhNo{Ie~t<`09bt625z484XV(TuvV0U}9o8 L@W7Zs1l<+@HCS~J literal 0 HcmV?d00001 diff --git 
a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 new file mode 100644 index 00000000..cdeb25d1 --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 @@ -0,0 +1,38 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "f0", + "type" : "INT" + }, { + "id" : 1, + "name" : "f1", + "type" : "BIGINT" + }, { + "id" : 2, + "name" : "f2", + "type" : "FLOAT" + }, { + "id" : 3, + "name" : "f3", + "type" : "DOUBLE" + }, { + "id" : 4, + "name" : "f4", + "type" : "DATE" + }, { + "id" : 5, + "name" : "f5", + "type" : "STRING" + } ], + "highestFieldId" : 5, + "partitionKeys" : [ ], + "primaryKeys" : [ ], + "options" : { + "file-index.range-bitmap.columns" : "f0,f1,f2,f3,f4", + "owner" : "xiaoheng", + "file-index.in-manifest-threshold" : "1B" + }, + "timeMillis" : 1772163669686 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 
b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 new file mode 100644 index 00000000..fee41027 --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-1", + "deltaManifestListSize" : 1108, + "commitUser" : "95859ce1-495d-4176-8f68-f7fbd595554c", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1772163672630, + "totalRecordCount" : 8, + "deltaRecordCount" : 8, + "nextRowId" : 0 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README new file mode 100644 index 00000000..f308976e --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README @@ -0,0 +1,17 @@ +f0:int (nullable), f1:long (nullable), f2:float (nullable), f3:double (nullable), f4:date (nullable), f5:string (nullable) +no partition key +no bucket key +bucket count: -1 +range-bitmap index: f0,f1,f2,f3,f4 +range-bitmap index chunk-size: 16B + +Rows (snapshot-1): +Add: [17, 100L, 1.1f, 11.11, date('2024-01-17'), 'row0'] +Add: [3, 200L, 2.2f, 22.22, date('2024-01-03'), 'row1'] +Add: [5, 300L, 3.3f, 33.33, date('2024-01-05'), 'row2'] +Add: [7, 400L, 4.4f, 44.44, date('2024-01-07'), 'row3'] +Add: [9, 500L, 5.5f, 55.55, date('2024-01-09'), 'row4'] +Add: [null, null, null, null, null, null] +Add: [null, null, null, null, null, 'null_row'] +Add: [10, 600L, 6.6f, 66.66, date('2024-01-10'), 'row7'] +NoCompact \ No newline at end of file diff --git 
a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..67d9aea94bfb25060e9bc55c537a899393cc25d7 GIT binary patch literal 1397 zcmZXUO=uHQ5XWaXn{*RXnquE$7nhQQ4H2|vv-zkHbv-DhBE*k^2eGF4z)JdIYO9Ec zQY}I$#Y4eTDjvL);-QBsJ(UU~1=~^-wKp%~MFi1;Ad1f0&%``3yKm;@{r3N6W;c7{ zoJxRX$OZD~?&ka+g@n-c5fLEz?C>Ax{06`Sfah^DyP|o~8qoY`0W=N(ca+H$vN|=f z+f)bG&%ugGD8M=#XMdsu1Lilyp!bpVCWgygF^g zvlf|2dpS1$*H;I2r0qKc(1_NA){M3ntp&}LhQe->A73XQ`s)Z&929sXNgoBwOJ(q8 zOpR*^ECLLSl*>IBu;OV!5cD-}^rRo5ax*VW0xJsMSbYdDQ^5?21@zgFCDG1z)w00S zh`U(ks=}h38{B24C0o5Lu+!l#cF^iwwn$t;scE*vgRD@IjB}J}Q0OE_rtSGG@5bjm zWZ8~y32uDbOBPvNraF7nozBLgTwnj-K+kagYOy~@f*5V%CB78vMlFIdRl9j0qA`tk z3qS(KRO|NH|4p<7lcG{aMI>mfCEFkwP!tT%kgUk%2q^flqzFElp-GU2=ruadMZ}Dm zVKd!Sh#L7eJv<)10r(8aAm_QCQ2NV9+JV6Y2c{aZIgU1+HMT=Zy+nY)8 zQKJhb(QeBs40aRHrvcJR$NVG(biqeM5s`lPwcrP@YbaN_K2p4KsJ{{(Emns61_#1w zyP9f`g`>SAedR(p5sT-w4mFn7((yw4NGhfk)ebeCR&)7cZ>gBir`6tKDWx8e%h*)^ N{a_tm@hpCD{{iCS`>6l` literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet.index b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet.index new file mode 100644 index 0000000000000000000000000000000000000000..343734e7502e925b676741559eeb0ac746da6502 GIT binary patch literal 1413 zcmc&!O)mpM7@pbgYS9K!;v*a!9E1dmNJY%aMYbU>5~m6k2h!j#xJkHj<>Vlaet|fU zNF4l#Ui|>iGc(&=cGE=Cq;K-fJNtfrbSk`Z`D0}9Q3xRca!)wbS^b?qt(+Vl%DalJ zib9sGmEfdT!{oH(--he7tr>+x?rdTd4y1p_V@4U<`r75&yZZF`GitJRTA8Z#8ExIV 
zCbq8f2)ph3is7HMQ=3|iwyq|Zx9S(;&ua5wwRZQ#Ut*K3ht{#S8;g}{-GqWNv30d% z(@ODrqO*DFrWNX<>$CtIylQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3(&d-}QHmJM(W3h@j-~-QA$ZoODxSP zQL+N*tc|TjPK;nj!2JU;5^gm1Tmnf6@ep6awGRED0L5IeYhb9WpEEEuhx$6X1|iZI zW-0_GG2&GRJA1hLI>vhnLR!|}ip(=`GfG;mWO86TfZAZ3n!Xpl3a z)PZXUg)lzNNO=R83w;p54l@BBX1IbMpXtE(bPe(Y7SjQaAs(21gBGCpOmOoI3Jw8> zGE%yNxs7lThX;8ABZ*2L1C}bWwIbm9F*!daHCd<%m|XHw5{pt8W-s+871Uil3>0u{T_oxf`n{R$X>ZOH=<10SRToBrcS%l z^jT42i`aCTRz?21*A|99V7R2>$@u^EYgc!Tr7bS(N4=X3eg8Ivd$NZ-uAhI_XM>8J zoQ0ek(=`rP&7+JjHqR8D+2j5>y#9b7%QYv{Mxh{vpY0Dg1b^-_Iv}6BX7kOQw1$F9 zo%4Ej&gonj`((3;&K?`_?Q5CY%)%Y7FKE4!HqqO*r#siW`eep75dj8^i1z0tOqC6) h2?{AXLJ}-zneH6OQ1DKWGT@!D?7$6~6HMrS1pp>T(Wd|a literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-0 b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-0 new file mode 100644 index 0000000000000000000000000000000000000000..808abfa58069a65940b5fc34233876327d3e0a47 GIT binary patch literal 1006 zcmbVL%}#?b9G{72561WaJuf@}H(a|&0YT`TC z;v6^@ox4%``{?iYZ|&**(eXKXV3|?`3!0GklNI1)PJs#-4KfyjK1fN@Sr<78+^Cm>zMF%W&P>nDtnQdARhn#f75_d(r} zU-^9y(ivIC9H^36`Bi&Z&fpT68rI=2<;qAG-rx$k8_>o@4!>j3x(qVt!Cc{MpfNHI zvptk5Tu1gzyOK@XuTHlc-HQ7^vR|FmuP(?!8k{~BoTMRPVebRNXFJS$uP;w;L)9-o E0X5o06#xJL literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-1 b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-1 new file mode 100644 index 
0000000000000000000000000000000000000000..d367884cb55c083d4c00227d38c28ec8d12cc7d8 GIT binary patch literal 1108 zcmeZI%3@>@ODrqO*DFrWNX<=r#aFFVQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3)jdHT4<`#Ji$B9s)D6lLb6W2y@Fj6zbClaHpxFVr{Q(Z$8pB_3pEFv2{n zDqUQCTtkRZ8{ile;u!+;LVR$DV+g`j0)ZDFkEsz$@c4TA#fLfigd#gPsj?)s7{&L# zjuAwt@(cCxiFfu7^+R%Hv{GJaPL2{VyrIFLSX7i)2@IOdlGI#KOhM(9z+n_y8>^#~ zlA4xSnp2`=1=3j?TZVXi*pSxc@~pshjv;mAP(_bnt$(L)r?yQFE3trY?1y5#(n)MTNm;*!L?l*FPG zhE0#p|M2m-t8}xg%F5}@$2O*Aj2c_@|1x-lGBG5v2uMsgYFXb|a;$yEtY7mlxFl_o zUO(a6W?xm-+}P`-uV>AQ;&^Z|OE0=gU-MCCPOa>e@|zztuUayTvFp5d5Sb?7$i#5q KfiZ&!x-9?)#&eti literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 new file mode 100644 index 00000000..d4ca2df4 --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 @@ -0,0 +1,43 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "f0", + "type" : "INT" + }, { + "id" : 1, + "name" : "f1", + "type" : "BIGINT" + }, { + "id" : 2, + "name" : "f2", + "type" : "FLOAT" + }, { + "id" : 3, + "name" : "f3", + "type" : "DOUBLE" + }, { + "id" : 4, + "name" : "f4", + "type" : "DATE" + }, { + "id" : 5, + "name" : "f5", + "type" : "STRING" + } ], + "highestFieldId" : 5, + "partitionKeys" : [ ], + "primaryKeys" : [ ], + "options" : { + "file-index.range-bitmap.columns" : "f0,f1,f2,f3,f4", + "owner" : "xiaoheng", + "file-index.range-bitmap.f1.chunk-size" : "16B", + "file-index.range-bitmap.f2.chunk-size" : "16B", + "file-index.range-bitmap.f3.chunk-size" : "16B", + "file-index.in-manifest-threshold" : "1B", + "file-index.range-bitmap.f4.chunk-size" : "16B", + "file-index.range-bitmap.f0.chunk-size" : "16B" + }, + 
"timeMillis" : 1772188209180 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 new file mode 100644 index 00000000..4e78d6b5 --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-1", + "deltaManifestListSize" : 1108, + "commitUser" : "9385bcac-276c-4639-b825-52623beb2a6d", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1772188213862, + "totalRecordCount" : 8, + "deltaRecordCount" : 8, + "nextRowId" : 0 +} \ No newline at end of file From 
88b8a0f2f663d043aeb9675aecfec2c0cc913b25 Mon Sep 17 00:00:00 2001 From: xiaoheng Date: Sat, 28 Feb 2026 08:42:26 +0800 Subject: [PATCH 2/6] fix: adjust code style --- .../rangebitmap/bit_slice_index_bitmap.cpp | 60 ++++---- .../rangebitmap/bit_slice_index_bitmap.h | 2 +- .../file_index/rangebitmap/dictionary/chunk.h | 8 +- .../dictionary/chunked_dictionary.cpp | 57 ++++--- .../dictionary/chunked_dictionary.h | 15 +- .../rangebitmap/dictionary/dictionary.h | 2 +- .../dictionary/fixed_length_chunk.cpp | 19 ++- .../dictionary/fixed_length_chunk.h | 20 +-- .../rangebitmap/dictionary/key_factory.cpp | 68 +++------ .../rangebitmap/dictionary/key_factory.h | 22 +-- .../file_index/rangebitmap/range_bitmap.cpp | 94 ++++++------ .../file_index/rangebitmap/range_bitmap.h | 8 +- .../rangebitmap/range_bitmap_file_index.cpp | 44 +++--- .../rangebitmap/range_bitmap_file_index.h | 17 ++- .../range_bitmap_file_index_factory.cpp | 2 +- .../range_bitmap_file_index_factory.h | 2 +- .../range_bitmap_file_index_test.cpp | 26 ++-- .../utils/literal_serialization_utils.cpp | 141 +++++++++++------- .../utils/literal_serialization_utils.h | 20 +-- 19 files changed, 306 insertions(+), 321 deletions(-) diff --git a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp index 3005100b..9309a7e3 100644 --- a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp +++ b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -33,18 +33,18 @@ namespace paimon { Result> BitSliceIndexBitmap::Create( const std::shared_ptr& pool, const std::shared_ptr& input_stream, const int32_t offset) { - const auto data_in = std::make_unique(input_stream); + auto data_in = std::make_unique(input_stream); PAIMON_RETURN_NOT_OK(data_in->Seek(offset)); - PAIMON_ASSIGN_OR_RAISE(const auto header_length, data_in->ReadValue()); - PAIMON_ASSIGN_OR_RAISE(const auto version, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t header_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int8_t version, data_in->ReadValue()); if (version != CURRENT_VERSION) { return Status::Invalid("Unknown BitSliceBitmap Version"); } - PAIMON_ASSIGN_OR_RAISE(const auto slices_size, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int8_t slices_size, data_in->ReadValue()); auto slices = std::vector>(); slices.resize(slices_size); - PAIMON_ASSIGN_OR_RAISE(const auto ebm_size, data_in->ReadValue()); - PAIMON_ASSIGN_OR_RAISE(const auto indexes_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t ebm_size, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t indexes_length, data_in->ReadValue()); auto indexes = Bytes::AllocateBytes(indexes_length, pool.get()); PAIMON_RETURN_NOT_OK(data_in->Read(indexes->data(), indexes_length)); auto body_offset = offset + sizeof(int32_t) + header_length; @@ -74,8 +74,8 @@ BitSliceIndexBitmap::BitSliceIndexBitmap(const std::shared_ptr& pool const int32_t body_offset) : pool_(pool), initialized_(false), - bit_slices_(std::vector>(slices_size, {std::nullopt})), - ebm({std::nullopt}), + bit_slices_(std::vector>(slices_size, std::nullopt)), + ebm(std::nullopt), input_stream_(input_stream), body_offset_(body_offset), indexes_(std::move(indexes)), @@ -96,12 +96,12 @@ Result BitSliceIndexBitmap::GetEmptyBitmap() { Result BitSliceIndexBitmap::GetSliceBitmap(const int32_t idx) { if (!bit_slices_[idx].has_value()) { - const auto data_in = std::make_unique( + auto data_in = std::make_unique( 
std::make_shared(indexes_->data(), indexes_length_)); const int position = static_cast(2 * sizeof(int32_t) * idx); PAIMON_RETURN_NOT_OK(data_in->Seek(position)); - PAIMON_ASSIGN_OR_RAISE(const auto offset, data_in->ReadValue()); - PAIMON_ASSIGN_OR_RAISE(const auto length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t offset, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t length, data_in->ReadValue()); PAIMON_RETURN_NOT_OK(input_stream_->Seek(body_offset_ + ebm_length_ + offset, FS_SEEK_SET)); RoaringBitmap32 bitmap; const auto bytes = Bytes::AllocateBytes(length, pool_.get()); @@ -118,17 +118,17 @@ Status BitSliceIndexBitmap::LoadSlices(const int32_t start, const int32_t end) { return Status::OK(); } auto indexes_stream = std::make_shared(indexes_->data(), indexes_length_); - const auto data_in = std::make_unique(indexes_stream); + auto data_in = std::make_unique(indexes_stream); const auto position = static_cast(2 * sizeof(int32_t) * start); PAIMON_RETURN_NOT_OK(data_in->Seek(position)); - PAIMON_ASSIGN_OR_RAISE(const auto offset, data_in->ReadValue()); - PAIMON_ASSIGN_OR_RAISE(auto length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t offset, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t length, data_in->ReadValue()); std::vector lengths(end); lengths[start] = length; for (int32_t i = start + 1; i < end; ++i) { PAIMON_RETURN_NOT_OK(data_in->ReadValue()); - PAIMON_ASSIGN_OR_RAISE(const auto slice_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t slice_length, data_in->ReadValue()); lengths[i] = slice_length; length += slice_length; } @@ -148,24 +148,28 @@ Status BitSliceIndexBitmap::LoadSlices(const int32_t start, const int32_t end) { } Result BitSliceIndexBitmap::Eq(const int32_t code) { - PAIMON_ASSIGN_OR_RAISE(const auto empty_bitmap, GetEmptyBitmap()); - auto equal = RoaringBitmap32(*empty_bitmap); - for (int32_t i = static_cast(bit_slices_.size()) - 1; i >= 0; --i) { - PAIMON_ASSIGN_OR_RAISE(const auto 
slice_bitmap, GetSliceBitmap(i)); + PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap32* empty_bitmap, GetEmptyBitmap()); + auto state = RoaringBitmap32(*empty_bitmap); + if (state.IsEmpty()) { + return RoaringBitmap32(); + } + PAIMON_RETURN_NOT_OK(LoadSlices(0, static_cast(bit_slices_.size()))); + for (int32_t i = 0; i < static_cast(bit_slices_.size()); i++) { + PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap32* slice_bitmap, GetSliceBitmap(i)); if ((code >> i & 1) == 1) { - equal &= *slice_bitmap; + state &= *slice_bitmap; } else { - equal -= *slice_bitmap; + state -= *slice_bitmap; } } - return equal; + return state; } Result BitSliceIndexBitmap::Gt(const int32_t code) { if (code < 0) { return IsNotNull({}); } - PAIMON_ASSIGN_OR_RAISE(const auto found_set, IsNotNull({})); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 found_set, IsNotNull({})); if (found_set.IsEmpty()) { return RoaringBitmap32(); } @@ -175,13 +179,13 @@ Result BitSliceIndexBitmap::Gt(const int32_t code) { PAIMON_RETURN_NOT_OK(LoadSlices(start, static_cast(bit_slices_.size()))); for (int i = start; i < static_cast(bit_slices_.size()); ++i) { if (!state_inited) { - PAIMON_ASSIGN_OR_RAISE(const auto slice_ptr, GetSliceBitmap(i)); + PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap32* slice_ptr, GetSliceBitmap(i)); state = *slice_ptr; state_inited = true; continue; } const auto bit = code >> i & 1; - PAIMON_ASSIGN_OR_RAISE(const auto slice_ptr, GetSliceBitmap(i)); + PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap32* slice_ptr, GetSliceBitmap(i)); if (bit == 1) { state &= *slice_ptr; } else { @@ -207,7 +211,7 @@ Result BitSliceIndexBitmap::IsNotNull(const RoaringBitmap32& fo PAIMON_RETURN_NOT_OK(bitmap.Deserialize(bytes->data(), ebm_length_)); ebm = bitmap; } - return found_set.IsEmpty() ? ebm.value() : RoaringBitmap32::And(ebm.value(), found_set); + return found_set.IsEmpty() ? 
*ebm : *ebm &= found_set; } BitSliceIndexBitmap::Appender::Appender(const std::shared_ptr& pool, const int32_t min, @@ -245,7 +249,7 @@ Result> BitSliceIndexBitmap::Appender::Serialize() cons header_size += sizeof(int32_t); // indexes length header_size += indexes_length; int32_t offset = 0; - const auto data_output_stream = std::make_unique( + auto data_output_stream = std::make_unique( MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); auto slices_bytes_vector = std::vector>{}; auto indexes_vector = std::vector>{}; diff --git a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h index e14679b5..0ef4c5e1 100644 --- a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h +++ b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/chunk.h b/src/paimon/common/file_index/rangebitmap/dictionary/chunk.h index c3d61803..055232ca 100644 --- a/src/paimon/common/file_index/rangebitmap/dictionary/chunk.h +++ b/src/paimon/common/file_index/rangebitmap/dictionary/chunk.h @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -33,7 +33,7 @@ class Chunk { virtual Result TryAdd(const Literal& key) = 0; virtual Result Find(const Literal& key) { - PAIMON_ASSIGN_OR_RAISE(const auto cmp_with_key, Key().CompareTo(key)); + PAIMON_ASSIGN_OR_RAISE(int32_t cmp_with_key, Key().CompareTo(key)); if (cmp_with_key == 0) { return Code(); } @@ -42,8 +42,8 @@ class Chunk { const int32_t base = Code() + 1; while (low <= high) { const int32_t mid = low + (high - low) / 2; - PAIMON_ASSIGN_OR_RAISE(auto key_at_mid, GetKey(mid)); - PAIMON_ASSIGN_OR_RAISE(const auto cmp, key_at_mid.CompareTo(key)); + PAIMON_ASSIGN_OR_RAISE(Literal key_at_mid, GetKey(mid)); + PAIMON_ASSIGN_OR_RAISE(int32_t cmp, key_at_mid.CompareTo(key)); if (cmp < 0) { low = mid + 1; } else if (cmp > 0) { diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.cpp b/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.cpp index c881218b..5439a7cc 100644 --- a/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.cpp +++ b/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,7 +38,7 @@ Result ChunkedDictionary::Find(const Literal& key) { int32_t high = size_ - 1; while (low <= high) { const int32_t mid = low + (high - low) / 2; - PAIMON_ASSIGN_OR_RAISE(const auto chunk, GetChunk(mid)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr chunk, GetChunk(mid)); PAIMON_ASSIGN_OR_RAISE(const int32_t result, chunk->Key().CompareTo(key)); if (result > 0) { high = mid - 1; @@ -51,20 +51,20 @@ Result ChunkedDictionary::Find(const Literal& key) { if (low == 0) { return -(low + 1); } - PAIMON_ASSIGN_OR_RAISE(const auto prev_chunk, GetChunk(low - 1)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr prev_chunk, GetChunk(low - 1)); return prev_chunk->Find(key); } Result ChunkedDictionary::Find(const int32_t code) { if (code < 0) { - return Status::Invalid("Invalid code: " + std::to_string(code)); + return Status::Invalid(fmt::format("Invalid code: {}", code)); } int32_t low = 0; int32_t high = size_ - 1; while (low <= high) { const int32_t mid = low + (high - low) / 2; - PAIMON_ASSIGN_OR_RAISE(const auto chunk, GetChunk(mid)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr chunk, GetChunk(mid)); auto const chunk_code = chunk->Code(); if (chunk_code > code) { @@ -75,7 +75,7 @@ Result ChunkedDictionary::Find(const int32_t code) { return {chunk->Key()}; } } - PAIMON_ASSIGN_OR_RAISE(const auto prev_chunk, GetChunk(low - 1)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr prev_chunk, GetChunk(low - 1)); return prev_chunk->Find(code); } @@ -96,12 +96,12 @@ Result> ChunkedDictionary::GetChunk(int32_t index) { if (chunks_cache_[index]) { return chunks_cache_[index]; } - const auto data_in = std::make_unique( + auto data_in = std::make_unique( std::make_shared(offsets_bytes_->data(), offsets_length_)); PAIMON_RETURN_NOT_OK(data_in->Seek(sizeof(int32_t) * index)); - PAIMON_ASSIGN_OR_RAISE(const auto chunk_offset, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t chunk_offset, data_in->ReadValue()); PAIMON_ASSIGN_OR_RAISE( - auto chunk, + std::unique_ptr chunk, 
factory_->MmapChunk(pool_, input_stream_, body_offset_ + offsets_length_ + chunk_offset, body_offset_ + chunks_length_ + offsets_length_)); chunks_cache_[index] = std::move(chunk); @@ -131,7 +131,7 @@ Status ChunkedDictionary::Appender::AppendSorted(const Literal& key, int32_t cod return Status::Invalid("key should not be null"); } if (last_key_.has_value()) { - PAIMON_ASSIGN_OR_RAISE(const auto compare_result, last_key_->CompareTo(key)); + PAIMON_ASSIGN_OR_RAISE(int32_t compare_result, last_key_->CompareTo(key)); if (compare_result >= 0) { return Status::Invalid("key must be in sorted order"); } @@ -145,8 +145,10 @@ Status ChunkedDictionary::Appender::AppendSorted(const Literal& key, int32_t cod PAIMON_ASSIGN_OR_RAISE(chunk_, key_factory_->CreateChunk(pool_, key, code, chunk_size_bytes_)); } else { - PAIMON_ASSIGN_OR_RAISE(const auto success, chunk_->TryAdd(key)); - if (success) return Status::OK(); + PAIMON_ASSIGN_OR_RAISE(bool success, chunk_->TryAdd(key)); + if (success) { + return Status::OK(); + } PAIMON_RETURN_NOT_OK(Flush()); PAIMON_ASSIGN_OR_RAISE(chunk_, key_factory_->CreateChunk(pool_, key, code, chunk_size_bytes_)); @@ -163,7 +165,7 @@ Result> ChunkedDictionary::Appender::Serialize() { header_size += sizeof(int32_t); // size header_size += sizeof(int32_t); // offsets length header_size += sizeof(int32_t); // chunks length - const auto data_out = std::make_unique( + auto data_out = std::make_unique( MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); data_out->WriteValue(header_size); data_out->WriteValue(CURRENT_VERSION); @@ -185,8 +187,8 @@ Result> ChunkedDictionary::Appender::Serialize() { Status ChunkedDictionary::Appender::Flush() { chunk_->SetOffset(key_offset_); - PAIMON_ASSIGN_OR_RAISE(const auto chunks_bytes, chunk_->SerializeChunk()); - PAIMON_ASSIGN_OR_RAISE(const auto keys_bytes, chunk_->SerializeKeys()); + PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR chunks_bytes, chunk_->SerializeChunk()); + PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR 
keys_bytes, chunk_->SerializeKeys()); offsets_output_->WriteValue(chunks_offset_); chunks_offset_ += static_cast(chunks_bytes->size()); key_offset_ += static_cast(keys_bytes->size()); @@ -200,35 +202,32 @@ Status ChunkedDictionary::Appender::Flush() { Result> ChunkedDictionary::Create( const std::shared_ptr& pool, const FieldType field_type, const std::shared_ptr& input_stream, const int64_t offset) { - const auto data_in = std::make_unique(input_stream); + auto data_in = std::make_unique(input_stream); PAIMON_RETURN_NOT_OK(data_in->Seek(offset)); - PAIMON_ASSIGN_OR_RAISE(const auto header_length, data_in->ReadValue()); - PAIMON_ASSIGN_OR_RAISE(const auto version, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t header_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int8_t version, data_in->ReadValue()); if (version != CURRENT_VERSION) { return Status::Invalid("Unknown version of ChunkedDictionary"); } - PAIMON_ASSIGN_OR_RAISE(const auto size, data_in->ReadValue()); - PAIMON_ASSIGN_OR_RAISE(const auto offsets_length, data_in->ReadValue()); - PAIMON_ASSIGN_OR_RAISE(const auto chunks_length, data_in->ReadValue()); - PAIMON_ASSIGN_OR_RAISE(auto factory, KeyFactory::Create(field_type)); - const auto factory_shared = std::shared_ptr{std::move(factory)}; - auto result = std::make_unique( - pool, input_stream, offset, field_type, factory_shared, size, offsets_length, chunks_length, - offset + header_length + sizeof(int32_t)); + PAIMON_ASSIGN_OR_RAISE(int32_t size, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t offsets_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t chunks_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr factory_shared, + KeyFactory::Create(field_type)); + auto result = std::make_unique(pool, input_stream, factory_shared, size, + offsets_length, chunks_length, + offset + header_length + sizeof(int32_t)); return result; } ChunkedDictionary::ChunkedDictionary(const std::shared_ptr& pool, const 
std::shared_ptr& input_stream, - const int64_t start_of_dictionary, const FieldType field_type, const std::shared_ptr& factory, const int32_t size, const int32_t offsets_length, const int32_t chunks_length, const int64_t body_offset) : pool_(pool), - field_type_(field_type), factory_(factory), input_stream_(input_stream), - start_of_dictionary_(start_of_dictionary), size_(size), offsets_length_(offsets_length), chunks_length_(chunks_length), diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h b/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h index bfdc0435..f42043da 100644 --- a/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h +++ b/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -69,17 +69,16 @@ class ChunkedDictionary final : public Dictionary { const std::shared_ptr& pool, FieldType field_type, const std::shared_ptr& input_stream, int64_t offset); - explicit ChunkedDictionary(const std::shared_ptr& pool, - const std::shared_ptr& input_stream, - int64_t start_of_dictionary, FieldType field_type, - const std::shared_ptr& factory, int32_t size, - int32_t offsets_length, int32_t chunks_length, int64_t body_offset); + ChunkedDictionary(const std::shared_ptr& pool, + const std::shared_ptr& input_stream, + const std::shared_ptr& factory, int32_t size, + int32_t offsets_length, int32_t chunks_length, int64_t body_offset); + + private: std::shared_ptr pool_; - FieldType field_type_; std::shared_ptr factory_; std::shared_ptr input_stream_; - int64_t start_of_dictionary_; int32_t size_; // number of chunks int32_t offsets_length_; // bytes length of offsets int32_t chunks_length_; // bytes length of chunks diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/dictionary.h b/src/paimon/common/file_index/rangebitmap/dictionary/dictionary.h index fb47a0f6..5f9a732b 100644 --- a/src/paimon/common/file_index/rangebitmap/dictionary/dictionary.h +++ b/src/paimon/common/file_index/rangebitmap/dictionary/dictionary.h @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.cpp b/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.cpp index 570b4c58..cb774771 100644 --- a/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.cpp +++ b/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include "fmt/format.h" #include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h" +#include "paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h" #include "paimon/common/io/memory_segment_output_stream.h" #include "paimon/common/memory/memory_segment_utils.h" #include "paimon/common/utils/field_type_utils.h" @@ -35,12 +36,13 @@ Result FixedLengthChunk::TryAdd(const Literal& key) { if (keys_stream_out_ == nullptr) { keys_stream_out_ = std::make_shared( MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); - PAIMON_ASSIGN_OR_RAISE(serializer_, factory_->CreateSerializer()); + PAIMON_ASSIGN_OR_RAISE(serializer_, + LiteralSerDeUtils::CreateValueWriter(factory_->GetFieldType())); } if (fixed_length_ > remaining_keys_size_) { return false; } - PAIMON_RETURN_NOT_OK((*serializer_)(keys_stream_out_, key)); + PAIMON_RETURN_NOT_OK(serializer_(keys_stream_out_, key)); remaining_keys_size_ -= fixed_length_; size_ += 1; return true; @@ -54,19 +56,20 @@ Result FixedLengthChunk::GetKey(const int32_t index) { PAIMON_RETURN_NOT_OK(input_stream_->Seek(keys_base_offset_ + offset_, FS_SEEK_SET)); keys_ = Bytes::AllocateBytes(keys_length_, pool_.get()); PAIMON_RETURN_NOT_OK(input_stream_->Read(keys_->data(), keys_length_)); - PAIMON_ASSIGN_OR_RAISE(deserializer_, factory_->CreateDeserializer()); + PAIMON_ASSIGN_OR_RAISE(deserializer_, + LiteralSerDeUtils::CreateValueReader(factory_->GetFieldType())); keys_stream_in_ = std::make_shared( std::make_shared(keys_->data(), keys_length_)); } PAIMON_RETURN_NOT_OK(keys_stream_in_->Seek(index * fixed_length_)); - return (*deserializer_)(keys_stream_in_, pool_.get()); + return deserializer_(keys_stream_in_, pool_.get()); } Result> FixedLengthChunk::SerializeChunk() const { const auto data_out = std::make_shared( 
MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); data_out->WriteValue(CURRENT_VERSION); - PAIMON_RETURN_NOT_OK((*serializer_)(data_out, key_)); + PAIMON_RETURN_NOT_OK(serializer_(data_out, key_)); data_out->WriteValue(code_); data_out->WriteValue(offset_); data_out->WriteValue(size_); @@ -99,7 +102,7 @@ FixedLengthChunk::FixedLengthChunk(const std::shared_ptr& pool, Lite keys_base_offset_(keys_base_offset), keys_length_(keys_length), fixed_length_(fixed_length), - deserializer_({std::nullopt}), + deserializer_({}), keys_stream_in_(nullptr), keys_(nullptr), remaining_keys_size_(0) {} @@ -119,7 +122,7 @@ FixedLengthChunk::FixedLengthChunk(const std::shared_ptr& pool, Lite keys_base_offset_(0), keys_length_(0), fixed_length_(fixed_length), - deserializer_({std::nullopt}), + deserializer_({}), keys_stream_in_(nullptr), keys_(nullptr), remaining_keys_size_(keys_length_limit) {} diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h b/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h index d1f75c72..10f90db6 100644 --- a/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h +++ b/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,13 +22,14 @@ #include "paimon/common/file_index/rangebitmap/dictionary/chunk.h" #include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h" +#include "paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h" #include "paimon/fs/file_system.h" #include "paimon/predicate/literal.h" #include "paimon/result.h" #include "paimon/status.h" namespace paimon { - +class DataInputStream; class InputStream; class MemoryPool; @@ -67,24 +68,25 @@ class FixedLengthChunk final : public Chunk { int32_t keys_length_limit, const std::shared_ptr& factory, int32_t fixed_length); + private: std::shared_ptr pool_; - Literal key_; // representative key for binary search - int32_t code_; // first code in this chunk - int32_t offset_; // offset of this chunk - int32_t size_; // number of keys in this chunk - std::shared_ptr factory_; // factory for serialization/deserialization + Literal key_; // representative key for binary search + int32_t code_; // first code in this chunk + int32_t offset_; // offset of this chunk + int32_t size_; // number of keys in this chunk + std::shared_ptr factory_; // For read path lazy keys loading std::shared_ptr input_stream_; int32_t keys_base_offset_; int32_t keys_length_; int32_t fixed_length_; - std::optional deserializer_; + LiteralSerDeUtils::Deserializer deserializer_; std::shared_ptr keys_stream_in_; PAIMON_UNIQUE_PTR keys_; // For write path - std::optional serializer_; + LiteralSerDeUtils::Serializer serializer_; std::shared_ptr keys_stream_out_; int64_t remaining_keys_size_; }; diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.cpp b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.cpp index d79b1b2b..c49a6f12 100644 --- a/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.cpp +++ b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,36 +24,31 @@ namespace paimon { -Result> KeyFactory::Create(const FieldType field_type) { +Result> KeyFactory::Create(const FieldType field_type) { // todo: support timestamp switch (field_type) { case FieldType::BOOLEAN: - return std::make_unique(); + return std::make_shared(); case FieldType::TINYINT: - return std::make_unique(); + return std::make_shared(); case FieldType::SMALLINT: - return std::make_unique(); + return std::make_shared(); case FieldType::DATE: - return std::make_unique(); + return std::make_shared(); case FieldType::INT: - return std::make_unique(); + return std::make_shared(); case FieldType::BIGINT: - return std::make_unique(); + return std::make_shared(); case FieldType::FLOAT: - return std::make_unique(); + return std::make_shared(); case FieldType::DOUBLE: - return std::make_unique(); + return std::make_shared(); default: return Status::Invalid(fmt::format("Unsupported field type for KeyFactory: {}", FieldTypeUtils::FieldTypeToString(field_type))); } } -const std::string& KeyFactory::GetDefaultChunkSize() { - static const std::string kDefaultChunkSize = "16kb"; - return kDefaultChunkSize; -} - Result> FixedLengthKeyFactory::CreateChunk( const std::shared_ptr& pool, const Literal& key, const int32_t code, const int32_t keys_length_limit) { @@ -65,41 +60,24 @@ Result> FixedLengthKeyFactory::MmapChunk( const std::shared_ptr& pool, const std::shared_ptr& input_stream, const int32_t chunk_offest, const int32_t keys_base_offset) { PAIMON_RETURN_NOT_OK(input_stream->Seek(chunk_offest, FS_SEEK_SET)); - PAIMON_ASSIGN_OR_RAISE(const auto deserializer, this->CreateDeserializer()); + PAIMON_ASSIGN_OR_RAISE(LiteralSerDeUtils::Deserializer deserializer, + LiteralSerDeUtils::CreateValueReader(GetFieldType())); const auto data_in = std::make_shared(input_stream); - PAIMON_ASSIGN_OR_RAISE(const auto version, 
data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int8_t version, data_in->ReadValue()); if (version != ChunkedDictionary::CURRENT_VERSION) { return Status::Invalid(fmt::format("Unsupported version for KeyFactory: {}", version)); } - PAIMON_ASSIGN_OR_RAISE(const auto key_literal, deserializer(data_in, pool.get())); - PAIMON_ASSIGN_OR_RAISE(const auto code, data_in->ReadValue()); - PAIMON_ASSIGN_OR_RAISE(const auto offset, data_in->ReadValue()); - PAIMON_ASSIGN_OR_RAISE(const auto size, data_in->ReadValue()); - PAIMON_ASSIGN_OR_RAISE(const auto keys_length, data_in->ReadValue()); - PAIMON_ASSIGN_OR_RAISE(const auto fixed_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(Literal key_literal, deserializer(data_in, pool.get())); + PAIMON_ASSIGN_OR_RAISE(int32_t code, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t offset, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t size, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t keys_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t fixed_length, data_in->ReadValue()); return std::make_unique(pool, key_literal, code, offset, size, this->shared_from_this(), input_stream, keys_base_offset, keys_length, fixed_length); } -Result FixedLengthKeyFactory::CreateSerializer() { - return KeySerializer([this](const std::shared_ptr& out, - const Literal& literal) -> Status { - PAIMON_ASSIGN_OR_RAISE(const auto writer, - LiteralSerializationUtils::CreateValueWriter(GetFieldType(), out)); - return writer(literal); - }); -} - -Result FixedLengthKeyFactory::CreateDeserializer() { - return KeyDeserializer([this](const std::shared_ptr& in, - MemoryPool* pool) -> Result { - PAIMON_ASSIGN_OR_RAISE( - auto reader, LiteralSerializationUtils::CreateValueReader(GetFieldType(), in, pool)); - return reader(); - }); -} - Result> VariableLengthKeyFactory::CreateChunk( const std::shared_ptr& pool, const Literal& key, int32_t code, int32_t keys_length_limit) { @@ -111,12 +89,4 @@ Result> 
VariableLengthKeyFactory::MmapChunk( return Status::NotImplemented("VariableLengthKeyFactory::MmapChunk not implemented"); } -Result VariableLengthKeyFactory::CreateSerializer() { - return Status::NotImplemented("VariableLengthKeyFactory::CreateSerializer not implemented"); -} - -Result VariableLengthKeyFactory::CreateDeserializer() { - return Status::NotImplemented("VariableLengthKeyFactory::CreateDeserializer not implemented"); -} - } // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h index b7df1bed..079c19f4 100644 --- a/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h +++ b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,16 +16,13 @@ #pragma once -#include #include #include "paimon/common/file_index/rangebitmap/dictionary/chunk.h" #include "paimon/common/io/memory_segment_output_stream.h" #include "paimon/defs.h" -#include "paimon/io/data_input_stream.h" #include "paimon/predicate/literal.h" #include "paimon/result.h" -#include "paimon/status.h" namespace paimon { @@ -38,11 +35,6 @@ class KeyFactory : public std::enable_shared_from_this { virtual FieldType GetFieldType() const = 0; - using KeySerializer = - std::function&, const Literal&)>; - using KeyDeserializer = - std::function(const std::shared_ptr&, MemoryPool*)>; - /// For writing new chunk virtual Result> CreateChunk(const std::shared_ptr& pool, const Literal& key, int32_t code, @@ -53,13 +45,7 @@ class KeyFactory : public std::enable_shared_from_this { const std::shared_ptr& pool, const std::shared_ptr& input_stream, int32_t chunk_offest, int32_t keys_base_offset) = 0; - virtual Result CreateSerializer() = 0; - - virtual Result CreateDeserializer() = 0; - - static Result> Create(FieldType field_type); - - static const std::string& GetDefaultChunkSize(); + static Result> Create(FieldType field_type); }; class FixedLengthKeyFactory : public KeyFactory { @@ -71,8 +57,6 @@ class FixedLengthKeyFactory : public KeyFactory { const std::shared_ptr& input_stream, int32_t chunk_offest, int32_t keys_base_offset) override; - Result CreateSerializer() override; - Result CreateDeserializer() override; virtual size_t GetFieldSize() const = 0; }; @@ -85,8 +69,6 @@ class VariableLengthKeyFactory : public KeyFactory { const std::shared_ptr& input_stream, int32_t chunk_offest, int32_t keys_base_offset) override; - Result CreateSerializer() override; - Result CreateDeserializer() override; }; class DateKeyFactory final : public FixedLengthKeyFactory { diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp index 423c85e0..761b19f0 100644 --- 
a/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,30 +36,31 @@ Result> RangeBitmap::Create( const FieldType field_type, const std::shared_ptr& pool) { PAIMON_RETURN_NOT_OK(input_stream->Seek(offset, SeekOrigin::FS_SEEK_SET)); const auto data_in = std::make_shared(input_stream); - PAIMON_ASSIGN_OR_RAISE(const auto header_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t header_length, data_in->ReadValue()); PAIMON_ASSIGN_OR_RAISE(int8_t version, data_in->ReadValue()); if (version != VERSION) { return Status::Invalid( fmt::format("RangeBitmap unsupported version {} (expected {})", version, VERSION)); } - PAIMON_ASSIGN_OR_RAISE(const auto rid, data_in->ReadValue()); - PAIMON_ASSIGN_OR_RAISE(const auto cardinality, data_in->ReadValue()); - PAIMON_ASSIGN_OR_RAISE(auto key_factory, KeyFactory::Create(field_type)); - const auto shared_key_factory = std::shared_ptr{std::move(key_factory)}; - PAIMON_ASSIGN_OR_RAISE(const auto key_deserializer, shared_key_factory->CreateDeserializer()); - PAIMON_ASSIGN_OR_RAISE(auto min, key_deserializer(data_in, pool.get())); - PAIMON_ASSIGN_OR_RAISE(auto max, key_deserializer(data_in, pool.get())); - PAIMON_ASSIGN_OR_RAISE(const auto dictionary_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t rid, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t cardinality, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr shared_key_factory, + KeyFactory::Create(field_type)); + PAIMON_ASSIGN_OR_RAISE(LiteralSerDeUtils::Deserializer key_deserializer, + LiteralSerDeUtils::CreateValueReader(field_type)); + PAIMON_ASSIGN_OR_RAISE(Literal min, key_deserializer(data_in, pool.get())); + 
PAIMON_ASSIGN_OR_RAISE(Literal max, key_deserializer(data_in, pool.get())); + PAIMON_ASSIGN_OR_RAISE(int32_t dictionary_length, data_in->ReadValue()); const auto dictionary_offset = static_cast(offset + sizeof(int32_t) + header_length); const auto bsi_offset = dictionary_offset + dictionary_length; return std::unique_ptr(new RangeBitmap(pool, rid, cardinality, dictionary_offset, - bsi_offset, std::move(min), std::move(max), - shared_key_factory, input_stream)); + bsi_offset, min, max, shared_key_factory, + input_stream)); } Result RangeBitmap::Not(RoaringBitmap32& bitmap) { bitmap.Flip(0, rid_); - PAIMON_ASSIGN_OR_RAISE(const auto is_not_null, this->IsNotNull()); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 is_not_null, this->IsNotNull()); return bitmap &= is_not_null; } @@ -67,17 +68,17 @@ Result RangeBitmap::Eq(const Literal& key) { if (cardinality_ <= 0) { return RoaringBitmap32(); } - PAIMON_ASSIGN_OR_RAISE(const auto min_compare, key.CompareTo(min_)); - PAIMON_ASSIGN_OR_RAISE(const auto max_compare, key.CompareTo(max_)); - PAIMON_ASSIGN_OR_RAISE(const auto bit_slice_ptr, this->GetBitSliceIndex()); + PAIMON_ASSIGN_OR_RAISE(int32_t min_compare, key.CompareTo(min_)); + PAIMON_ASSIGN_OR_RAISE(int32_t max_compare, key.CompareTo(max_)); + PAIMON_ASSIGN_OR_RAISE(BitSliceIndexBitmap* const bit_slice_ptr, this->GetBitSliceIndex()); if (min_compare == 0 && max_compare == 0) { return bit_slice_ptr->IsNotNull({}); } if (min_compare < 0 || max_compare > 0) { return RoaringBitmap32(); } - PAIMON_ASSIGN_OR_RAISE(const auto dictionary, this->GetDictionary()); - PAIMON_ASSIGN_OR_RAISE(const auto code, dictionary->Find(key)); + PAIMON_ASSIGN_OR_RAISE(Dictionary* const dictionary, this->GetDictionary()); + PAIMON_ASSIGN_OR_RAISE(int32_t code, dictionary->Find(key)); if (code < 0) { return RoaringBitmap32(); } @@ -88,7 +89,7 @@ Result RangeBitmap::Neq(const Literal& key) { if (cardinality_ <= 0) { return RoaringBitmap32(); } - PAIMON_ASSIGN_OR_RAISE(auto eq_result, Eq(key)); + 
PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 eq_result, Eq(key)); return Not(eq_result); } @@ -96,15 +97,15 @@ Result RangeBitmap::Lt(const Literal& key) { if (cardinality_ <= 0) { return RoaringBitmap32(); } - PAIMON_ASSIGN_OR_RAISE(const auto min_compare, key.CompareTo(min_)); - PAIMON_ASSIGN_OR_RAISE(const auto max_compare, key.CompareTo(max_)); + PAIMON_ASSIGN_OR_RAISE(int32_t min_compare, key.CompareTo(min_)); + PAIMON_ASSIGN_OR_RAISE(int32_t max_compare, key.CompareTo(max_)); if (max_compare > 0) { return IsNotNull(); } if (min_compare <= 0) { return RoaringBitmap32(); } - PAIMON_ASSIGN_OR_RAISE(auto gte_result, Gte(key)); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 gte_result, Gte(key)); return Not(gte_result); } @@ -112,15 +113,15 @@ Result RangeBitmap::Lte(const Literal& key) { if (cardinality_ <= 0) { return RoaringBitmap32(); } - PAIMON_ASSIGN_OR_RAISE(const auto min_compare, key.CompareTo(min_)); - PAIMON_ASSIGN_OR_RAISE(const auto max_compare, key.CompareTo(max_)); + PAIMON_ASSIGN_OR_RAISE(int32_t min_compare, key.CompareTo(min_)); + PAIMON_ASSIGN_OR_RAISE(int32_t max_compare, key.CompareTo(max_)); if (max_compare >= 0) { return IsNotNull(); } if (min_compare < 0) { return RoaringBitmap32(); } - PAIMON_ASSIGN_OR_RAISE(auto gt_result, Gt(key)); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 gt_result, Gt(key)); return Not(gt_result); } @@ -128,17 +129,17 @@ Result RangeBitmap::Gt(const Literal& key) { if (cardinality_ <= 0) { return RoaringBitmap32(); } - PAIMON_ASSIGN_OR_RAISE(const auto max_compare, key.CompareTo(max_)); + PAIMON_ASSIGN_OR_RAISE(int32_t max_compare, key.CompareTo(max_)); if (max_compare >= 0) { return RoaringBitmap32(); } - PAIMON_ASSIGN_OR_RAISE(const auto min_compare, key.CompareTo(min_)); + PAIMON_ASSIGN_OR_RAISE(int32_t min_compare, key.CompareTo(min_)); if (min_compare < 0) { return IsNotNull(); } - PAIMON_ASSIGN_OR_RAISE(const auto dictionary, this->GetDictionary()); - PAIMON_ASSIGN_OR_RAISE(const auto code, dictionary->Find(key)); - 
PAIMON_ASSIGN_OR_RAISE(const auto bit_slice_ptr, this->GetBitSliceIndex()); + PAIMON_ASSIGN_OR_RAISE(Dictionary* const dictionary, this->GetDictionary()); + PAIMON_ASSIGN_OR_RAISE(int32_t code, dictionary->Find(key)); + PAIMON_ASSIGN_OR_RAISE(BitSliceIndexBitmap* const bit_slice_ptr, this->GetBitSliceIndex()); if (code >= 0) { return bit_slice_ptr->Gt(code); } @@ -146,8 +147,8 @@ Result RangeBitmap::Gt(const Literal& key) { } Result RangeBitmap::Gte(const Literal& key) { - PAIMON_ASSIGN_OR_RAISE(auto gt_result, Gt(key)); - PAIMON_ASSIGN_OR_RAISE(const auto eq_result, Eq(key)); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 gt_result, Gt(key)); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 eq_result, Eq(key)); gt_result |= eq_result; return gt_result; } @@ -158,7 +159,7 @@ Result RangeBitmap::In(const std::vector& keys) { } RoaringBitmap32 result{}; for (const auto& key : keys) { - PAIMON_ASSIGN_OR_RAISE(const auto bitmap, Eq(key)); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 bitmap, Eq(key)); result |= bitmap; } return result; @@ -168,7 +169,7 @@ Result RangeBitmap::NotIn(const std::vector& keys) { if (cardinality_ <= 0) { return RoaringBitmap32(); } - PAIMON_ASSIGN_OR_RAISE(auto in_result, In(keys)); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 in_result, In(keys)); return Not(in_result); } @@ -182,7 +183,7 @@ Result RangeBitmap::IsNull() { return RoaringBitmap32(); } - PAIMON_ASSIGN_OR_RAISE(auto non_null_bitmap, IsNotNull()); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 non_null_bitmap, IsNotNull()); non_null_bitmap.Flip(0, rid_); return non_null_bitmap; } @@ -192,14 +193,14 @@ Result RangeBitmap::IsNotNull() { return RoaringBitmap32(); } - PAIMON_ASSIGN_OR_RAISE(const auto bit_slice_ptr, this->GetBitSliceIndex()); - PAIMON_ASSIGN_OR_RAISE(auto result, bit_slice_ptr->IsNotNull({})); + PAIMON_ASSIGN_OR_RAISE(BitSliceIndexBitmap* const bit_slice_ptr, this->GetBitSliceIndex()); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 result, bit_slice_ptr->IsNotNull({})); return result; } 
RangeBitmap::RangeBitmap(const std::shared_ptr& pool, const int32_t rid, const int32_t cardinality, const int32_t dictionary_offset, - const int32_t bsi_offset, Literal&& min, Literal&& max, + const int32_t bsi_offset, const Literal& min, const Literal& max, const std::shared_ptr& key_factory, const std::shared_ptr& input_stream) : pool_(pool), @@ -207,8 +208,8 @@ RangeBitmap::RangeBitmap(const std::shared_ptr& pool, const int32_t cardinality_(cardinality), bsi_offset_(bsi_offset), dictionary_offset_(dictionary_offset), - min_(std::move(min)), - max_(std::move(max)), + min_(min), + max_(max), key_factory_(key_factory), input_stream_(input_stream), bsi_(nullptr), @@ -258,17 +259,16 @@ Result> RangeBitmap::Appender::Serialize() const { } code++; } - PAIMON_ASSIGN_OR_RAISE(const auto serializer, factory_->CreateSerializer()); + PAIMON_ASSIGN_OR_RAISE(LiteralSerDeUtils::Serializer serializer, + LiteralSerDeUtils::CreateValueWriter(factory_->GetFieldType())); auto min = Literal{factory_->GetFieldType()}; auto max = Literal{factory_->GetFieldType()}; if (!bitmaps_.empty()) { min = bitmaps_.begin()->first; max = bitmaps_.rbegin()->first; } - PAIMON_ASSIGN_OR_RAISE(const auto min_size, - LiteralSerializationUtils::GetSerializedSizeInBytes(min)); - PAIMON_ASSIGN_OR_RAISE(const auto max_size, - LiteralSerializationUtils::GetSerializedSizeInBytes(max)); + PAIMON_ASSIGN_OR_RAISE(int32_t min_size, LiteralSerDeUtils::GetSerializedSizeInBytes(min)); + PAIMON_ASSIGN_OR_RAISE(int32_t max_size, LiteralSerDeUtils::GetSerializedSizeInBytes(max)); int32_t header_size = 0; header_size += sizeof(int8_t); // version header_size += sizeof(int32_t); // rid @@ -276,9 +276,9 @@ Result> RangeBitmap::Appender::Serialize() const { header_size += min.IsNull() ? 0 : min_size; // min literal size header_size += max.IsNull() ? 
0 : max_size; // max literal size header_size += sizeof(int32_t); // dictionary length - PAIMON_ASSIGN_OR_RAISE(const auto dictionary_bytes, dictionary.Serialize()); + PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR dictionary_bytes, dictionary.Serialize()); const auto dictionary_length = static_cast(dictionary_bytes->size()); - PAIMON_ASSIGN_OR_RAISE(const auto bsi_bytes, bsi.Serialize()); + PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR bsi_bytes, bsi.Serialize()); const auto bsi_length = bsi_bytes->size(); const auto data_output_stream = std::make_shared( MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap.h b/src/paimon/common/file_index/rangebitmap/range_bitmap.h index 6d61c6e8..17070ef0 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap.h +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap.h @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -56,11 +56,13 @@ class RangeBitmap { Result Not(RoaringBitmap32& bitmap); RangeBitmap(const std::shared_ptr& pool, int32_t rid, int32_t cardinality, - int32_t dictionary_offset, int32_t bsi_offset, Literal&& min, Literal&& max, - const std::shared_ptr& key_factory, + int32_t dictionary_offset, int32_t bsi_offset, const Literal& min, + const Literal& max, const std::shared_ptr& key_factory, const std::shared_ptr& input_stream); Result GetBitSliceIndex(); Result GetDictionary(); + + private: std::shared_ptr pool_; int32_t rid_; int32_t cardinality_; diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp index 09cf0f9a..d8136072 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,7 +16,6 @@ #include "paimon/common/file_index/rangebitmap/range_bitmap_file_index.h" -#include #include #include @@ -38,10 +37,10 @@ RangeBitmapFileIndex::RangeBitmapFileIndex(const std::map> RangeBitmapFileIndex::CreateReader( - ArrowSchema* const arrow_schema, const int32_t start, const int32_t length, + ::ArrowSchema* const arrow_schema, const int32_t start, const int32_t length, const std::shared_ptr& input_stream, const std::shared_ptr& pool) const { - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(const auto arrow_schema_ptr, + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr arrow_schema_ptr, arrow::ImportSchema(arrow_schema)); if (arrow_schema_ptr->num_fields() != 1) { return Status::Invalid( @@ -52,8 +51,8 @@ Result> RangeBitmapFileIndex::CreateReader( } Result> RangeBitmapFileIndex::CreateWriter( - ArrowSchema* arrow_schema, const std::shared_ptr& pool) const { - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(const auto arrow_schema_ptr, + ::ArrowSchema* arrow_schema, const std::shared_ptr& pool) const { + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr arrow_schema_ptr, arrow::ImportSchema(arrow_schema)); if (arrow_schema_ptr->num_fields() != 1) { return Status::Invalid( @@ -69,14 +68,14 @@ Result> RangeBitmapFileIndexWriter:: const std::map& options, const std::shared_ptr& pool) { const auto field = arrow_schema->GetFieldByName(field_name); if (!field) { - return Status::Invalid("Field not found in schema: " + field_name); + return Status::Invalid(fmt::format("Field not found in schema: {}", field_name)); } - PAIMON_ASSIGN_OR_RAISE(auto field_type, + PAIMON_ASSIGN_OR_RAISE(FieldType field_type, FieldTypeUtils::ConvertToFieldType(field->type()->id())); - PAIMON_ASSIGN_OR_RAISE(auto key_factory, KeyFactory::Create(field_type)); - const auto shared_key_factory = std::shared_ptr{std::move(key_factory)}; - const auto& chunk_size = KeyFactory::GetDefaultChunkSize(); - PAIMON_ASSIGN_OR_RAISE(auto parsed_chunk_size, MemorySize::ParseBytes(chunk_size)); + 
PAIMON_ASSIGN_OR_RAISE(std::shared_ptr shared_key_factory, + KeyFactory::Create(field_type)); + PAIMON_ASSIGN_OR_RAISE(int64_t parsed_chunk_size, + MemorySize::ParseBytes(RangeBitmapFileIndexWriter::DEFAULT_CHUNK_SIZE)); if (const auto chunk_size_it = options.find(RangeBitmapFileIndex::CHUNK_SIZE); chunk_size_it != options.end()) { PAIMON_ASSIGN_OR_RAISE(parsed_chunk_size, MemorySize::ParseBytes(chunk_size_it->second)); @@ -86,14 +85,14 @@ Result> RangeBitmapFileIndexWriter:: } auto appender_ptr = std::make_unique(pool, shared_key_factory, parsed_chunk_size); - return std::make_shared(field->type(), field_type, options, pool, - parsed_chunk_size, shared_key_factory, - std::move(appender_ptr)); + return std::make_shared( + field->type(), options, pool, shared_key_factory, std::move(appender_ptr)); } Status RangeBitmapFileIndexWriter::AddBatch(::ArrowArray* batch) { - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(const auto array, arrow::ImportArray(batch, arrow_type_)); - PAIMON_ASSIGN_OR_RAISE(const auto array_values, + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr array, + arrow::ImportArray(batch, arrow_type_)); + PAIMON_ASSIGN_OR_RAISE(std::vector array_values, LiteralConverter::ConvertLiteralsFromArray(*array, true)); for (const auto& literal : array_values) { appender_->Append(literal); @@ -106,16 +105,13 @@ Result> RangeBitmapFileIndexWriter::SerializedBytes() c } RangeBitmapFileIndexWriter::RangeBitmapFileIndexWriter( - const std::shared_ptr& arrow_type, const FieldType field_type, + const std::shared_ptr& arrow_type, const std::map& options, const std::shared_ptr& pool, - const int64_t chunk_size, const std::shared_ptr& key_factory, - std::unique_ptr appender) + const std::shared_ptr& key_factory, std::unique_ptr appender) : arrow_type_(arrow_type), - field_type_(field_type), options_(options), pool_(pool), key_factory_(key_factory), - chunk_size_(chunk_size), appender_(std::move(appender)) {} Result> RangeBitmapFileIndexReader::Create( @@ -124,9 +120,9 @@ Result> 
RangeBitmapFileIndexReader:: if (!arrow_type || !input_stream || !pool) { return Status::Invalid("RangeBitmapFileIndexReader::Create: null argument"); } - PAIMON_ASSIGN_OR_RAISE(const FieldType field_type, + PAIMON_ASSIGN_OR_RAISE(FieldType field_type, FieldTypeUtils::ConvertToFieldType(arrow_type->id())); - PAIMON_ASSIGN_OR_RAISE(auto range_bitmap, + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr range_bitmap, RangeBitmap::Create(input_stream, start, field_type, pool)); return std::shared_ptr( new RangeBitmapFileIndexReader(std::move(range_bitmap))); diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h index 21c8d412..e0b6561a 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -41,12 +41,12 @@ class RangeBitmapFileIndex final : public FileIndexer { ~RangeBitmapFileIndex() override = default; Result> CreateReader( - ArrowSchema* arrow_schema, int32_t start, int32_t length, + ::ArrowSchema* arrow_schema, int32_t start, int32_t length, const std::shared_ptr& input_stream, const std::shared_ptr& pool) const override; Result> CreateWriter( - ArrowSchema* arrow_schema, const std::shared_ptr& pool) const override; + ::ArrowSchema* arrow_schema, const std::shared_ptr& pool) const override; static constexpr char CHUNK_SIZE[] = "chunk-size"; @@ -56,25 +56,26 @@ class RangeBitmapFileIndex final : public FileIndexer { class RangeBitmapFileIndexWriter final : public FileIndexWriter { public: + static constexpr char DEFAULT_CHUNK_SIZE[] = "16kb"; + static Result> Create( const std::shared_ptr& arrow_schema, const std::string& field_name, const std::map& options, const std::shared_ptr& pool); - Status AddBatch(ArrowArray* batch) override; + Status AddBatch(::ArrowArray* batch) override; Result> SerializedBytes() const override; RangeBitmapFileIndexWriter(const std::shared_ptr& arrow_type, - FieldType field_type, const std::map& options, - const std::shared_ptr& pool, int64_t chunk_size, + const std::shared_ptr& pool, const std::shared_ptr& key_factory, std::unique_ptr appender); + + private: std::shared_ptr arrow_type_; - FieldType field_type_; std::map options_; std::shared_ptr pool_; std::shared_ptr key_factory_; - int64_t chunk_size_; std::unique_ptr appender_; }; diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.cpp index 6a10bc24..c27736fd 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.cpp +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h index 402a927e..844bd5d2 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_test.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_test.cpp index d84ebd8f..8aa81dae 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_test.cpp +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -71,8 +71,7 @@ template Result> CreateReaderForTest( RangeBitmapFileIndexTest* test, const std::shared_ptr& arrow_type, const std::vector& test_data, PAIMON_UNIQUE_PTR* serialized_bytes_out) { - static const std::map kEmptyOptions; - return CreateReaderForTest(test, arrow_type, test_data, kEmptyOptions, + return CreateReaderForTest(test, arrow_type, test_data, {}, serialized_bytes_out); } @@ -86,34 +85,35 @@ Result> CreateReaderForTest( auto builder = std::make_shared(); auto status = builder->AppendValues(test_data); if (!status.ok()) { - return Status::Invalid("Failed to append values: " + status.ToString()); + return Status::Invalid(fmt::format("Failed to append values: {}", status.ToString())); } std::shared_ptr arrow_array; status = builder->Finish(&arrow_array); if (!status.ok()) { - return Status::Invalid("Failed to finish builder: " + status.ToString()); + return Status::Invalid(fmt::format("Failed to finish builder: {}", status.ToString())); } - const auto c_array = std::make_unique<::ArrowArray>(); + auto c_array = std::make_unique<::ArrowArray>(); status = arrow::ExportArray(*arrow_array, c_array.get()); if (!status.ok()) { - return Status::Invalid("Failed to export array: " + status.ToString()); + return Status::Invalid(fmt::format("Failed to export array: {}", status.ToString())); } // Create schema for the field const auto schema = arrow::schema({arrow::field("test_field", arrow_type)}); // Create writer - PAIMON_ASSIGN_OR_RAISE(const auto writer, RangeBitmapFileIndexWriter::Create( - schema, "test_field", options, test->pool_)); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr writer, + RangeBitmapFileIndexWriter::Create(schema, "test_field", options, test->pool_)); // Add the batch PAIMON_RETURN_NOT_OK(writer->AddBatch(c_array.get())); // Get serialized payload - PAIMON_ASSIGN_OR_RAISE(auto serialized_bytes, writer->SerializedBytes()); + PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR serialized_bytes, writer->SerializedBytes()); if (!serialized_bytes || 
serialized_bytes->size() == 0) { return Status::Invalid("Serialized bytes is empty"); } *serialized_bytes_out = std::move(serialized_bytes); const auto input_stream = std::make_shared( (*serialized_bytes_out)->data(), (*serialized_bytes_out)->size()); - PAIMON_ASSIGN_OR_RAISE(auto reader, + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr reader, RangeBitmapFileIndexReader::Create( arrow_type, 0, static_cast((*serialized_bytes_out)->size()), input_stream, test->pool_)); @@ -132,8 +132,8 @@ TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexMultiChunk) { const auto& arrow_type = arrow::int32(); std::map options; // Configure a very small chunk size in bytes so that the dictionary must - // spill into multiple chunks. - options[RangeBitmapFileIndex::CHUNK_SIZE] = "99b"; + // be split into multiple chunks. + options[RangeBitmapFileIndex::CHUNK_SIZE] = "86b"; PAIMON_UNIQUE_PTR serialized_bytes; ASSERT_OK_AND_ASSIGN(auto reader, diff --git a/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.cpp b/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.cpp index cb8e6bdd..bada5916 100644 --- a/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.cpp +++ b/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,55 +22,63 @@ namespace paimon { -Result> LiteralSerializationUtils::CreateValueWriter( - const FieldType field_type, const std::shared_ptr& output_stream) { +Result LiteralSerDeUtils::CreateValueWriter( + const FieldType field_type) { switch (field_type) { case FieldType::BOOLEAN: - return std::function( - [output_stream](const Literal& literal) -> Status { + return LiteralSerDeUtils::Serializer( + [](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { output_stream->WriteValue(literal.GetValue()); return Status::OK(); }); case FieldType::TINYINT: - return std::function( - [output_stream](const Literal& literal) -> Status { + return LiteralSerDeUtils::Serializer( + [](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { output_stream->WriteValue(literal.GetValue()); return Status::OK(); }); case FieldType::SMALLINT: - return std::function( - [output_stream](const Literal& literal) -> Status { + return LiteralSerDeUtils::Serializer( + [](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { output_stream->WriteValue(literal.GetValue()); return Status::OK(); }); case FieldType::DATE: case FieldType::INT: - return std::function( - [output_stream](const Literal& literal) -> Status { + return LiteralSerDeUtils::Serializer( + [](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { output_stream->WriteValue(literal.GetValue()); return Status::OK(); }); case FieldType::BIGINT: - return std::function( - [output_stream](const Literal& literal) -> Status { + return LiteralSerDeUtils::Serializer( + [](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { output_stream->WriteValue(literal.GetValue()); return Status::OK(); }); case FieldType::FLOAT: - return std::function( - [output_stream](const Literal& literal) -> Status { + return LiteralSerDeUtils::Serializer( + [](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { 
output_stream->WriteValue(literal.GetValue()); return Status::OK(); }); case FieldType::DOUBLE: - return std::function( - [output_stream](const Literal& literal) -> Status { + return LiteralSerDeUtils::Serializer( + [](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { output_stream->WriteValue(literal.GetValue()); return Status::OK(); }); case FieldType::STRING: { - return std::function( - [output_stream](const Literal& literal) -> Status { + return LiteralSerDeUtils::Serializer( + [](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { const auto value = literal.GetValue(); output_stream->WriteValue(static_cast(value.size())); output_stream->Write(value.data(), value.size()); @@ -84,64 +92,80 @@ Result> LiteralSerializationUtils::CreateV } } -Result()>> LiteralSerializationUtils::CreateValueReader( - FieldType field_type, const std::shared_ptr& input_stream, MemoryPool* pool) { +Result LiteralSerDeUtils::CreateValueReader(FieldType field_type) { switch (field_type) { case FieldType::BOOLEAN: { - return std::function()>([input_stream]() -> Result { - PAIMON_ASSIGN_OR_RAISE(bool value, input_stream->ReadValue()); - return Literal(value); - }); + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(bool value, input_stream->ReadValue()); + return Literal(value); + }); } case FieldType::TINYINT: { - return std::function()>([input_stream]() -> Result { - PAIMON_ASSIGN_OR_RAISE(int8_t value, input_stream->ReadValue()); - return Literal(value); - }); + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(int8_t value, input_stream->ReadValue()); + return Literal(value); + }); } case FieldType::SMALLINT: { - return std::function()>([input_stream]() -> Result { - PAIMON_ASSIGN_OR_RAISE(int16_t value, input_stream->ReadValue()); - return Literal(value); - 
}); + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(int16_t value, input_stream->ReadValue()); + return Literal(value); + }); } case FieldType::DATE: { - return std::function()>([input_stream]() -> Result { - PAIMON_ASSIGN_OR_RAISE(int32_t value, input_stream->ReadValue()); - return Literal(FieldType::DATE, value); - }); + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(int32_t value, input_stream->ReadValue()); + return Literal(FieldType::DATE, value); + }); } case FieldType::INT: { - return std::function()>([input_stream]() -> Result { - PAIMON_ASSIGN_OR_RAISE(int32_t value, input_stream->ReadValue()); - return Literal(value); - }); + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(int32_t value, input_stream->ReadValue()); + return Literal(value); + }); } case FieldType::BIGINT: { - return std::function()>([input_stream]() -> Result { - PAIMON_ASSIGN_OR_RAISE(int64_t value, input_stream->ReadValue()); - return Literal(value); - }); + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(int64_t value, input_stream->ReadValue()); + return Literal(value); + }); } case FieldType::FLOAT: { - return std::function()>([input_stream]() -> Result { - PAIMON_ASSIGN_OR_RAISE(float value, input_stream->ReadValue()); - return Literal(value); - }); + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(float value, input_stream->ReadValue()); + return Literal(value); + }); } case FieldType::DOUBLE: { - return std::function()>([input_stream]() -> Result { - PAIMON_ASSIGN_OR_RAISE(double value, input_stream->ReadValue()); - return 
Literal(value); - }); + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(double value, input_stream->ReadValue()); + return Literal(value); + }); } case FieldType::STRING: { - return std::function()>( - [input_stream, field_type, pool]() -> Result { + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { PAIMON_ASSIGN_OR_RAISE(int32_t length, input_stream->ReadValue()); auto bytes = Bytes::AllocateBytes(length, pool); PAIMON_RETURN_NOT_OK(input_stream->ReadBytes(bytes.get())); - return Literal(field_type, bytes->data(), bytes->size()); + return Literal(FieldType::STRING, bytes->data(), bytes->size()); }); } default: @@ -151,7 +175,7 @@ Result()>> LiteralSerializationUtils::CreateValueR } } -Result LiteralSerializationUtils::GetFixedFieldSize(const FieldType& field_type) { +Result LiteralSerDeUtils::GetFixedFieldSize(const FieldType& field_type) { switch (field_type) { case FieldType::BOOLEAN: case FieldType::TINYINT: @@ -172,7 +196,8 @@ Result LiteralSerializationUtils::GetFixedFieldSize(const FieldType& fi FieldTypeUtils::FieldTypeToString(field_type))); } } -Result LiteralSerializationUtils::GetSerializedSizeInBytes(const Literal& literal) { + +Result LiteralSerDeUtils::GetSerializedSizeInBytes(const Literal& literal) { switch (literal.GetType()) { case FieldType::BOOLEAN: case FieldType::TINYINT: diff --git a/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h b/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h index 93f247db..f7e7bbb5 100644 --- a/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h +++ b/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h @@ -1,5 +1,5 @@ /* - * Copyright 2024-present Alibaba Inc. + * Copyright 2026-present Alibaba Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,18 +27,20 @@ namespace paimon { -class LiteralSerializationUtils { +class LiteralSerDeUtils { public: - LiteralSerializationUtils() = delete; + LiteralSerDeUtils() = delete; - ~LiteralSerializationUtils() = delete; + ~LiteralSerDeUtils() = delete; - static Result()>> CreateValueReader( - FieldType field_type, const std::shared_ptr& input_stream, - MemoryPool* pool = nullptr); + using Serializer = + std::function&, const Literal&)>; + using Deserializer = std::function( + const std::shared_ptr& input_stream, MemoryPool* pool)>; - static Result> CreateValueWriter( - FieldType field_type, const std::shared_ptr& output_stream); + static Result CreateValueReader(FieldType field_type); + + static Result CreateValueWriter(FieldType field_type); static Result GetFixedFieldSize(const FieldType& field_type); From c651cbc8eac2406e6ece699631329d0e857454cf Mon Sep 17 00:00:00 2001 From: xiaoheng Date: Sat, 28 Feb 2026 08:52:53 +0800 Subject: [PATCH 3/6] fix: copyright to 2026, remove unnecessary abstraction of serde in keyfactory --- .../common/file_index/rangebitmap/dictionary/key_factory.h | 3 +++ .../common/file_index/rangebitmap/range_bitmap_file_index.cpp | 2 +- .../common/file_index/rangebitmap/range_bitmap_file_index.h | 3 +-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h index 079c19f4..75e1752f 100644 --- a/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h +++ b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h @@ -46,6 +46,9 @@ class KeyFactory : public std::enable_shared_from_this { int32_t chunk_offest, int32_t keys_base_offset) = 0; static Result> Create(FieldType field_type); + + public: + static constexpr char DEFAULT_CHUNK_SIZE[] = "16kb"; }; 
class FixedLengthKeyFactory : public KeyFactory { diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp index d8136072..2b9f9bfd 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp @@ -75,7 +75,7 @@ Result> RangeBitmapFileIndexWriter:: PAIMON_ASSIGN_OR_RAISE(std::shared_ptr shared_key_factory, KeyFactory::Create(field_type)); PAIMON_ASSIGN_OR_RAISE(int64_t parsed_chunk_size, - MemorySize::ParseBytes(RangeBitmapFileIndexWriter::DEFAULT_CHUNK_SIZE)); + MemorySize::ParseBytes(KeyFactory::DEFAULT_CHUNK_SIZE)); if (const auto chunk_size_it = options.find(RangeBitmapFileIndex::CHUNK_SIZE); chunk_size_it != options.end()) { PAIMON_ASSIGN_OR_RAISE(parsed_chunk_size, MemorySize::ParseBytes(chunk_size_it->second)); diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h index e0b6561a..c1210259 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h @@ -48,6 +48,7 @@ class RangeBitmapFileIndex final : public FileIndexer { Result> CreateWriter( ::ArrowSchema* arrow_schema, const std::shared_ptr& pool) const override; + public: static constexpr char CHUNK_SIZE[] = "chunk-size"; private: @@ -56,8 +57,6 @@ class RangeBitmapFileIndex final : public FileIndexer { class RangeBitmapFileIndexWriter final : public FileIndexWriter { public: - static constexpr char DEFAULT_CHUNK_SIZE[] = "16kb"; - static Result> Create( const std::shared_ptr& arrow_schema, const std::string& field_name, const std::map& options, const std::shared_ptr& pool); From 80b4852ac427bfafd34abe07810d9ae4cf0d002f Mon Sep 17 00:00:00 2001 From: xiaoheng Date: Sat, 28 Feb 2026 09:39:12 +0800 Subject: [PATCH 4/6] fix: code style 
--- .../rangebitmap/bit_slice_index_bitmap.cpp | 4 ++-- .../rangebitmap/bit_slice_index_bitmap.h | 2 +- .../rangebitmap/dictionary/chunked_dictionary.h | 1 + .../file_index/rangebitmap/range_bitmap.cpp | 16 ++++++++-------- .../common/file_index/rangebitmap/range_bitmap.h | 2 +- .../rangebitmap/range_bitmap_file_index.h | 3 +++ 6 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp index 9309a7e3..d1827185 100644 --- a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp +++ b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp @@ -251,8 +251,8 @@ Result> BitSliceIndexBitmap::Appender::Serialize() cons int32_t offset = 0; auto data_output_stream = std::make_unique( MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); - auto slices_bytes_vector = std::vector>{}; - auto indexes_vector = std::vector>{}; + std::vector> slices_bytes_vector{}; + std::vector> indexes_vector{}; for (const auto& slice : slices_) { auto slice_bytes = slice.Serialize(pool_.get()); const auto length = static_cast(slice_bytes->size()); diff --git a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h index 0ef4c5e1..0767b6fc 100644 --- a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h +++ b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h @@ -30,7 +30,7 @@ namespace paimon { class BitSliceIndexBitmap { public: - static constexpr int CURRENT_VERSION = 1; + static constexpr int8_t CURRENT_VERSION = 1; static Result> Create( const std::shared_ptr& pool, const std::shared_ptr& input_stream, int32_t offset); diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h b/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h index f42043da..801c3b04 100644 --- 
a/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h +++ b/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h @@ -51,6 +51,7 @@ class ChunkedDictionary final : public Dictionary { private: Status Flush(); + private: std::shared_ptr pool_; std::shared_ptr key_factory_; int32_t chunk_size_bytes_; diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp index 761b19f0..b938ebd9 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp @@ -38,9 +38,9 @@ Result> RangeBitmap::Create( const auto data_in = std::make_shared(input_stream); PAIMON_ASSIGN_OR_RAISE(int32_t header_length, data_in->ReadValue()); PAIMON_ASSIGN_OR_RAISE(int8_t version, data_in->ReadValue()); - if (version != VERSION) { - return Status::Invalid( - fmt::format("RangeBitmap unsupported version {} (expected {})", version, VERSION)); + if (version != CURRENT_VERSION) { + return Status::Invalid(fmt::format("RangeBitmap unsupported version {} (expected {})", + version, CURRENT_VERSION)); } PAIMON_ASSIGN_OR_RAISE(int32_t rid, data_in->ReadValue()); PAIMON_ASSIGN_OR_RAISE(int32_t cardinality, data_in->ReadValue()); @@ -51,8 +51,8 @@ Result> RangeBitmap::Create( PAIMON_ASSIGN_OR_RAISE(Literal min, key_deserializer(data_in, pool.get())); PAIMON_ASSIGN_OR_RAISE(Literal max, key_deserializer(data_in, pool.get())); PAIMON_ASSIGN_OR_RAISE(int32_t dictionary_length, data_in->ReadValue()); - const auto dictionary_offset = static_cast(offset + sizeof(int32_t) + header_length); - const auto bsi_offset = dictionary_offset + dictionary_length; + int32_t dictionary_offset = static_cast(offset + sizeof(int32_t) + header_length); + int32_t bsi_offset = dictionary_offset + dictionary_length; return std::unique_ptr(new RangeBitmap(pool, rid, cardinality, dictionary_offset, bsi_offset, min, max, shared_key_factory, input_stream)); @@ 
-277,13 +277,13 @@ Result> RangeBitmap::Appender::Serialize() const { header_size += max.IsNull() ? 0 : max_size; // max literal size header_size += sizeof(int32_t); // dictionary length PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR dictionary_bytes, dictionary.Serialize()); - const auto dictionary_length = static_cast(dictionary_bytes->size()); + auto dictionary_length = static_cast(dictionary_bytes->size()); PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR bsi_bytes, bsi.Serialize()); - const auto bsi_length = bsi_bytes->size(); + size_t bsi_length = bsi_bytes->size(); const auto data_output_stream = std::make_shared( MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); data_output_stream->WriteValue(header_size); - data_output_stream->WriteValue(VERSION); + data_output_stream->WriteValue(CURRENT_VERSION); data_output_stream->WriteValue(rid_); data_output_stream->WriteValue(static_cast(bitmaps_.size())); if (!min.IsNull()) { diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap.h b/src/paimon/common/file_index/rangebitmap/range_bitmap.h index 17070ef0..5e93766a 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap.h +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap.h @@ -35,7 +35,7 @@ class MemoryPool; class RangeBitmap { public: - static constexpr int8_t VERSION = 1; + static constexpr int8_t CURRENT_VERSION = 1; static Result> Create( const std::shared_ptr& input_stream, int64_t offset, FieldType field_type, diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h index c1210259..2a557f20 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h @@ -70,6 +70,9 @@ class RangeBitmapFileIndexWriter final : public FileIndexWriter { const std::shared_ptr& key_factory, std::unique_ptr appender); + public: + static constexpr char DEFAULT_CHUNK_SIZE[] = 
"16kb"; + private: std::shared_ptr arrow_type_; std::map options_; From 433af6a12a07c711cb12df7082ced522ae22a4ad Mon Sep 17 00:00:00 2001 From: xiaoheng Date: Sat, 28 Feb 2026 10:05:49 +0800 Subject: [PATCH 5/6] fix: code style --- .../rangebitmap/bit_slice_index_bitmap.cpp | 14 +++++++------- .../rangebitmap/bit_slice_index_bitmap.h | 4 +++- .../common/file_index/rangebitmap/range_bitmap.cpp | 12 ++++++------ .../common/file_index/rangebitmap/range_bitmap.h | 5 +++-- 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp index d1827185..3da6be75 100644 --- a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp +++ b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp @@ -37,7 +37,7 @@ Result> BitSliceIndexBitmap::Create( PAIMON_RETURN_NOT_OK(data_in->Seek(offset)); PAIMON_ASSIGN_OR_RAISE(int32_t header_length, data_in->ReadValue()); PAIMON_ASSIGN_OR_RAISE(int8_t version, data_in->ReadValue()); - if (version != CURRENT_VERSION) { + if (version != BitSliceIndexBitmap::CURRENT_VERSION) { return Status::Invalid("Unknown BitSliceBitmap Version"); } PAIMON_ASSIGN_OR_RAISE(int8_t slices_size, data_in->ReadValue()); @@ -119,7 +119,7 @@ Status BitSliceIndexBitmap::LoadSlices(const int32_t start, const int32_t end) { } auto indexes_stream = std::make_shared(indexes_->data(), indexes_length_); auto data_in = std::make_unique(indexes_stream); - const auto position = static_cast(2 * sizeof(int32_t) * start); + auto position = static_cast(2 * sizeof(int32_t) * start); PAIMON_RETURN_NOT_OK(data_in->Seek(position)); PAIMON_ASSIGN_OR_RAISE(int32_t offset, data_in->ReadValue()); PAIMON_ASSIGN_OR_RAISE(int32_t length, data_in->ReadValue()); @@ -218,7 +218,7 @@ BitSliceIndexBitmap::Appender::Appender(const std::shared_ptr& pool, const int32_t max) : pool_(pool), min_(min), max_(max) { ebm_ = 
RoaringBitmap32{}; - const auto slices_size = std::max(64 - NumberOfLeadingZeros(max), 1); + int32_t slices_size = std::max(64 - NumberOfLeadingZeros(max), 1); slices_.resize(slices_size); } @@ -239,9 +239,9 @@ Status BitSliceIndexBitmap::Appender::Append(const int32_t key, const int32_t va } Result> BitSliceIndexBitmap::Appender::Serialize() const { - const auto indexes_length = static_cast(2 * sizeof(int32_t) * slices_.size()); - const auto ebm_bytes = ebm_.Serialize(pool_.get()); - const auto ebm_length = static_cast(ebm_bytes->size()); + auto indexes_length = static_cast(2 * sizeof(int32_t) * slices_.size()); + PAIMON_UNIQUE_PTR ebm_bytes = ebm_.Serialize(pool_.get()); + auto ebm_length = static_cast(ebm_bytes->size()); int32_t header_size = 0; header_size += sizeof(int8_t); // version header_size += sizeof(int8_t); // slices size @@ -255,7 +255,7 @@ Result> BitSliceIndexBitmap::Appender::Serialize() cons std::vector> indexes_vector{}; for (const auto& slice : slices_) { auto slice_bytes = slice.Serialize(pool_.get()); - const auto length = static_cast(slice_bytes->size()); + auto length = static_cast(slice_bytes->size()); indexes_vector.emplace_back(offset, length); offset += length; slices_bytes_vector.emplace_back(std::move(slice_bytes)); diff --git a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h index 0767b6fc..f2288e12 100644 --- a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h +++ b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h @@ -30,7 +30,6 @@ namespace paimon { class BitSliceIndexBitmap { public: - static constexpr int8_t CURRENT_VERSION = 1; static Result> Create( const std::shared_ptr& pool, const std::shared_ptr& input_stream, int32_t offset); @@ -67,6 +66,9 @@ class BitSliceIndexBitmap { std::vector slices_; }; + public: + static constexpr int8_t CURRENT_VERSION = 1; + private: std::shared_ptr pool_; bool initialized_; diff 
--git a/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp index b938ebd9..105e8176 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp @@ -38,9 +38,9 @@ Result> RangeBitmap::Create( const auto data_in = std::make_shared(input_stream); PAIMON_ASSIGN_OR_RAISE(int32_t header_length, data_in->ReadValue()); PAIMON_ASSIGN_OR_RAISE(int8_t version, data_in->ReadValue()); - if (version != CURRENT_VERSION) { + if (version != RangeBitmap::CURRENT_VERSION) { return Status::Invalid(fmt::format("RangeBitmap unsupported version {} (expected {})", - version, CURRENT_VERSION)); + version, RangeBitmap::CURRENT_VERSION)); } PAIMON_ASSIGN_OR_RAISE(int32_t rid, data_in->ReadValue()); PAIMON_ASSIGN_OR_RAISE(int32_t cardinality, data_in->ReadValue()); @@ -51,7 +51,7 @@ Result> RangeBitmap::Create( PAIMON_ASSIGN_OR_RAISE(Literal min, key_deserializer(data_in, pool.get())); PAIMON_ASSIGN_OR_RAISE(Literal max, key_deserializer(data_in, pool.get())); PAIMON_ASSIGN_OR_RAISE(int32_t dictionary_length, data_in->ReadValue()); - int32_t dictionary_offset = static_cast(offset + sizeof(int32_t) + header_length); + auto dictionary_offset = static_cast(offset + sizeof(int32_t) + header_length); int32_t bsi_offset = dictionary_offset + dictionary_length; return std::unique_ptr(new RangeBitmap(pool, rid, cardinality, dictionary_offset, bsi_offset, min, max, shared_key_factory, @@ -261,8 +261,8 @@ Result> RangeBitmap::Appender::Serialize() const { } PAIMON_ASSIGN_OR_RAISE(LiteralSerDeUtils::Serializer serializer, LiteralSerDeUtils::CreateValueWriter(factory_->GetFieldType())); - auto min = Literal{factory_->GetFieldType()}; - auto max = Literal{factory_->GetFieldType()}; + Literal min{factory_->GetFieldType()}; + Literal max{factory_->GetFieldType()}; if (!bitmaps_.empty()) { min = bitmaps_.begin()->first; max = bitmaps_.rbegin()->first; @@ -283,7 +283,7 
@@ Result> RangeBitmap::Appender::Serialize() const { const auto data_output_stream = std::make_shared( MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); data_output_stream->WriteValue(header_size); - data_output_stream->WriteValue(CURRENT_VERSION); + data_output_stream->WriteValue(RangeBitmap::CURRENT_VERSION); data_output_stream->WriteValue(rid_); data_output_stream->WriteValue(static_cast(bitmaps_.size())); if (!min.IsNull()) { diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap.h b/src/paimon/common/file_index/rangebitmap/range_bitmap.h index 5e93766a..7967c4de 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap.h +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap.h @@ -35,8 +35,6 @@ class MemoryPool; class RangeBitmap { public: - static constexpr int8_t CURRENT_VERSION = 1; - static Result> Create( const std::shared_ptr& input_stream, int64_t offset, FieldType field_type, const std::shared_ptr& pool); @@ -52,6 +50,9 @@ class RangeBitmap { Result IsNull(); Result IsNotNull(); + public: + static constexpr int8_t CURRENT_VERSION = 1; + private: Result Not(RoaringBitmap32& bitmap); From a7a046abb095ad86e975c827c78f11fff043e7d4 Mon Sep 17 00:00:00 2001 From: xiaoheng Date: Sat, 28 Feb 2026 10:12:59 +0800 Subject: [PATCH 6/6] fix: code style --- .../file_index/rangebitmap/bit_slice_index_bitmap.cpp | 5 +++-- src/paimon/common/file_index/rangebitmap/range_bitmap.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp index 3da6be75..716bd61d 100644 --- a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp +++ b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp @@ -37,8 +37,9 @@ Result> BitSliceIndexBitmap::Create( PAIMON_RETURN_NOT_OK(data_in->Seek(offset)); PAIMON_ASSIGN_OR_RAISE(int32_t header_length, data_in->ReadValue()); 
PAIMON_ASSIGN_OR_RAISE(int8_t version, data_in->ReadValue()); - if (version != BitSliceIndexBitmap::CURRENT_VERSION) { - return Status::Invalid("Unknown BitSliceBitmap Version"); + if (version != CURRENT_VERSION) { + return Status::Invalid(fmt::format("Unknown BitSliceBitmap version: {}, Expected: {}", + version, CURRENT_VERSION)); } PAIMON_ASSIGN_OR_RAISE(int8_t slices_size, data_in->ReadValue()); auto slices = std::vector>(); diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp index 105e8176..855d728a 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp @@ -38,9 +38,9 @@ Result> RangeBitmap::Create( const auto data_in = std::make_shared(input_stream); PAIMON_ASSIGN_OR_RAISE(int32_t header_length, data_in->ReadValue()); PAIMON_ASSIGN_OR_RAISE(int8_t version, data_in->ReadValue()); - if (version != RangeBitmap::CURRENT_VERSION) { - return Status::Invalid(fmt::format("RangeBitmap unsupported version {} (expected {})", - version, RangeBitmap::CURRENT_VERSION)); + if (version != CURRENT_VERSION) { + return Status::Invalid(fmt::format("RangeBitmap unsupported version {} (Expected {})", + version, CURRENT_VERSION)); } PAIMON_ASSIGN_OR_RAISE(int32_t rid, data_in->ReadValue()); PAIMON_ASSIGN_OR_RAISE(int32_t cardinality, data_in->ReadValue()); @@ -283,7 +283,7 @@ Result> RangeBitmap::Appender::Serialize() const { const auto data_output_stream = std::make_shared( MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); data_output_stream->WriteValue(header_size); - data_output_stream->WriteValue(RangeBitmap::CURRENT_VERSION); + data_output_stream->WriteValue(CURRENT_VERSION); data_output_stream->WriteValue(rid_); data_output_stream->WriteValue(static_cast(bitmaps_.size())); if (!min.IsNull()) {