diff --git a/src/paimon/CMakeLists.txt b/src/paimon/CMakeLists.txt index d02fa95f..cd836fbb 100644 --- a/src/paimon/CMakeLists.txt +++ b/src/paimon/CMakeLists.txt @@ -363,6 +363,7 @@ if(PAIMON_BUILD_TESTS) common/file_index/bsi/bit_slice_index_roaring_bitmap_test.cpp common/file_index/bloomfilter/bloom_filter_file_index_test.cpp common/file_index/bloomfilter/fast_hash_test.cpp + common/file_index/rangebitmap/range_bitmap_file_index_test.cpp common/global_index/complete_index_score_batch_reader_test.cpp common/global_index/global_index_result_test.cpp common/global_index/global_indexer_factory_test.cpp diff --git a/src/paimon/common/file_index/CMakeLists.txt b/src/paimon/common/file_index/CMakeLists.txt index 085b2e45..2daf7195 100644 --- a/src/paimon/common/file_index/CMakeLists.txt +++ b/src/paimon/common/file_index/CMakeLists.txt @@ -23,7 +23,15 @@ set(PAIMON_FILE_INDEX_SRC bsi/bit_slice_index_roaring_bitmap.cpp bloomfilter/bloom_filter_file_index.cpp bloomfilter/bloom_filter_file_index_factory.cpp - bloomfilter/fast_hash.cpp) + bloomfilter/fast_hash.cpp + rangebitmap/range_bitmap_file_index.cpp + rangebitmap/range_bitmap_file_index_factory.cpp + rangebitmap/range_bitmap.cpp + rangebitmap/bit_slice_index_bitmap.cpp + rangebitmap/dictionary/chunked_dictionary.cpp + rangebitmap/dictionary/fixed_length_chunk.cpp + rangebitmap/dictionary/key_factory.cpp + rangebitmap/utils/literal_serialization_utils.cpp) add_paimon_lib(paimon_file_index SOURCES diff --git a/src/paimon/common/file_index/file_index_format_test.cpp b/src/paimon/common/file_index/file_index_format_test.cpp index 2e000324..3eeca392 100644 --- a/src/paimon/common/file_index/file_index_format_test.cpp +++ b/src/paimon/common/file_index/file_index_format_test.cpp @@ -22,9 +22,11 @@ #include "paimon/common/file_index/bloomfilter/bloom_filter_file_index.h" #include "paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index.h" #include "paimon/common/file_index/empty/empty_file_index_reader.h" +#include 
"paimon/common/file_index/rangebitmap/range_bitmap.h"
+#include "paimon/common/file_index/rangebitmap/range_bitmap_file_index.h"
 #include "paimon/data/timestamp.h"
-#include "paimon/defs.h"
 #include "paimon/file_index/file_index_result.h"
+#include "paimon/file_index/file_indexer_factory.h"
 #include "paimon/fs/local/local_file_system.h"
 #include "paimon/io/byte_array_input_stream.h"
 #include "paimon/memory/memory_pool.h"
@@ -149,6 +151,51 @@ TEST_F(FileIndexFormatTest, TestSimple) {
     }
 }
 
+// index file generated by paimon Java implementation
+// type: int32
+// data: 17,3,5,7,9,null,null,10
+TEST_F(FileIndexFormatTest, TestRangeBitmapCompatibleWithJava) {
+    const auto schema = arrow::schema({arrow::field("data", arrow::int32())});
+    const auto index_file_bytes =
+        std::make_unique>(std::initializer_list{
+            0, 5, 78, 78, 208, 26, 53, 174, 0, 0, 0, 1, 0, 0, 0, 56, 0, 0, 0,
+            1, 0, 4, 100, 97, 116, 97, 0, 0, 0, 1, 0, 12, 114, 97, 110, 103, 101, 45,
+            98, 105, 116, 109, 97, 112, 0, 0, 0, 56, 0, 0, 0, 210, 0, 0, 0, 0, 0,
+            0, 0, 21, 1, 0, 0, 0, 8, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0,
+            17, 0, 0, 0, 66, 0, 0, 0, 13, 1, 0, 0, 0, 1, 0, 0, 0, 4, 0,
+            0, 0, 25, 0, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 5, 0, 0, 0, 20, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0,
+            0, 7, 0, 0, 0, 9, 0, 0, 0, 10, 0, 0, 0, 17, 0, 0, 0, 34, 1,
+            3, 0, 0, 0, 19, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 22, 0, 0,
+            0, 22, 0, 0, 0, 20, 0, 0, 0, 42, 0, 0, 0, 20, 59, 48, 0, 0, 1,
+            0, 0, 5, 0, 2, 0, 0, 0, 4, 0, 7, 0, 0, 0, 58, 48, 0, 0, 1,
+            0, 0, 0, 0, 0, 2, 0, 16, 0, 0, 0, 0, 0, 2, 0, 4, 0, 58, 48,
+            0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 16, 0, 0, 0, 3, 0, 4, 0, 58,
+            48, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 16, 0, 0, 0, 0, 0, 7, 0,
+        });
+    const auto input_stream = std::make_shared(
+        reinterpret_cast(index_file_bytes->data()), index_file_bytes->size());
+    ASSERT_OK_AND_ASSIGN(const auto reader, FileIndexFormat::CreateReader(input_stream, pool_));
+    ASSERT_OK_AND_ASSIGN(const auto index_file_readers,
+                         reader->ReadColumnIndex("data", CreateArrowSchema(schema).get()));
+    ASSERT_EQ(1, index_file_readers.size());
+    auto* range_bitmap_reader =
+        dynamic_cast(index_file_readers[0].get());
+    ASSERT_TRUE(range_bitmap_reader);
+
+    ASSERT_OK_AND_ASSIGN(const auto eq_result, range_bitmap_reader->VisitEqual(Literal(3)));
+    ASSERT_TRUE(eq_result);
+    ASSERT_EQ(eq_result->ToString(), "{1}");
+
+    ASSERT_OK_AND_ASSIGN(const auto lt_result, range_bitmap_reader->VisitLessThan(Literal(10)));
+    ASSERT_TRUE(lt_result);
+    ASSERT_EQ(lt_result->ToString(), "{1,2,3,4}");
+
+    ASSERT_OK_AND_ASSIGN(const auto is_null_result, range_bitmap_reader->VisitIsNull());
+    ASSERT_TRUE(is_null_result);  // fail the test instead of dereferencing a null result
+    ASSERT_EQ(is_null_result->ToString(), "{5,6}");
+}
+
 // NOLINTNEXTLINE(google-readability-function-size)
 TEST_F(FileIndexFormatTest, TestBitmapIndexWithTimestamp) {
     auto schema = arrow::schema({
@@ -816,5 +863,4 @@ TEST_F(FileIndexFormatTest, TestBitmapIndexWithTimestamp) {
     check_nano("ts_nano");
     check_nano("ts_tz_nano");
 }
-
 }  // namespace paimon::test
diff --git a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp
new file mode 100644
index 00000000..716bd61d
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h"
+
+// NOTE(review): the bare `#include` directives below carry no header name in this copy of the
+// patch; restore their targets from the upstream file before applying.
+#include
+
+#include
+#include
+#include
+
+#include "paimon/common/io/memory_segment_output_stream.h"
+#include "paimon/common/memory/memory_segment_utils.h"
+#include "paimon/io/data_input_stream.h"
+#include "paimon/result.h"
+#include "paimon/status.h"
+
+namespace paimon {
+
+/// Opens the bit-slice index serialized at `offset` of `input_stream`.
+///
+/// Only the header is read here (version, slice count, existence-bitmap length and the slice
+/// index table); the bitmaps themselves are read on demand by the accessors.
+///
+/// @param pool          allocator for the index table and, later, the bitmap buffers
+/// @param input_stream  stream holding the serialized index; kept for the lazy reads
+/// @param offset        position of the 4-byte header-length prefix
+/// @return the reader, or `Status::Invalid` for an unknown version or inconsistent header fields
+Result<std::unique_ptr<BitSliceIndexBitmap>> BitSliceIndexBitmap::Create(
+    const std::shared_ptr<MemoryPool>& pool, const std::shared_ptr<InputStream>& input_stream,
+    const int32_t offset) {
+    auto data_in = std::make_unique<DataInputStream>(input_stream);
+    PAIMON_RETURN_NOT_OK(data_in->Seek(offset));
+    PAIMON_ASSIGN_OR_RAISE(int32_t header_length, data_in->ReadValue<int32_t>());
+    PAIMON_ASSIGN_OR_RAISE(int8_t version, data_in->ReadValue<int8_t>());
+    if (version != CURRENT_VERSION) {
+        return Status::Invalid(fmt::format("Unknown BitSliceBitmap version: {}, Expected: {}",
+                                           version, CURRENT_VERSION));
+    }
+    PAIMON_ASSIGN_OR_RAISE(int8_t slices_size, data_in->ReadValue<int8_t>());
+    PAIMON_ASSIGN_OR_RAISE(int32_t ebm_size, data_in->ReadValue<int32_t>());
+    PAIMON_ASSIGN_OR_RAISE(int32_t indexes_length, data_in->ReadValue<int32_t>());
+    // These sizes come straight from the file. A negative value would become a huge allocation,
+    // and an index table shorter than one (offset, length) pair per slice cannot be addressed.
+    constexpr int64_t kIndexEntryBytes = 2 * sizeof(int32_t);
+    if (header_length < 0 || slices_size < 0 || ebm_size < 0 ||
+        indexes_length < kIndexEntryBytes * slices_size) {
+        return Status::Invalid(fmt::format(
+            "Corrupted BitSliceBitmap header: header length {}, slices {}, ebm length {}, "
+            "indexes length {}",
+            header_length, static_cast<int32_t>(slices_size), ebm_size, indexes_length));
+    }
+    auto indexes = Bytes::AllocateBytes(indexes_length, pool.get());
+    PAIMON_RETURN_NOT_OK(data_in->Read(indexes->data(), indexes_length));
+    // The body (existence bitmap, then the slices) follows the length prefix and the header.
+    const int32_t body_offset = offset + static_cast<int32_t>(sizeof(int32_t)) + header_length;
+    return std::make_unique<BitSliceIndexBitmap>(pool, indexes_length, std::move(indexes), ebm_size,
+                                                 slices_size, input_stream, body_offset);
+}
+
+/// Number of zero bits above the highest set bit of `value` viewed as 64 bits; 64 for zero.
+static int32_t NumberOfLeadingZeros(const int64_t value) {
+    if (value == 0) {
+        // __builtin_clzll is undefined for 0.
+        return 64;
+    }
+    return __builtin_clzll(static_cast<uint64_t>(value));
+}
+
+/// Number of zero bits below the lowest set bit of `value` viewed as 64 bits; 64 for zero.
+static int32_t NumberOfTrailingZeros(const int64_t value) {
+    if (value == 0) {
+        // __builtin_ctzll is undefined for 0.
+        return 64;
+    }
+    return __builtin_ctzll(static_cast<uint64_t>(value));
+}
+
+/// Stores the header fields; every bitmap starts out unloaded.
+BitSliceIndexBitmap::BitSliceIndexBitmap(const std::shared_ptr<MemoryPool>& pool,
+                                         const int32_t indexes_length,
+                                         PAIMON_UNIQUE_PTR<Bytes> indexes, const int32_t ebm_length,
+                                         const int32_t slices_size,
+                                         const std::shared_ptr<InputStream>& input_stream,
+                                         const int32_t body_offset)
+    : pool_(pool),
+      initialized_(false),
+      bit_slices_(std::vector<std::optional<RoaringBitmap32>>(slices_size, std::nullopt)),
+      ebm(std::nullopt),
+      input_stream_(input_stream),
+      body_offset_(body_offset),
+      indexes_(std::move(indexes)),
+      ebm_length_(ebm_length),
+      indexes_length_(indexes_length) {}
+
+/// Returns the existence bitmap (every key that was appended), reading and caching it on first
+/// use. The pointer refers to the cached object owned by this reader.
+Result<const RoaringBitmap32*> BitSliceIndexBitmap::GetEmptyBitmap() {
+    if (!ebm.has_value()) {
+        PAIMON_RETURN_NOT_OK(input_stream_->Seek(body_offset_, FS_SEEK_SET));
+        const auto bytes = Bytes::AllocateBytes(ebm_length_, pool_.get());
+        PAIMON_RETURN_NOT_OK(input_stream_->Read(bytes->data(), ebm_length_));
+        RoaringBitmap32 bitmap;
+        PAIMON_RETURN_NOT_OK(bitmap.Deserialize(bytes->data(), ebm_length_));
+        ebm = std::move(bitmap);
+    }
+    return &ebm.value();
+}
+
+/// Returns the bitmap of slice `idx` (the keys whose code has bit `idx` set), reading and caching
+/// it on first use.
+///
+/// @return `Status::Invalid` when `idx` is not a valid slice index.
+Result<const RoaringBitmap32*> BitSliceIndexBitmap::GetSliceBitmap(const int32_t idx) {
+    if (idx < 0 || idx >= static_cast<int32_t>(bit_slices_.size())) {
+        return Status::Invalid(fmt::format("Invalid slice index: {}", idx));
+    }
+    if (!bit_slices_[idx].has_value()) {
+        // Each index-table entry is an (offset, length) pair of int32 values.
+        auto data_in = std::make_unique<DataInputStream>(
+            std::make_shared<ByteArrayInputStream>(indexes_->data(), indexes_length_));
+        const int position = static_cast<int>(2 * sizeof(int32_t) * idx);
+        PAIMON_RETURN_NOT_OK(data_in->Seek(position));
+        PAIMON_ASSIGN_OR_RAISE(int32_t offset, data_in->ReadValue<int32_t>());
+        PAIMON_ASSIGN_OR_RAISE(int32_t length, data_in->ReadValue<int32_t>());
+        // Slice offsets are relative to the end of the existence bitmap.
+        PAIMON_RETURN_NOT_OK(input_stream_->Seek(body_offset_ + ebm_length_ + offset, FS_SEEK_SET));
+        RoaringBitmap32 bitmap;
+        const auto bytes = Bytes::AllocateBytes(length, pool_.get());
+        PAIMON_RETURN_NOT_OK(input_stream_->Read(bytes->data(), length));
+        PAIMON_RETURN_NOT_OK(bitmap.Deserialize(bytes->data(), length));
+        bit_slices_[idx] = std::move(bitmap);
+    }
+    return &bit_slices_[idx].value();
+}
+
+/// Batch load slices [start, end) with a single stream read to avoid unnecessary IO.
+///
+/// Relies on the slices being laid out back to back in index order, which is how
+/// `Appender::Serialize` writes them. Only the first successful batch load does any work; slices
+/// outside that batch are still served lazily by `GetSliceBitmap`.
+///
+/// @return `Status::Invalid` when the range does not fit the slice table; an empty range is OK.
+Status BitSliceIndexBitmap::LoadSlices(const int32_t start, const int32_t end) {
+    if (initialized_) {
+        return Status::OK();
+    }
+    if (start >= end) {
+        // Nothing to load. Without this guard `lengths[start]` below would be out of bounds.
+        return Status::OK();
+    }
+    if (start < 0 || end > static_cast<int32_t>(bit_slices_.size())) {
+        return Status::Invalid(fmt::format("Invalid slice range: [{}, {})", start, end));
+    }
+    auto indexes_stream = std::make_shared<ByteArrayInputStream>(indexes_->data(), indexes_length_);
+    auto data_in = std::make_unique<DataInputStream>(indexes_stream);
+    auto position = static_cast<int32_t>(2 * sizeof(int32_t) * start);
+    PAIMON_RETURN_NOT_OK(data_in->Seek(position));
+    PAIMON_ASSIGN_OR_RAISE(int32_t offset, data_in->ReadValue<int32_t>());
+    PAIMON_ASSIGN_OR_RAISE(int32_t length, data_in->ReadValue<int32_t>());
+    std::vector<int32_t> lengths(end);
+    lengths[start] = length;
+
+    for (int32_t i = start + 1; i < end; ++i) {
+        // Skip the slice offset; only the lengths are needed to split the combined buffer.
+        PAIMON_RETURN_NOT_OK(data_in->ReadValue<int32_t>());
+        PAIMON_ASSIGN_OR_RAISE(int32_t slice_length, data_in->ReadValue<int32_t>());
+        lengths[i] = slice_length;
+        length += slice_length;
+    }
+    PAIMON_RETURN_NOT_OK(input_stream_->Seek(body_offset_ + ebm_length_ + offset, FS_SEEK_SET));
+    const auto bytes = Bytes::AllocateBytes(length, pool_.get());
+    PAIMON_RETURN_NOT_OK(input_stream_->Read(bytes->data(), length));
+    int32_t byte_position = 0;
+    for (int32_t i = start; i < end; ++i) {
+        const int32_t slice_length = lengths[i];
+        RoaringBitmap32 bitmap;
+        PAIMON_RETURN_NOT_OK(bitmap.Deserialize(bytes->data() + byte_position, slice_length));
+        bit_slices_[i] = std::move(bitmap);
+        byte_position += slice_length;
+    }
+    initialized_ = true;
+    return Status::OK();
+}
+
+/// Returns the keys whose code equals `code`.
+///
+/// Starts from the existence bitmap and, per slice, keeps the keys whose bit matches the
+/// corresponding bit of `code`.
+Result<RoaringBitmap32> BitSliceIndexBitmap::Eq(const int32_t code) {
+    PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap32* empty_bitmap, GetEmptyBitmap());
+    auto state = RoaringBitmap32(*empty_bitmap);
+    if (state.IsEmpty()) {
+        return RoaringBitmap32();
+    }
+    const auto slice_count = static_cast<int32_t>(bit_slices_.size());
+    PAIMON_RETURN_NOT_OK(LoadSlices(0, slice_count));
+    for (int32_t i = 0; i < slice_count; i++) {
+        PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap32* slice_bitmap, GetSliceBitmap(i));
+        if (((code >> i) & 1) == 1) {
+            state &= *slice_bitmap;
+        } else {
+            state -= *slice_bitmap;
+        }
+    }
+    return state;
+}
+
+/// Returns the keys whose code is strictly greater than `code`.
+///
+/// A negative `code` selects every non-null key. When every slice bit of `code` is set there is
+/// no larger code and the result is empty.
+Result<RoaringBitmap32> BitSliceIndexBitmap::Gt(const int32_t code) {
+    if (code < 0) {
+        return IsNotNull({});
+    }
+    PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 found_set, IsNotNull({}));
+    if (found_set.IsEmpty()) {
+        return RoaringBitmap32();
+    }
+    auto state = RoaringBitmap32{};
+    auto state_inited = false;
+    // Start at the lowest zero bit of `code`; the run of set bits below it is skipped.
+    const auto start = NumberOfTrailingZeros(~code);
+    const auto slice_count = static_cast<int32_t>(bit_slices_.size());
+    PAIMON_RETURN_NOT_OK(LoadSlices(start, slice_count));
+    for (int i = start; i < slice_count; ++i) {
+        PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap32* slice_ptr, GetSliceBitmap(i));
+        if (!state_inited) {
+            // The first visited slice seeds the state.
+            state = *slice_ptr;
+            state_inited = true;
+            continue;
+        }
+        if (((code >> i) & 1) == 1) {
+            state &= *slice_ptr;
+        } else {
+            state |= *slice_ptr;
+        }
+    }
+    if (!state_inited) {
+        return RoaringBitmap32();
+    }
+    state &= found_set;
+    return state;
+}
+
+/// Returns the keys whose code is greater than or equal to `code`.
+Result<RoaringBitmap32> BitSliceIndexBitmap::Gte(const int32_t code) {
+    if (code <= 0) {
+        // Same result as Gt(code - 1), without overflowing when `code` is INT32_MIN.
+        return IsNotNull({});
+    }
+    return Gt(code - 1);
+}
+
+/// Returns the existence bitmap, intersected with `found_set` unless `found_set` is empty.
+///
+/// The result is always a copy, so the cached existence bitmap is never modified.
+Result<RoaringBitmap32> BitSliceIndexBitmap::IsNotNull(const RoaringBitmap32& found_set) {
+    PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap32* existence, GetEmptyBitmap());
+    RoaringBitmap32 result(*existence);
+    if (!found_set.IsEmpty()) {
+        result &= found_set;
+    }
+    return result;
+}
+
+/// Prepares one slice per bit needed to represent `max` (at least one slice).
+BitSliceIndexBitmap::Appender::Appender(const std::shared_ptr<MemoryPool>& pool, const int32_t min,
+                                        const int32_t max)
+    : pool_(pool), min_(min), max_(max) {
+    ebm_ = RoaringBitmap32{};
+    int32_t slices_size = std::max(64 - NumberOfLeadingZeros(max), 1);
+    slices_.resize(slices_size);
+}
+
+/// Records that row `key` holds `value`.
+///
+/// @return `Status::Invalid` for a negative key, a value outside [min, max] or a negative value.
+Status BitSliceIndexBitmap::Appender::Append(const int32_t key, const int32_t value) {
+    if (key < 0) {
+        return Status::Invalid(fmt::format("Invalid key: {}", key));
+    }
+    if (value < min_ || value > max_) {
+        return Status::Invalid(fmt::format("value not in range [{}, {}]", min_, max_));
+    }
+    if (value < 0) {
+        // Only reachable when min_ is negative: the sign bit has no slice to land in.
+        return Status::Invalid(fmt::format("Invalid value: {}", value));
+    }
+    int bits = value;
+    while (bits != 0) {
+        slices_[NumberOfTrailingZeros(bits)].Add(key);
+        bits &= (bits - 1);  // clear the lowest set bit
+    }
+    ebm_.Add(key);
+    return Status::OK();
+}
+
+/// Serializes the index.
+///
+/// Layout: header length (int32), version (int8), slice count (int8), existence-bitmap length
+/// (int32), index-table length (int32), one (offset, length) int32 pair per slice, the existence
+/// bitmap, then the slices back to back.
+Result<PAIMON_UNIQUE_PTR<Bytes>> BitSliceIndexBitmap::Appender::Serialize() const {
+    auto indexes_length = static_cast<int32_t>(2 * sizeof(int32_t) * slices_.size());
+    PAIMON_UNIQUE_PTR<Bytes> ebm_bytes = ebm_.Serialize(pool_.get());
+    auto ebm_length = static_cast<int32_t>(ebm_bytes->size());
+    int32_t header_size = 0;
+    header_size += sizeof(int8_t);   // version
+    header_size += sizeof(int8_t);   // slices size
+    header_size += sizeof(int32_t);  // ebm length
+    header_size += sizeof(int32_t);  // indexes length
+    header_size += indexes_length;
+    int32_t offset = 0;
+    auto data_output_stream = std::make_unique<MemorySegmentOutputStream>(
+        MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_);
+    std::vector<PAIMON_UNIQUE_PTR<Bytes>> slices_bytes_vector{};
+    std::vector<std::pair<int32_t, int32_t>> indexes_vector{};
+    slices_bytes_vector.reserve(slices_.size());
+    indexes_vector.reserve(slices_.size());
+    for (const auto& slice : slices_) {
+        auto slice_bytes = slice.Serialize(pool_.get());
+        auto length = static_cast<int32_t>(slice_bytes->size());
+        indexes_vector.emplace_back(offset, length);
+        offset += length;
+        slices_bytes_vector.emplace_back(std::move(slice_bytes));
+    }
+    data_output_stream->WriteValue(header_size);
+    data_output_stream->WriteValue(CURRENT_VERSION);
+    data_output_stream->WriteValue(static_cast<int8_t>(slices_.size()));
+    data_output_stream->WriteValue(ebm_length);
+    data_output_stream->WriteValue(indexes_length);
+    for (const auto& [slice_offset, length] : indexes_vector) {
+        data_output_stream->WriteValue(slice_offset);
+        data_output_stream->WriteValue(length);
+    }
+    data_output_stream->Write(ebm_bytes->data(), ebm_length);
+    for (const auto& slice_bytes : slices_bytes_vector) {
+        data_output_stream->Write(slice_bytes->data(), slice_bytes->size());
+    }
+    return MemorySegmentUtils::CopyToBytes(data_output_stream->Segments(), 0,
+                                           static_cast<int32_t>(data_output_stream->CurrentSize()),
+                                           pool_.get());
+}
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h
new file mode 100644
index 00000000..f2288e12
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "paimon/io/byte_array_input_stream.h"
+#include "paimon/memory/bytes.h"
+#include "paimon/result.h"
+#include "paimon/status.h"
+#include "paimon/utils/roaring_bitmap32.h"
+
+namespace paimon {
+/** Bit-sliced index over int32 codes: one bitmap per code bit plus an existence bitmap. */
+class BitSliceIndexBitmap {
+ public:
+    static Result> Create(
+        const std::shared_ptr& pool, const std::shared_ptr& input_stream,
+        int32_t offset);
+    /** Stores the header fields; `body_offset` is where the existence bitmap bytes start. */
+    BitSliceIndexBitmap(const std::shared_ptr& pool, int32_t indexes_length,
+                        PAIMON_UNIQUE_PTR indexes, int32_t ebm_length, int32_t slices_size,
+                        const std::shared_ptr& input_stream, int32_t body_offset);
+    /** Existence bitmap (every appended key); read from the stream on first use, then cached. */
+    Result GetEmptyBitmap();
+    /** Bitmap of slice `idx` (keys whose code has bit `idx` set); read on first use, cached. */
+    Result GetSliceBitmap(int32_t idx);
+    /** Reads slices [start, end) with one stream read; does nothing after a completed load. */
+    Status LoadSlices(int32_t start, int32_t end);
+    /** Keys whose code equals `code`. */
+    Result Eq(int32_t code);
+    /** Keys whose code is greater than `code`; a negative `code` selects every non-null key. */
+    Result Gt(int32_t code);
+    /** Keys whose code is greater than or equal to `code`; implemented as Gt(code - 1). */
+    Result Gte(int32_t code);
+    /** Existence bitmap, intersected with `found_set` unless `found_set` is empty. */
+    Result IsNotNull(const RoaringBitmap32& found_set);
+    /** Write side: collects (key, value) pairs and serializes them in the format Create reads. */
+    class Appender {
+     public:
+        Appender(const std::shared_ptr& pool, int32_t min, int32_t max);
+        Status Append(int32_t key, int32_t value);  // rejects key < 0 and values outside [min, max]
+        Result> Serialize() const;
+
+     private:
+        std::shared_ptr pool_;
+        int32_t min_;
+        int32_t max_;
+        RoaringBitmap32 ebm_;  // every key passed to Append
+        std::vector slices_;  // slices_[b] holds the keys whose value has bit b set
+    };
+
+ public:
+    static constexpr int8_t CURRENT_VERSION = 1;  // written by Serialize, required by Create
+
+ private:
+    std::shared_ptr pool_;
+    bool initialized_;  // set once LoadSlices has completed a batch load
+    std::vector> bit_slices_;  // lazily loaded, one entry per slice
+    std::optional ebm;  // lazily loaded existence bitmap
+    std::shared_ptr input_stream_;
+    int32_t body_offset_;  // stream position of the existence bitmap
+    PAIMON_UNIQUE_PTR indexes_;  // slice index table: one (offset, length) int32 pair per slice
+    int32_t ebm_length_;
+    int32_t indexes_length_;
+};
+
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/chunk.h b/src/paimon/common/file_index/rangebitmap/dictionary/chunk.h
new file mode 100644
index 00000000..055232ca
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/dictionary/chunk.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+#include
+
+#include "paimon/memory/bytes.h"
+#include "paimon/predicate/literal.h"
+#include "paimon/result.h"
+
+namespace paimon {
+/** One dictionary chunk: a representative key with its code, followed by Size() further keys. */
+class Chunk {
+ public:
+    virtual ~Chunk() = default;
+    /** Tries to add `key`; ChunkedDictionary::Appender starts a new chunk when this is false. */
+    virtual Result TryAdd(const Literal& key) = 0;
+    /** Returns the code of `key`, or the negative value -(insertion code + 1) when absent. */
+    virtual Result Find(const Literal& key) {
+        PAIMON_ASSIGN_OR_RAISE(int32_t cmp_with_key, Key().CompareTo(key));
+        if (cmp_with_key == 0) {
+            return Code();
+        }
+        int32_t low = 0;
+        int32_t high = Size() - 1;
+        const int32_t base = Code() + 1;  // code of the key stored at index 0
+        while (low <= high) {  // binary search over the keys returned by GetKey
+            const int32_t mid = low + (high - low) / 2;
+            PAIMON_ASSIGN_OR_RAISE(Literal key_at_mid, GetKey(mid));
+            PAIMON_ASSIGN_OR_RAISE(int32_t cmp, key_at_mid.CompareTo(key));
+            if (cmp < 0) {
+                low = mid + 1;
+            } else if (cmp > 0) {
+                high = mid - 1;
+            } else {
+                return base + mid;
+            }
+        }
+        return -(base + low + 1);  // not found; `base + low` is where the key would be inserted
+    }
+    /** Returns the key for `code`; Status::Invalid when `code` is outside this chunk. */
+    virtual Result Find(const int32_t code) {
+        const auto current = Code();
+        if (current == code) {
+            return Key();
+        }
+        const auto index = code - current - 1;  // index among the keys after the representative
+        if (index < 0 || index >= Size()) {
+            return Status::Invalid(fmt::format("Invalid Code {}", code));
+        }
+        return GetKey(index);
+    }
+    /** Representative key of the chunk; Find(code) returns it for Code(). */
+    virtual const Literal& Key() const = 0;
+    /** Code of the representative key. */
+    virtual int32_t Code() const = 0;
+    /** Offset set through SetOffset. */
+    virtual int32_t Offset() const = 0;
+    /** ChunkedDictionary::Appender passes the running total of serialized key bytes here. */
+    virtual void SetOffset(int32_t offset) = 0;
+    /** Number of keys reachable through GetKey, i.e. excluding the representative key. */
+    virtual int32_t Size() const = 0;
+    /** Serialized chunk record; ChunkedDictionary::Appender writes it to its chunks section. */
+    virtual Result> SerializeChunk() const = 0;
+    /** Serialized keys; ChunkedDictionary::Appender writes them to its keys section. */
+    virtual Result> SerializeKeys() const = 0;
+
+ protected:
+    virtual Result GetKey(int32_t index) = 0;  // key at `index`, for 0 <= index < Size()
+};
+
+}  // namespace paimon
diff --git
a/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.cpp b/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.cpp
new file mode 100644
index 00000000..5439a7cc
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h"
+
+// NOTE(review): the bare `#include` directive below carries no header name in this copy of the
+// patch; restore its target from the upstream file before applying.
+#include
+
+#include "fmt/format.h"
+#include "paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h"
+#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h"
+#include "paimon/common/memory/memory_segment_utils.h"
+#include "paimon/fs/file_system.h"
+#include "paimon/io/byte_array_input_stream.h"
+#include "paimon/io/data_input_stream.h"
+#include "paimon/memory/bytes.h"
+#include "paimon/result.h"
+#include "paimon/status.h"
+
+namespace paimon {
+
+/// Looks up the code of `key`.
+///
+/// Firstly do a binary search on the chunk representative keys. If one matches, return its code;
+/// otherwise do a binary search inside the last chunk whose representative key is smaller.
+///
+/// @return the code, or a negative value when `key` is absent (-1 when it sorts before every
+///         chunk, otherwise whatever `Chunk::Find` reports).
+Result<int32_t> ChunkedDictionary::Find(const Literal& key) {
+    int32_t low = 0;
+    int32_t high = size_ - 1;
+    while (low <= high) {
+        const int32_t mid = low + (high - low) / 2;
+        PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<Chunk> chunk, GetChunk(mid));
+        PAIMON_ASSIGN_OR_RAISE(const int32_t result, chunk->Key().CompareTo(key));
+        if (result > 0) {
+            high = mid - 1;
+        } else if (result < 0) {
+            low = mid + 1;
+        } else {
+            return chunk->Code();
+        }
+    }
+    if (low == 0) {
+        // Smaller than the first representative key (or the dictionary is empty).
+        return -(low + 1);
+    }
+    PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<Chunk> prev_chunk, GetChunk(low - 1));
+    return prev_chunk->Find(key);
+}
+
+/// Looks up the key stored for `code`.
+///
+/// @return the key, or `Status::Invalid` when `code` is negative or not covered by any chunk.
+Result<Literal> ChunkedDictionary::Find(const int32_t code) {
+    if (code < 0) {
+        return Status::Invalid(fmt::format("Invalid code: {}", code));
+    }
+    int32_t low = 0;
+    int32_t high = size_ - 1;
+
+    while (low <= high) {
+        const int32_t mid = low + (high - low) / 2;
+        PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<Chunk> chunk, GetChunk(mid));
+
+        auto const chunk_code = chunk->Code();
+        if (chunk_code > code) {
+            high = mid - 1;
+        } else if (chunk_code < code) {
+            low = mid + 1;
+        } else {
+            return {chunk->Key()};
+        }
+    }
+    if (low == 0) {
+        // No chunk starts at or below `code` (for example an empty dictionary). Report the code
+        // itself rather than falling through to GetChunk(-1).
+        return Status::Invalid(fmt::format("Invalid code: {}", code));
+    }
+    PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<Chunk> prev_chunk, GetChunk(low - 1));
+    return prev_chunk->Find(code);
+}
+
+/// MMap a chunk and store it into cache, keys in the chunk will be lazy loaded later.
+///
+/// The offsets and chunks sections are read from the stream on the first call.
+///
+/// @return the cached chunk, or `Status::Invalid` when `index` is outside [0, size).
+Result<std::shared_ptr<Chunk>> ChunkedDictionary::GetChunk(int32_t index) {
+    if (index < 0 || index >= size_) {
+        return Status::Invalid(fmt::format("Invalid chunk index: {}", index));
+    }
+    if (offsets_bytes_ == nullptr || chunks_bytes_ == nullptr) {
+        PAIMON_RETURN_NOT_OK(input_stream_->Seek(body_offset_, FS_SEEK_SET));
+        auto offsets = Bytes::AllocateBytes(offsets_length_, pool_.get());
+        PAIMON_RETURN_NOT_OK(input_stream_->Read(offsets->data(), offsets_length_));
+        offsets_bytes_ = std::move(offsets);
+        // NOTE(review): chunks_bytes_ is filled here but never read again in this file, since
+        // MmapChunk below goes back to input_stream_. Confirm whether this read is still needed.
+        auto chunks = Bytes::AllocateBytes(chunks_length_, pool_.get());
+        PAIMON_RETURN_NOT_OK(input_stream_->Read(chunks->data(), chunks_length_));
+        chunks_bytes_ = std::move(chunks);
+    }
+    if (chunks_cache_[index]) {
+        return chunks_cache_[index];
+    }
+    // The offsets section holds one int32 per chunk, relative to the start of the chunks section.
+    auto data_in = std::make_unique<DataInputStream>(
+        std::make_shared<ByteArrayInputStream>(offsets_bytes_->data(), offsets_length_));
+    PAIMON_RETURN_NOT_OK(data_in->Seek(sizeof(int32_t) * index));
+    PAIMON_ASSIGN_OR_RAISE(int32_t chunk_offset, data_in->ReadValue<int32_t>());
+    PAIMON_ASSIGN_OR_RAISE(
+        std::unique_ptr<Chunk> chunk,
+        factory_->MmapChunk(pool_, input_stream_, body_offset_ + offsets_length_ + chunk_offset,
+                            body_offset_ + chunks_length_ + offsets_length_));
+    chunks_cache_[index] = std::move(chunk);
+    return chunks_cache_[index];
+}
+
+/// Creates an empty appender whose chunks are created with `chunk_size_bytes`.
+ChunkedDictionary::Appender::Appender(const std::shared_ptr<MemoryPool>& pool,
+                                      const std::shared_ptr<KeyFactory>& key_factory,
+                                      const int32_t chunk_size_bytes)
+    : pool_(pool),
+      key_factory_(key_factory),
+      chunk_size_bytes_(chunk_size_bytes),
+      chunk_(nullptr),
+      size_(0),
+      key_offset_(0),
+      chunks_offset_(0) {
+    chunks_output_ = std::make_unique<MemorySegmentOutputStream>(
+        MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_);
+    keys_output_ = std::make_unique<MemorySegmentOutputStream>(
+        MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_);
+    offsets_output_ = std::make_unique<MemorySegmentOutputStream>(
+        MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_);
+}
+
+/// Appends `key` with `code`. Keys and codes must both be strictly increasing across calls.
+///
+/// The key goes into the current chunk when `Chunk::TryAdd` accepts it; otherwise the current
+/// chunk is flushed and a new one is started with `key` as its representative key.
+///
+/// @return `Status::Invalid` for a null key or an out-of-order key or code.
+Status ChunkedDictionary::Appender::AppendSorted(const Literal& key, int32_t code) {
+    if (key.IsNull()) {
+        return Status::Invalid("key should not be null");
+    }
+    if (last_key_.has_value()) {
+        PAIMON_ASSIGN_OR_RAISE(int32_t compare_result, last_key_->CompareTo(key));
+        if (compare_result >= 0) {
+            return Status::Invalid("key must be in sorted order");
+        }
+    }
+    if (last_code_.has_value() && code <= *last_code_) {
+        return Status::Invalid("code must be in sorted order");
+    }
+    last_key_ = key;
+    last_code_ = code;
+    if (chunk_ != nullptr) {
+        PAIMON_ASSIGN_OR_RAISE(bool success, chunk_->TryAdd(key));
+        if (success) {
+            return Status::OK();
+        }
+        // The current chunk rejected the key: write it out before starting the next one.
+        PAIMON_RETURN_NOT_OK(Flush());
+    }
+    PAIMON_ASSIGN_OR_RAISE(chunk_, key_factory_->CreateChunk(pool_, key, code, chunk_size_bytes_));
+    return Status::OK();
+}
+
+/// Flushes the pending chunk and serializes the dictionary.
+///
+/// Layout: header length (int32), version (int8), chunk count (int32), offsets length (int32),
+/// chunks length (int32), then the offsets, chunks and keys sections.
+Result<PAIMON_UNIQUE_PTR<Bytes>> ChunkedDictionary::Appender::Serialize() {
+    if (chunk_ != nullptr) {
+        PAIMON_RETURN_NOT_OK(Flush());
+    }
+    int32_t header_size = 0;
+    header_size += sizeof(int8_t);   // version
+    header_size += sizeof(int32_t);  // size
+    header_size += sizeof(int32_t);  // offsets length
+    header_size += sizeof(int32_t);  // chunks length
+    auto data_out = std::make_unique<MemorySegmentOutputStream>(
+        MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_);
+    data_out->WriteValue(header_size);
+    data_out->WriteValue(CURRENT_VERSION);
+    data_out->WriteValue(size_);
+    data_out->WriteValue(static_cast<int32_t>(offsets_output_->CurrentSize()));
+    data_out->WriteValue(static_cast<int32_t>(chunks_output_->CurrentSize()));
+    PAIMON_RETURN_NOT_OK(MemorySegmentUtils::CopyToStream(
+        offsets_output_->Segments(), 0, static_cast<int32_t>(offsets_output_->CurrentSize()),
+        data_out.get()));
+    PAIMON_RETURN_NOT_OK(MemorySegmentUtils::CopyToStream(
+        chunks_output_->Segments(), 0, static_cast<int32_t>(chunks_output_->CurrentSize()),
+        data_out.get()));
+    PAIMON_RETURN_NOT_OK(MemorySegmentUtils::CopyToStream(
+        keys_output_->Segments(), 0, static_cast<int32_t>(keys_output_->CurrentSize()),
+        data_out.get()));
+    return MemorySegmentUtils::CopyToBytes(
+        data_out->Segments(), 0, static_cast<int32_t>(data_out->CurrentSize()), pool_.get());
+}
+
+/// Writes the pending chunk to the offsets, chunks and keys buffers and clears it.
+/// Callers must make sure `chunk_` is not null.
+Status ChunkedDictionary::Appender::Flush() {
+    // The chunk records where its keys start inside the keys section.
+    chunk_->SetOffset(key_offset_);
+    PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR<Bytes> chunks_bytes, chunk_->SerializeChunk());
+    PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR<Bytes> keys_bytes, chunk_->SerializeKeys());
+    offsets_output_->WriteValue(chunks_offset_);
+    chunks_offset_ += static_cast<int32_t>(chunks_bytes->size());
+    key_offset_ += static_cast<int32_t>(keys_bytes->size());
+    chunks_output_->Write(chunks_bytes->data(), chunks_bytes->size());
+    keys_output_->Write(keys_bytes->data(), keys_bytes->size());
+    size_ += 1;
+    chunk_ = nullptr;
+    return Status::OK();
+}
+
+/// Opens the dictionary serialized at `offset` of `input_stream`.
+///
+/// Only the header is read here; offsets and chunks are loaded by `GetChunk`.
+///
+/// @return the dictionary, or `Status::Invalid` for an unknown version or inconsistent header
+///         fields; errors from `KeyFactory::Create` are passed through.
+Result<std::unique_ptr<ChunkedDictionary>> ChunkedDictionary::Create(
+    const std::shared_ptr<MemoryPool>& pool, const FieldType field_type,
+    const std::shared_ptr<InputStream>& input_stream, const int64_t offset) {
+    auto data_in = std::make_unique<DataInputStream>(input_stream);
+    PAIMON_RETURN_NOT_OK(data_in->Seek(offset));
+    PAIMON_ASSIGN_OR_RAISE(int32_t header_length, data_in->ReadValue<int32_t>());
+    PAIMON_ASSIGN_OR_RAISE(int8_t version, data_in->ReadValue<int8_t>());
+    if (version != CURRENT_VERSION) {
+        return Status::Invalid("Unknown version of ChunkedDictionary");
+    }
+    PAIMON_ASSIGN_OR_RAISE(int32_t size, data_in->ReadValue<int32_t>());
+    PAIMON_ASSIGN_OR_RAISE(int32_t offsets_length, data_in->ReadValue<int32_t>());
+    PAIMON_ASSIGN_OR_RAISE(int32_t chunks_length, data_in->ReadValue<int32_t>());
+    // These sizes come straight from the file. A negative value would become a huge allocation,
+    // and the offsets section must hold one int32 per chunk for GetChunk to address it.
+    if (header_length < 0 || size < 0 || chunks_length < 0 ||
+        offsets_length < static_cast<int64_t>(sizeof(int32_t)) * size) {
+        return Status::Invalid(fmt::format(
+            "Corrupted ChunkedDictionary header: header length {}, size {}, offsets length {}, "
+            "chunks length {}",
+            header_length, size, offsets_length, chunks_length));
+    }
+    PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<KeyFactory> factory_shared,
+                           KeyFactory::Create(field_type));
+    // The body starts after the 4-byte length prefix and the header.
+    const int64_t body_offset = offset + header_length + static_cast<int64_t>(sizeof(int32_t));
+    return std::make_unique<ChunkedDictionary>(pool, input_stream, factory_shared, size,
+                                               offsets_length, chunks_length, body_offset);
+}
+
+/// Stores the header fields; offsets, chunks and the chunk cache are filled lazily.
+ChunkedDictionary::ChunkedDictionary(const std::shared_ptr<MemoryPool>& pool,
+                                     const std::shared_ptr<InputStream>& input_stream,
+                                     const std::shared_ptr<KeyFactory>& factory, const int32_t size,
+                                     const int32_t offsets_length, const int32_t chunks_length,
+                                     const int64_t body_offset)
+    : pool_(pool),
+      factory_(factory),
+      input_stream_(input_stream),
+      size_(size),
+      offsets_length_(offsets_length),
+      chunks_length_(chunks_length),
+      body_offset_(body_offset),
+      offsets_bytes_(nullptr),
+      chunks_bytes_(nullptr),
+      chunks_cache_(std::vector<std::shared_ptr<Chunk>>(size)) {}
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h b/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h
new file mode 100644
index 00000000..801c3b04
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "paimon/common/file_index/rangebitmap/dictionary/chunk.h"
+#include "paimon/common/file_index/rangebitmap/dictionary/dictionary.h"
+#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h"
+#include "paimon/fs/file_system.h"
+#include "paimon/result.h"
+
+namespace paimon {
+
+class InputStream;
+class MemoryPool;
+/** Dictionary stored as sorted chunks; each chunk is found through its representative key. */
+class ChunkedDictionary final : public Dictionary {
+ public:
+    static constexpr int8_t CURRENT_VERSION = 1;  // written by Serialize, required by Create
+    /** Code of `key`, or a negative value when the key is not in the dictionary. */
+    Result Find(const Literal& key) override;
+    /** Key stored for `code`; Status::Invalid for a negative or unknown code. */
+    Result Find(int32_t code) override;
+    /** Chunk at `index`, created on first access and cached afterwards. */
+    Result> GetChunk(int32_t index);
+    /** Write side: takes keys in sorted order and serializes them in the format Create reads. */
+    class Appender final : public Dictionary::Appender {
+     public:
+        Appender(const std::shared_ptr& pool,
+                 const std::shared_ptr& key_factory, int32_t chunk_size_bytes);
+        Status AppendSorted(const Literal& key, int32_t code) override;
+        Result> Serialize() override;
+
+     private:
+        Status Flush();  // writes the pending chunk to the three buffers and clears it
+
+     private:
+        std::shared_ptr pool_;
+        std::shared_ptr key_factory_;
+        int32_t chunk_size_bytes_;
+        std::optional last_key_;  // previous key, used for the ordering check
+        std::optional last_code_;  // previous code, used for the ordering check
+        std::unique_ptr chunk_;  // chunk being filled; null when nothing is pending
+        int32_t size_;  // number of chunks flushed so far
+        int32_t key_offset_;  // bytes written to keys_output_ so far
+        int32_t chunks_offset_;  // bytes written to chunks_output_ so far
+        std::unique_ptr chunks_output_;
+        std::unique_ptr keys_output_;
+        std::unique_ptr offsets_output_;
+    };
+    /** Reads the dictionary header at `offset`; offsets and chunks are loaded by GetChunk. */
+    static Result> Create(
+        const std::shared_ptr& pool, FieldType field_type,
+        const std::shared_ptr& input_stream, int64_t offset);
+    /** Stores the header fields; `body_offset` is where the offsets section starts. */
+    ChunkedDictionary(const std::shared_ptr& pool,
+                      const std::shared_ptr& input_stream,
+                      const std::shared_ptr& factory, int32_t size,
+                      int32_t offsets_length, int32_t chunks_length, int64_t body_offset);
+
+ private:
+    std::shared_ptr pool_;
+    std::shared_ptr factory_;
+
+    std::shared_ptr input_stream_;
+    int32_t size_;            // number of chunks
+    int32_t offsets_length_;  // bytes length of offsets
+    int32_t chunks_length_;   // bytes length of chunks
+    int64_t body_offset_;     // where offsets start
+
+    // for lazy loading
+    PAIMON_UNIQUE_PTR offsets_bytes_;
+    PAIMON_UNIQUE_PTR chunks_bytes_;
+
+    // mmap chunks cache
+    std::vector> chunks_cache_;
+};
+
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/dictionary.h b/src/paimon/common/file_index/rangebitmap/dictionary/dictionary.h
new file mode 100644
index 00000000..5f9a732b
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/dictionary/dictionary.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+#include "paimon/memory/bytes.h"
+#include "paimon/predicate/literal.h"
+#include "paimon/result.h"
+
+namespace paimon {
+
+/// Abstract mapping between keys (Literal) and integer codes, with a nested interface for
+/// building the serialized form.
+class Dictionary {
+ public:
+    virtual ~Dictionary() = default;
+
+    /// Looks up `key`.
+    /// NOTE(review): RangeBitmap treats a negative result as "key not present" and derives a
+    /// lower bound from it with `-code - 1` -- confirm that every implementation honours this.
+    virtual Result Find(const Literal& key) = 0;
+
+    /// Looks up by `code`; presumably returns the key stored under that code.
+    virtual Result Find(int32_t code) = 0;
+
+    /// Builder for the serialized dictionary.
+    class Appender {
+     public:
+        virtual ~Appender() = default;
+
+        /// Adds `key` with its `code`; the name indicates keys must arrive in sorted order.
+        virtual Status AppendSorted(const Literal& key, int32_t code) = 0;
+
+        /// Produces the serialized bytes of everything appended so far.
+        virtual Result> Serialize() = 0;
+    };
+};
+
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.cpp b/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.cpp
new file mode 100644
index 00000000..cb774771
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h"
+
+#include
+#include
+
+#include "fmt/format.h"
+#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h"
+#include "paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h"
+#include "paimon/common/io/memory_segment_output_stream.h"
+#include "paimon/common/memory/memory_segment_utils.h"
+#include "paimon/common/utils/field_type_utils.h"
+#include "paimon/io/byte_array_input_stream.h"
+#include "paimon/memory/bytes.h"
+#include "paimon/result.h"
+#include "paimon/status.h"
+
+namespace paimon {
+
+// Write path: appends `key` to the key buffer. Returns false (and stores nothing) when fewer than
+// fixed_length_ bytes remain of the limit given to the write-path constructor.
+Result FixedLengthChunk::TryAdd(const Literal& key) {
+    if (keys_stream_out_ == nullptr) {
+        // Create the serializer first: if that fails, keys_stream_out_ stays null and the next
+        // call retries the whole initialisation instead of invoking an unset serializer_.
+        PAIMON_ASSIGN_OR_RAISE(serializer_,
+                               LiteralSerDeUtils::CreateValueWriter(factory_->GetFieldType()));
+        keys_stream_out_ = std::make_shared(
+            MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_);
+    }
+    if (fixed_length_ > remaining_keys_size_) {
+        return false;
+    }
+    PAIMON_RETURN_NOT_OK(serializer_(keys_stream_out_, key));
+    remaining_keys_size_ -= fixed_length_;
+    size_ += 1;
+    return true;
+}
+
+// Read path: returns the key stored at position `index`. The key bytes are read from
+// input_stream_ on the first call and decoded from memory afterwards.
+Result FixedLengthChunk::GetKey(const int32_t index) {
+    if (index < 0 || index >= size_) {
+        return Status::Invalid("Index out of bounds");
+    }
+    // Guard on keys_stream_in_, the last member set by the load below. Guarding on keys_ (set
+    // before the fallible Read) would make a call after a failed load skip the load and
+    // dereference a null keys_stream_in_.
+    if (keys_stream_in_ == nullptr) {
+        PAIMON_RETURN_NOT_OK(input_stream_->Seek(keys_base_offset_ + offset_, FS_SEEK_SET));
+        keys_ = Bytes::AllocateBytes(keys_length_, pool_.get());
+        PAIMON_RETURN_NOT_OK(input_stream_->Read(keys_->data(), keys_length_));
+        PAIMON_ASSIGN_OR_RAISE(deserializer_,
+                               LiteralSerDeUtils::CreateValueReader(factory_->GetFieldType()));
+        keys_stream_in_ = std::make_shared(
+            std::make_shared(keys_->data(), keys_length_));
+    }
+    PAIMON_RETURN_NOT_OK(keys_stream_in_->Seek(index * fixed_length_));
+    return deserializer_(keys_stream_in_, pool_.get());
+}
+
+// Serializes the chunk header: version, representative key, code, offset, key count, key bytes
+// length and fixed key length.
+Result> FixedLengthChunk::SerializeChunk() const {
+    // serializer_ and keys_stream_out_ are only set by TryAdd(). Use a local serializer and a
+    // null-safe length so a chunk holding nothing but its representative key can be serialized.
+    PAIMON_ASSIGN_OR_RAISE(LiteralSerDeUtils::Serializer serializer,
+                           LiteralSerDeUtils::CreateValueWriter(factory_->GetFieldType()));
+    const int32_t keys_length =
+        keys_stream_out_ == nullptr ? 0 : static_cast<int32_t>(keys_stream_out_->CurrentSize());
+    const auto data_out = std::make_shared(
+        MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_);
+    data_out->WriteValue(CURRENT_VERSION);
+    PAIMON_RETURN_NOT_OK(serializer(data_out, key_));
+    data_out->WriteValue(code_);
+    data_out->WriteValue(offset_);
+    data_out->WriteValue(size_);
+    data_out->WriteValue(keys_length);
+    data_out->WriteValue(fixed_length_);
+    return MemorySegmentUtils::CopyToBytes(
+        data_out->Segments(), 0, static_cast(data_out->CurrentSize()), pool_.get());
+}
+
+// Returns a copy of the key bytes buffered by TryAdd().
+Result> FixedLengthChunk::SerializeKeys() const {
+    if (keys_stream_out_ == nullptr) {
+        // TryAdd() never ran, so there are no key bytes to copy.
+        // NOTE(review): assumes Bytes::AllocateBytes accepts a zero length -- confirm.
+        return Bytes::AllocateBytes(0, pool_.get());
+    }
+    return MemorySegmentUtils::CopyToBytes(keys_stream_out_->Segments(), 0,
+                                           static_cast(keys_stream_out_->CurrentSize()),
+                                           pool_.get());
+}
+
+// Read path
+FixedLengthChunk::FixedLengthChunk(const std::shared_ptr& pool, Literal key,
+                                   const int32_t code, const int32_t offset, const int32_t size,
+                                   const std::shared_ptr& factory,
+                                   const std::shared_ptr& input_stream,
+                                   const int32_t keys_base_offset, const int32_t keys_length,
+                                   const int32_t fixed_length)
+    : pool_(pool),
+      key_(std::move(key)),
+      code_(code),
+      offset_(offset),
+      size_(size),
+      factory_(factory),
+      input_stream_(input_stream),
+      keys_base_offset_(keys_base_offset),
+      keys_length_(keys_length),
+      fixed_length_(fixed_length),
+      deserializer_({}),
+      keys_stream_in_(nullptr),
+      keys_(nullptr),
+      remaining_keys_size_(0) {}
+
+// Write path
+FixedLengthChunk::FixedLengthChunk(const std::shared_ptr& pool, Literal key,
+                                   const int32_t code, const int32_t keys_length_limit,
+                                   const std::shared_ptr& factory,
+                                   const int32_t fixed_length)
+    : pool_(pool),
+      key_(std::move(key)),
+      code_(code),
+      offset_(0),
+      size_(0),
+      factory_(factory),
+      input_stream_(nullptr),
+      keys_base_offset_(0),
+      keys_length_(0),
+      fixed_length_(fixed_length),
+      deserializer_({}),
+      keys_stream_in_(nullptr),
+      keys_(nullptr),
+      remaining_keys_size_(keys_length_limit) {}
+
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h b/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h
new file
mode 100644
index 00000000..10f90db6
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "paimon/common/file_index/rangebitmap/dictionary/chunk.h"
+#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h"
+#include "paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h"
+#include "paimon/fs/file_system.h"
+#include "paimon/predicate/literal.h"
+#include "paimon/result.h"
+#include "paimon/status.h"
+
+namespace paimon {
+class DataInputStream;
+class InputStream;
+class MemoryPool;
+
+/// Chunk whose keys all occupy `fixed_length_` bytes. One constructor serves the read path (key
+/// bytes are loaded lazily from `input_stream_`), the other the write path (keys are buffered in
+/// `keys_stream_out_`).
+class FixedLengthChunk final : public Chunk {
+ public:
+    /// Version byte written at the start of a serialized chunk header.
+    static constexpr int8_t CURRENT_VERSION = 1;
+
+    /// Write path: appends `key`; yields false when the remaining key budget is too small.
+    Result TryAdd(const Literal& key) override;
+    /// Read path: returns the key at `index`; out-of-range indexes are rejected.
+    Result GetKey(int32_t index) override;
+    const Literal& Key() const override {
+        return key_;
+    }
+    int32_t Code() const override {
+        return code_;
+    }
+    int32_t Offset() const override {
+        return offset_;
+    }
+
+    void SetOffset(const int32_t offset) override {
+        offset_ = offset;
+    }
+
+    int32_t Size() const override {
+        return size_;
+    }
+    /// Serializes the chunk header (not the key bytes).
+    Result> SerializeChunk() const override;
+    /// Serializes the buffered key bytes.
+    Result> SerializeKeys() const override;
+    // For Read Path
+    FixedLengthChunk(const std::shared_ptr& pool, Literal key, int32_t code,
+                     int32_t offset, int32_t size, const std::shared_ptr& factory,
+                     const std::shared_ptr& input_stream, int32_t keys_base_offset,
+                     int32_t keys_length, int32_t fixed_length);
+    // For Write Path
+    FixedLengthChunk(const std::shared_ptr& pool, Literal key, int32_t code,
+                     int32_t keys_length_limit, const std::shared_ptr& factory,
+                     int32_t fixed_length);
+
+ private:
+    std::shared_ptr pool_;
+    Literal key_;     // representative key for binary search
+    int32_t code_;    // first code in this chunk
+    int32_t offset_;  // offset of this chunk
+    int32_t size_;    // number of keys in this chunk
+    std::shared_ptr factory_;
+
+    // For read path lazy keys loading
+    std::shared_ptr input_stream_;
+    int32_t keys_base_offset_;
+    int32_t keys_length_;
+    int32_t fixed_length_;
+    LiteralSerDeUtils::Deserializer deserializer_;
+    std::shared_ptr keys_stream_in_;
+    PAIMON_UNIQUE_PTR keys_;
+
+    // For write path
+    LiteralSerDeUtils::Serializer serializer_;
+    std::shared_ptr keys_stream_out_;
+    int64_t remaining_keys_size_;  // bytes still available for keys; reduced by TryAdd()
+};
+
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.cpp b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.cpp
new file mode 100644
index 00000000..c49a6f12
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h"
+
+#include "fmt/format.h"
+#include "paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h"
+#include "paimon/common/file_index/rangebitmap/dictionary/fixed_length_chunk.h"
+#include "paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h"
+#include "paimon/common/utils/field_type_utils.h"
+
+namespace paimon {
+
+// Returns the key factory for `field_type`, or Status::Invalid for any type without a case below.
+Result> KeyFactory::Create(const FieldType field_type) {
+    // todo: support timestamp
+    switch (field_type) {
+        case FieldType::BOOLEAN:
+            return std::make_shared();
+        case FieldType::TINYINT:
+            return std::make_shared();
+        case FieldType::SMALLINT:
+            return std::make_shared();
+        case FieldType::DATE:
+            return std::make_shared();
+        case FieldType::INT:
+            return std::make_shared();
+        case FieldType::BIGINT:
+            return std::make_shared();
+        case FieldType::FLOAT:
+            return std::make_shared();
+        case FieldType::DOUBLE:
+            return std::make_shared();
+        default:
+            return Status::Invalid(fmt::format("Unsupported field type for KeyFactory: {}",
+                                               FieldTypeUtils::FieldTypeToString(field_type)));
+    }
+}
+
+// Write path: creates an empty chunk whose representative key is `key` and whose key buffer may
+// hold at most `keys_length_limit` bytes.
+Result> FixedLengthKeyFactory::CreateChunk(
+    const std::shared_ptr& pool, const Literal& key, const int32_t code,
+    const int32_t keys_length_limit) {
+    return std::make_unique(pool, key, code, keys_length_limit,
+                            this->shared_from_this(), this->GetFieldSize());
+}
+
+// Read path: parses the chunk header stored at `chunk_offest` in `input_stream`. The key bytes
+// are not read here; the returned chunk loads them on demand relative to `keys_base_offset`.
+Result> FixedLengthKeyFactory::MmapChunk(
+    const std::shared_ptr& pool, const std::shared_ptr& input_stream,
+    const int32_t chunk_offest, const int32_t keys_base_offset) {
+    PAIMON_RETURN_NOT_OK(input_stream->Seek(chunk_offest, FS_SEEK_SET));
+    PAIMON_ASSIGN_OR_RAISE(LiteralSerDeUtils::Deserializer deserializer,
+                           LiteralSerDeUtils::CreateValueReader(GetFieldType()));
+    const auto data_in = std::make_shared(input_stream);
+    PAIMON_ASSIGN_OR_RAISE(int8_t version, data_in->ReadValue());
+    // The version byte is written by FixedLengthChunk::SerializeChunk(), so validate against that
+    // class's constant rather than the enclosing dictionary's.
+    if (version != FixedLengthChunk::CURRENT_VERSION) {
+        return Status::Invalid(fmt::format("Unsupported version for KeyFactory: {}", version));
+    }
+    PAIMON_ASSIGN_OR_RAISE(Literal key_literal, deserializer(data_in, pool.get()));
+    PAIMON_ASSIGN_OR_RAISE(int32_t code, data_in->ReadValue());
+    PAIMON_ASSIGN_OR_RAISE(int32_t offset, data_in->ReadValue());
+    PAIMON_ASSIGN_OR_RAISE(int32_t size, data_in->ReadValue());
+    PAIMON_ASSIGN_OR_RAISE(int32_t keys_length, data_in->ReadValue());
+    PAIMON_ASSIGN_OR_RAISE(int32_t fixed_length, data_in->ReadValue());
+    // Reject obviously corrupt headers before these values size buffers and seek offsets.
+    if (size < 0 || keys_length < 0 || fixed_length <= 0) {
+        return Status::Invalid(
+            fmt::format("Corrupted chunk header: size={}, keys_length={}, fixed_length={}", size,
+                        keys_length, fixed_length));
+    }
+    return std::make_unique(pool, key_literal, code, offset, size,
+                            this->shared_from_this(), input_stream,
+                            keys_base_offset, keys_length, fixed_length);
+}
+
+// Variable-length keys are not supported yet; both operations report NotImplemented.
+Result> VariableLengthKeyFactory::CreateChunk(
+    const std::shared_ptr& pool, const Literal& key, int32_t code,
+    int32_t keys_length_limit) {
+    return Status::NotImplemented("VariableLengthKeyFactory::CreateChunk not implemented");
+}
+Result> VariableLengthKeyFactory::MmapChunk(
+    const std::shared_ptr& pool, const std::shared_ptr& input_stream,
+    int32_t chunk_offest, int32_t keys_base_offset) {
+    return Status::NotImplemented("VariableLengthKeyFactory::MmapChunk not implemented");
+}
+
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h
new file mode 100644
index 00000000..75e1752f
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+#include "paimon/common/file_index/rangebitmap/dictionary/chunk.h"
+#include "paimon/common/io/memory_segment_output_stream.h"
+#include "paimon/defs.h"
+#include "paimon/predicate/literal.h"
+#include "paimon/result.h"
+
+namespace paimon {
+
+class InputStream;
+class MemoryPool;
+
+/// Per-field-type factory for dictionary chunks. Instances are expected to be owned by a
+/// shared_ptr, because the implementations pass shared_from_this() to the chunks they create.
+class KeyFactory : public std::enable_shared_from_this {
+ public:
+    virtual ~KeyFactory() = default;
+
+    virtual FieldType GetFieldType() const = 0;
+
+    /// For writing new chunk
+    virtual Result> CreateChunk(const std::shared_ptr& pool,
+                                const Literal& key, int32_t code,
+                                int32_t keys_length_limit) = 0;
+
+    /// For reading existing chunk, lazy loading keys in the directory
+    virtual Result> MmapChunk(
+        const std::shared_ptr& pool, const std::shared_ptr& input_stream,
+        int32_t chunk_offest, int32_t keys_base_offset) = 0;
+
+    /// Returns the factory matching `field_type`; unsupported types yield an error status.
+    static Result> Create(FieldType field_type);
+
+ public:
+    /// Default chunk size, expressed as a size string.
+    static constexpr char DEFAULT_CHUNK_SIZE[] = "16kb";
+};
+
+/// Base for field types whose serialized keys have a constant size (GetFieldSize()).
+class FixedLengthKeyFactory : public KeyFactory {
+ public:
+    Result> CreateChunk(const std::shared_ptr& pool,
+                        const Literal& key, int32_t code,
+                        int32_t keys_length_limit) override;
+    Result> MmapChunk(const std::shared_ptr& pool,
+                      const std::shared_ptr& input_stream,
+                      int32_t chunk_offest,
+                      int32_t keys_base_offset) override;
+    /// Size in bytes of one key of this type.
+    virtual size_t GetFieldSize() const = 0;
+};
+
+/// Base for variable-length key types. In key_factory.cpp both operations currently return
+/// Status::NotImplemented.
+class VariableLengthKeyFactory : public KeyFactory {
+ public:
+    Result> CreateChunk(const std::shared_ptr& pool,
+                        const Literal& key, int32_t code,
+                        int32_t keys_length_limit) override;
+    Result> MmapChunk(const std::shared_ptr& pool,
+                      const std::shared_ptr& input_stream,
+                      int32_t chunk_offest,
+                      int32_t keys_base_offset) override;
+};
+
+class DateKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::DATE;
+    }
+    size_t GetFieldSize() const override {
+        return sizeof(int32_t);
+    }
+};
+
+class IntKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::INT;
+    }
+    size_t GetFieldSize() const override {
+        return sizeof(int32_t);
+    }
+};
+
+class BigIntKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::BIGINT;
+    }
+    size_t GetFieldSize() const override {
+        return sizeof(int64_t);
+    }
+};
+
+class BooleanKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::BOOLEAN;
+    }
+    // NOTE(review): sizeof(bool) is implementation-defined; presumably the on-disk format
+    // assumes one byte per boolean key -- confirm against the serializer.
+    size_t GetFieldSize() const override {
+        return sizeof(bool);
+    }
+};
+
+class TinyIntKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::TINYINT;
+    }
+    size_t GetFieldSize() const override {
+        return sizeof(int8_t);
+    }
+};
+
+class SmallIntKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::SMALLINT;
+    }
+    size_t GetFieldSize() const override {
+        return sizeof(int16_t);
+    }
+};
+
+class FloatKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::FLOAT;
+    }
+    size_t GetFieldSize() const override {
+        return sizeof(float);
+    }
+};
+
+class DoubleKeyFactory final : public FixedLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::DOUBLE;
+    }
+    size_t GetFieldSize() const override {
+        return sizeof(double);
+    }
+};
+
+// NOTE(review): KeyFactory::Create() has no FieldType::STRING case, so this factory does not
+// appear to be reachable through Create() yet.
+class StringKeyFactory final : public VariableLengthKeyFactory {
+ public:
+    FieldType GetFieldType() const override {
+        return FieldType::STRING;
+    }
+};
+
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp
new file mode 100644
index 00000000..855d728a
--- /dev/null
+++
b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp
@@ -0,0 +1,335 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "paimon/common/file_index/rangebitmap/range_bitmap.h"
+
+#include
+
+#include "fmt/format.h"
+#include "paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h"
+#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h"
+#include "paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h"
+#include "paimon/common/io/data_output_stream.h"
+#include "paimon/common/io/memory_segment_output_stream.h"
+#include "paimon/common/memory/memory_segment_utils.h"
+#include "paimon/common/utils/field_type_utils.h"
+#include "paimon/io/data_input_stream.h"
+#include "paimon/memory/bytes.h"
+
+namespace paimon {
+
+// Reads the RangeBitmap header located at `offset` in `input_stream`. The dictionary and the
+// bit-slice index are not read here; only their offsets are computed so that GetDictionary() and
+// GetBitSliceIndex() can load them on first use.
+Result> RangeBitmap::Create(
+    const std::shared_ptr& input_stream, const int64_t offset,
+    const FieldType field_type, const std::shared_ptr& pool) {
+    PAIMON_RETURN_NOT_OK(input_stream->Seek(offset, SeekOrigin::FS_SEEK_SET));
+    const auto data_in = std::make_shared(input_stream);
+    PAIMON_ASSIGN_OR_RAISE(int32_t header_length, data_in->ReadValue());
+    PAIMON_ASSIGN_OR_RAISE(int8_t version, data_in->ReadValue());
+    if (version != CURRENT_VERSION) {
+        return Status::Invalid(fmt::format("RangeBitmap unsupported version {} (Expected {})",
+                                           version, CURRENT_VERSION));
+    }
+    PAIMON_ASSIGN_OR_RAISE(int32_t rid, data_in->ReadValue());
+    PAIMON_ASSIGN_OR_RAISE(int32_t cardinality, data_in->ReadValue());
+    PAIMON_ASSIGN_OR_RAISE(std::shared_ptr shared_key_factory,
+                           KeyFactory::Create(field_type));
+    PAIMON_ASSIGN_OR_RAISE(LiteralSerDeUtils::Deserializer key_deserializer,
+                           LiteralSerDeUtils::CreateValueReader(field_type));
+    // Appender::Serialize() omits min/max from the header when no non-null key was appended
+    // (cardinality == 0), so only read them when they are actually present.
+    Literal min{field_type};
+    Literal max{field_type};
+    if (cardinality > 0) {
+        PAIMON_ASSIGN_OR_RAISE(min, key_deserializer(data_in, pool.get()));
+        PAIMON_ASSIGN_OR_RAISE(max, key_deserializer(data_in, pool.get()));
+    }
+    PAIMON_ASSIGN_OR_RAISE(int32_t dictionary_length, data_in->ReadValue());
+    auto dictionary_offset = static_cast(offset + sizeof(int32_t) + header_length);
+    int32_t bsi_offset = dictionary_offset + dictionary_length;
+    return std::unique_ptr(new RangeBitmap(pool, rid, cardinality, dictionary_offset,
+                                           bsi_offset, min, max, shared_key_factory,
+                                           input_stream));
+}
+
+// Flips `bitmap` in place via Flip(0, rid_) and intersects it with the non-null rows, so rows
+// holding null never appear in a negated result.
+Result RangeBitmap::Not(RoaringBitmap32& bitmap) {
+    bitmap.Flip(0, rid_);
+    PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 is_not_null, this->IsNotNull());
+    return bitmap &= is_not_null;
+}
+
+// Rows whose value equals `key`; empty when the key lies outside [min_, max_] or is unknown.
+Result RangeBitmap::Eq(const Literal& key) {
+    if (cardinality_ <= 0) {
+        return RoaringBitmap32();
+    }
+    PAIMON_ASSIGN_OR_RAISE(int32_t min_compare, key.CompareTo(min_));
+    PAIMON_ASSIGN_OR_RAISE(int32_t max_compare, key.CompareTo(max_));
+    if (min_compare == 0 && max_compare == 0) {
+        // min == max == key: every non-null row matches.
+        return IsNotNull();
+    }
+    if (min_compare < 0 || max_compare > 0) {
+        return RoaringBitmap32();
+    }
+    PAIMON_ASSIGN_OR_RAISE(Dictionary* const dictionary, this->GetDictionary());
+    PAIMON_ASSIGN_OR_RAISE(int32_t code, dictionary->Find(key));
+    if (code < 0) {
+        return RoaringBitmap32();
+    }
+    // Load the bit-slice index only now that a lookup is really needed.
+    PAIMON_ASSIGN_OR_RAISE(BitSliceIndexBitmap* const bit_slice_ptr, this->GetBitSliceIndex());
+    return bit_slice_ptr->Eq(code);
+}
+
+// Negation of Eq(key), restricted to non-null rows (see Not()).
+Result RangeBitmap::Neq(const Literal& key) {
+    if (cardinality_ <= 0) {
+        return RoaringBitmap32();
+    }
+    PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 eq_result, Eq(key));
+    return Not(eq_result);
+}
+
+// Rows whose value is strictly less than `key`, computed as the negation of Gte(key).
+Result RangeBitmap::Lt(const Literal& key) {
+    if (cardinality_ <= 0) {
+        return RoaringBitmap32();
+    }
+    PAIMON_ASSIGN_OR_RAISE(int32_t min_compare, key.CompareTo(min_));
+    PAIMON_ASSIGN_OR_RAISE(int32_t max_compare, key.CompareTo(max_));
+    if (max_compare > 0) {
+        return IsNotNull();
+    }
+    if (min_compare <= 0) {
+        return RoaringBitmap32();
+    }
+    PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 gte_result, Gte(key));
+    return Not(gte_result);
+}
+
+// Rows whose value is less than or equal to `key`, computed as the negation of Gt(key).
+Result RangeBitmap::Lte(const Literal& key) {
+    if (cardinality_ <= 0) {
+        return RoaringBitmap32();
+    }
+    PAIMON_ASSIGN_OR_RAISE(int32_t min_compare, key.CompareTo(min_));
+    PAIMON_ASSIGN_OR_RAISE(int32_t max_compare, key.CompareTo(max_));
+    if (max_compare >= 0) {
+        return IsNotNull();
+    }
+    if (min_compare < 0) {
+        return RoaringBitmap32();
+    }
+    PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 gt_result, Gt(key));
+    return Not(gt_result);
+}
+
+// Rows whose value is strictly greater than `key`.
+Result RangeBitmap::Gt(const Literal& key) {
+    if (cardinality_ <= 0) {
+        return RoaringBitmap32();
+    }
+    PAIMON_ASSIGN_OR_RAISE(int32_t max_compare, key.CompareTo(max_));
+    if (max_compare >= 0) {
+        return RoaringBitmap32();
+    }
+    PAIMON_ASSIGN_OR_RAISE(int32_t min_compare, key.CompareTo(min_));
+    if (min_compare < 0) {
+        return IsNotNull();
+    }
+    PAIMON_ASSIGN_OR_RAISE(Dictionary* const dictionary, this->GetDictionary());
+    PAIMON_ASSIGN_OR_RAISE(int32_t code, dictionary->Find(key));
+    PAIMON_ASSIGN_OR_RAISE(BitSliceIndexBitmap* const bit_slice_ptr, this->GetBitSliceIndex());
+    if (code >= 0) {
+        return bit_slice_ptr->Gt(code);
+    }
+    // A negative code means `key` itself is not in the dictionary; -code - 1 is then used as the
+    // inclusive lower bound.
+    return bit_slice_ptr->Gte(-code - 1);
+}
+
+// Union of Gt(key) and Eq(key).
+Result RangeBitmap::Gte(const Literal& key) {
+    PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 gt_result, Gt(key));
+    PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 eq_result, Eq(key));
+    gt_result |= eq_result;
+    return gt_result;
+}
+
+// Union of Eq(key) over all `keys`.
+Result RangeBitmap::In(const std::vector& keys) {
+    if (cardinality_ <= 0) {
+        return RoaringBitmap32();
+    }
+    RoaringBitmap32 result{};
+    for (const auto& key : keys) {
+        PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 bitmap, Eq(key));
+        result |= bitmap;
+    }
+    return result;
+}
+
+// Negation of In(keys), restricted to non-null rows (see Not()).
+Result RangeBitmap::NotIn(const std::vector& keys) {
+    if (cardinality_ <= 0) {
+        return RoaringBitmap32();
+    }
+    PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 in_result, In(keys));
+    return Not(in_result);
+}
+
+// Rows with a null value, i.e. the complement of IsNotNull() over the indexed rows.
+Result RangeBitmap::IsNull() {
+    if (cardinality_ <= 0) {
+        if (rid_ > 0) {
+            RoaringBitmap32 result;
+            result.AddRange(0, rid_);
+            return result;
+        }
+        return RoaringBitmap32();
+    }
+
+    PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 non_null_bitmap, IsNotNull());
+    non_null_bitmap.Flip(0, rid_);
+    return non_null_bitmap;
+}
+
+// Rows with a non-null value, as reported by the bit-slice index.
+Result RangeBitmap::IsNotNull() {
+    if (cardinality_ <= 0) {
+        return RoaringBitmap32();
+    }
+
+    PAIMON_ASSIGN_OR_RAISE(BitSliceIndexBitmap* const bit_slice_ptr, this->GetBitSliceIndex());
+    PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 result, bit_slice_ptr->IsNotNull({}));
+    return result;
+}
+
+RangeBitmap::RangeBitmap(const std::shared_ptr& pool, const int32_t rid,
+                         const int32_t cardinality, const int32_t dictionary_offset,
+                         const int32_t bsi_offset, const Literal& min, const Literal& max,
+                         const std::shared_ptr& key_factory,
+                         const std::shared_ptr& input_stream)
+    : pool_(pool),
+      rid_(rid),
+      cardinality_(cardinality),
+      bsi_offset_(bsi_offset),
+      dictionary_offset_(dictionary_offset),
+      min_(min),
+      max_(max),
+      key_factory_(key_factory),
+      input_stream_(input_stream),
+      bsi_(nullptr),
+      dictionary_(nullptr) {}
+
+// Returns the bit-slice index, creating it from `input_stream_` on first use.
+Result RangeBitmap::GetBitSliceIndex() {
+    if (bsi_ == nullptr) {
+        PAIMON_ASSIGN_OR_RAISE(bsi_,
+                               BitSliceIndexBitmap::Create(pool_, input_stream_, bsi_offset_));
+    }
+    return bsi_.get();
+}
+
+// Returns the dictionary, creating it from `input_stream_` on first use.
+Result RangeBitmap::GetDictionary() {
+    if (dictionary_ == nullptr) {
+        PAIMON_ASSIGN_OR_RAISE(dictionary_,
+                               ChunkedDictionary::Create(pool_, key_factory_->GetFieldType(),
+                                                         input_stream_, dictionary_offset_));
+    }
+    return dictionary_.get();
+}
+
+RangeBitmap::Appender::Appender(const std::shared_ptr& pool,
+                                const std::shared_ptr& factory,
+                                const int64_t limited_serialized_size_in_bytes)
+    : pool_(pool),
+      rid_(0),
+      factory_(factory),
+      limited_serialized_size_in_bytes_(limited_serialized_size_in_bytes) {}
+
+// Records the current row id under `key` (null keys are skipped); the row id always advances.
+void RangeBitmap::Appender::Append(const Literal& key) {
+    if (!key.IsNull()) {
+        bitmaps_[key].Add(rid_);
+    }
+    rid_++;
+}
+
+// Layout written: header length, header (version, rid, cardinality, optional min/max, dictionary
+// length), dictionary bytes, bit-slice index bytes.
+Result> RangeBitmap::Appender::Serialize() const {
+    int32_t code = 0;
+    // Signed arithmetic: an empty appender yields -1 without relying on unsigned wrap-around
+    // followed by a narrowing conversion.
+    const int32_t max_code = static_cast<int32_t>(bitmaps_.size()) - 1;
+    auto bsi = BitSliceIndexBitmap::Appender(pool_, 0, max_code);
+    auto dictionary = ChunkedDictionary::Appender(
+        pool_, factory_, static_cast(limited_serialized_size_in_bytes_));
+    for (const auto& [key, bitmap] : bitmaps_) {
+        PAIMON_RETURN_NOT_OK(dictionary.AppendSorted(key, code));
+        for (auto it = bitmap.Begin(); it != bitmap.End(); ++it) {
+            PAIMON_RETURN_NOT_OK(bsi.Append(*it, code));
+        }
+        code++;
+    }
+    PAIMON_ASSIGN_OR_RAISE(LiteralSerDeUtils::Serializer serializer,
+                           LiteralSerDeUtils::CreateValueWriter(factory_->GetFieldType()));
+    Literal min{factory_->GetFieldType()};
+    Literal max{factory_->GetFieldType()};
+    if (!bitmaps_.empty()) {
+        min = bitmaps_.begin()->first;
+        max = bitmaps_.rbegin()->first;
+    }
+    PAIMON_ASSIGN_OR_RAISE(int32_t min_size, LiteralSerDeUtils::GetSerializedSizeInBytes(min));
+    PAIMON_ASSIGN_OR_RAISE(int32_t max_size, LiteralSerDeUtils::GetSerializedSizeInBytes(max));
+    int32_t header_size = 0;
+    header_size += sizeof(int8_t);               // version
+    header_size += sizeof(int32_t);              // rid
+    header_size += sizeof(int32_t);              // cardinality
+    header_size += min.IsNull() ? 0 : min_size;  // min literal size
+    header_size += max.IsNull() ? 0 : max_size;  // max literal size
+    header_size += sizeof(int32_t);              // dictionary length
+    PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR dictionary_bytes, dictionary.Serialize());
+    auto dictionary_length = static_cast(dictionary_bytes->size());
+    PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR bsi_bytes, bsi.Serialize());
+    size_t bsi_length = bsi_bytes->size();
+    const auto data_output_stream = std::make_shared(
+        MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_);
+    data_output_stream->WriteValue(header_size);
+    data_output_stream->WriteValue(CURRENT_VERSION);
+    data_output_stream->WriteValue(rid_);
+    data_output_stream->WriteValue(static_cast(bitmaps_.size()));
+    if (!min.IsNull()) {
+        PAIMON_RETURN_NOT_OK(serializer(data_output_stream, min));
+    }
+    if (!max.IsNull()) {
+        PAIMON_RETURN_NOT_OK(serializer(data_output_stream, max));
+    }
+    data_output_stream->WriteValue(dictionary_length);
+    data_output_stream->Write(dictionary_bytes->data(), dictionary_length);
+    data_output_stream->Write(bsi_bytes->data(), bsi_length);
+    return MemorySegmentUtils::CopyToBytes(data_output_stream->Segments(), 0,
+                                           static_cast(data_output_stream->CurrentSize()),
+                                           pool_.get());
+}
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap.h b/src/paimon/common/file_index/rangebitmap/range_bitmap.h
new file mode 100644
index 00000000..7967c4de
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/range_bitmap.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h"
+#include "paimon/common/file_index/rangebitmap/dictionary/dictionary.h"
+#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h"
+#include "paimon/fs/file_system.h"
+#include "paimon/predicate/literal.h"
+#include "paimon/result.h"
+#include "paimon/utils/roaring_bitmap32.h"
+
+namespace paimon {
+
+class InputStream;
+class MemoryPool;
+
+/// Reader for a serialized range bitmap. Each predicate method returns the matching row ids as a
+/// bitmap; the dictionary and the bit-slice index are loaded lazily (see the members below).
+class RangeBitmap {
+ public:
+    /// Parses the header stored at `offset` in `input_stream` for a column of `field_type`.
+    static Result> Create(
+        const std::shared_ptr& input_stream, int64_t offset, FieldType field_type,
+        const std::shared_ptr& pool);
+
+    Result Eq(const Literal& key);
+    Result Neq(const Literal& key);
+    Result Lt(const Literal& key);
+    Result Lte(const Literal& key);
+    Result Gt(const Literal& key);
+    Result Gte(const Literal& key);
+    Result In(const std::vector& keys);
+    Result NotIn(const std::vector& keys);
+    Result IsNull();
+    Result IsNotNull();
+
+ public:
+    /// Version byte of the serialized header; Create() rejects any other value.
+    static constexpr int8_t CURRENT_VERSION = 1;
+
+ private:
+    /// Negates `bitmap` in place and removes rows holding null.
+    Result Not(RoaringBitmap32& bitmap);
+
+    RangeBitmap(const std::shared_ptr& pool, int32_t rid, int32_t cardinality,
+                int32_t dictionary_offset, int32_t bsi_offset, const Literal& min,
+                const Literal& max, const std::shared_ptr& key_factory,
+                const std::shared_ptr& input_stream);
+    Result GetBitSliceIndex();
+    Result GetDictionary();
+
+ private:
+    std::shared_ptr pool_;
+    int32_t rid_;          // row count; the appender increments it once per appended value
+    int32_t cardinality_;  // number of distinct non-null keys, as written by the appender
+    int32_t bsi_offset_;
+    int32_t dictionary_offset_;
+    Literal min_;
+    Literal max_;
+    std::shared_ptr key_factory_;
+    std::shared_ptr input_stream_;
+
+    // For lazy loading
+    std::unique_ptr bsi_;
+    std::unique_ptr dictionary_;
+
+ public:
+    /// Collects one value per row and serializes the range bitmap.
+    class Appender {
+     public:
+        Appender(const std::shared_ptr& pool,
+                 const std::shared_ptr& factory,
+                 int64_t limited_serialized_size_in_bytes);
+        /// Appends the value of the next row; null values only advance the row id.
+        void Append(const Literal& key);
+        Result> Serialize() const;
+
+     private:
+        // Orders the map keys by Literal::CompareTo.
+        // NOTE(review): a failed comparison is treated as "not less", which silently hides the
+        // error and could make two incomparable keys look equivalent -- confirm that only
+        // comparable literals can reach Append().
+        struct LiteralComparator {
+            bool operator()(const Literal& lhs, const Literal& rhs) const {
+                const auto result = lhs.CompareTo(rhs);
+                return result.ok() && result.value() < 0;
+            }
+        };
+        std::shared_ptr pool_;
+        int32_t rid_;
+        std::map bitmaps_;
+        std::shared_ptr factory_;
+        int64_t limited_serialized_size_in_bytes_;
+    };
+};
+
+}  // namespace paimon
diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp
new file mode 100644
index 00000000..2b9f9bfd
--- /dev/null
+++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp
@@ -0,0 +1,222 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "paimon/common/file_index/rangebitmap/range_bitmap_file_index.h" + +#include +#include + +#include + +#include "paimon/common/file_index/rangebitmap/range_bitmap.h" +#include "paimon/common/options/memory_size.h" +#include "paimon/common/predicate/literal_converter.h" +#include "paimon/common/utils/arrow/status_utils.h" +#include "paimon/common/utils/field_type_utils.h" +#include "paimon/file_index/bitmap_index_result.h" +#include "paimon/predicate/literal.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon { + +RangeBitmapFileIndex::RangeBitmapFileIndex(const std::map& options) + : options_(options) {} + +Result> RangeBitmapFileIndex::CreateReader( + ::ArrowSchema* const arrow_schema, const int32_t start, const int32_t length, + const std::shared_ptr& input_stream, + const std::shared_ptr& pool) const { + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr arrow_schema_ptr, + arrow::ImportSchema(arrow_schema)); + if (arrow_schema_ptr->num_fields() != 1) { + return Status::Invalid( + "invalid schema for RangeBitmapFileIndexReader, supposed to have single field."); + } + const auto arrow_type = arrow_schema_ptr->field(0)->type(); + return RangeBitmapFileIndexReader::Create(arrow_type, start, length, input_stream, pool); +} + +Result> RangeBitmapFileIndex::CreateWriter( + ::ArrowSchema* arrow_schema, const std::shared_ptr& pool) const { + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr arrow_schema_ptr, + arrow::ImportSchema(arrow_schema)); + if (arrow_schema_ptr->num_fields() != 1) { + return Status::Invalid( + "invalid schema for RangeBitmapFileIndexWriter, supposed to have single field."); + } + const auto arrow_field = arrow_schema_ptr->field(0); + return RangeBitmapFileIndexWriter::Create(arrow_schema_ptr, arrow_field->name(), options_, + pool); +} + +Result> RangeBitmapFileIndexWriter::Create( + const std::shared_ptr& arrow_schema, const std::string& field_name, + const std::map& options, const std::shared_ptr& pool) 
{ + const auto field = arrow_schema->GetFieldByName(field_name); + if (!field) { + return Status::Invalid(fmt::format("Field not found in schema: {}", field_name)); + } + PAIMON_ASSIGN_OR_RAISE(FieldType field_type, + FieldTypeUtils::ConvertToFieldType(field->type()->id())); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr shared_key_factory, + KeyFactory::Create(field_type)); + PAIMON_ASSIGN_OR_RAISE(int64_t parsed_chunk_size, + MemorySize::ParseBytes(KeyFactory::DEFAULT_CHUNK_SIZE)); + if (const auto chunk_size_it = options.find(RangeBitmapFileIndex::CHUNK_SIZE); + chunk_size_it != options.end()) { + PAIMON_ASSIGN_OR_RAISE(parsed_chunk_size, MemorySize::ParseBytes(chunk_size_it->second)); + } + if (parsed_chunk_size > std::numeric_limits::max()) { + return Status::Invalid("Chunk size must be less than 4GB"); + } + auto appender_ptr = + std::make_unique(pool, shared_key_factory, parsed_chunk_size); + return std::make_shared( + field->type(), options, pool, shared_key_factory, std::move(appender_ptr)); +} + +Status RangeBitmapFileIndexWriter::AddBatch(::ArrowArray* batch) { + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr array, + arrow::ImportArray(batch, arrow_type_)); + PAIMON_ASSIGN_OR_RAISE(std::vector array_values, + LiteralConverter::ConvertLiteralsFromArray(*array, true)); + for (const auto& literal : array_values) { + appender_->Append(literal); + } + return Status::OK(); +} + +Result> RangeBitmapFileIndexWriter::SerializedBytes() const { + return appender_->Serialize(); +} + +RangeBitmapFileIndexWriter::RangeBitmapFileIndexWriter( + const std::shared_ptr& arrow_type, + const std::map& options, const std::shared_ptr& pool, + const std::shared_ptr& key_factory, std::unique_ptr appender) + : arrow_type_(arrow_type), + options_(options), + pool_(pool), + key_factory_(key_factory), + appender_(std::move(appender)) {} + +Result> RangeBitmapFileIndexReader::Create( + const std::shared_ptr& arrow_type, const int32_t start, const int32_t length, + const 
std::shared_ptr& input_stream, const std::shared_ptr& pool) { + if (!arrow_type || !input_stream || !pool) { + return Status::Invalid("RangeBitmapFileIndexReader::Create: null argument"); + } + PAIMON_ASSIGN_OR_RAISE(FieldType field_type, + FieldTypeUtils::ConvertToFieldType(arrow_type->id())); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr range_bitmap, + RangeBitmap::Create(input_stream, start, field_type, pool)); + return std::shared_ptr( + new RangeBitmapFileIndexReader(std::move(range_bitmap))); +} + +RangeBitmapFileIndexReader::RangeBitmapFileIndexReader(std::unique_ptr range_bitmap) + : range_bitmap_(std::move(range_bitmap)) {} + +Result> RangeBitmapFileIndexReader::VisitEqual( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->Eq(literal); + }); +} + +Result> RangeBitmapFileIndexReader::VisitNotEqual( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->Neq(literal); + }); +} + +Result> RangeBitmapFileIndexReader::VisitIn( + const std::vector& literals) { + return std::make_shared( + [self = shared_from_this(), literals]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->In(literals); + }); +} + +Result> RangeBitmapFileIndexReader::VisitNotIn( + const std::vector& literals) { + return std::make_shared( + [self = shared_from_this(), literals]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->NotIn(literals); + }); +} + +Result> RangeBitmapFileIndexReader::VisitIsNull() { + return std::make_shared( + [self = shared_from_this()]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->IsNull(); + }); +} + +Result> RangeBitmapFileIndexReader::VisitIsNotNull() 
{ + return std::make_shared( + [self = shared_from_this()]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->IsNotNull(); + }); +} + +Result> RangeBitmapFileIndexReader::VisitGreaterThan( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->Gt(literal); + }); +} + +Result> RangeBitmapFileIndexReader::VisitLessThan( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->Lt(literal); + }); +} + +Result> RangeBitmapFileIndexReader::VisitGreaterOrEqual( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->Gte(literal); + }); +} + +Result> RangeBitmapFileIndexReader::VisitLessOrEqual( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + if (!self->range_bitmap_) return RoaringBitmap32(); + return self->range_bitmap_->Lte(literal); + }); +} + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h new file mode 100644 index 00000000..2a557f20 --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h @@ -0,0 +1,110 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "paimon/common/file_index/bitmap/bitmap_file_index.h" +#include "paimon/common/file_index/rangebitmap/range_bitmap.h" +#include "paimon/file_index/file_index_reader.h" +#include "paimon/file_index/file_index_writer.h" +#include "paimon/file_index/file_indexer.h" +#include "paimon/predicate/literal.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon { + +class RangeBitmapFileIndexWriter; +class RangeBitmapFileIndexReader; + +class RangeBitmapFileIndex final : public FileIndexer { + public: + explicit RangeBitmapFileIndex(const std::map& options); + + ~RangeBitmapFileIndex() override = default; + + Result> CreateReader( + ::ArrowSchema* arrow_schema, int32_t start, int32_t length, + const std::shared_ptr& input_stream, + const std::shared_ptr& pool) const override; + + Result> CreateWriter( + ::ArrowSchema* arrow_schema, const std::shared_ptr& pool) const override; + + public: + static constexpr char CHUNK_SIZE[] = "chunk-size"; + + private: + std::map options_; +}; + +class RangeBitmapFileIndexWriter final : public FileIndexWriter { + public: + static Result> Create( + const std::shared_ptr& arrow_schema, const std::string& field_name, + const std::map& options, const std::shared_ptr& pool); + + Status AddBatch(::ArrowArray* batch) override; + Result> SerializedBytes() const override; + + RangeBitmapFileIndexWriter(const std::shared_ptr& arrow_type, + const std::map& options, + const std::shared_ptr& pool, + const std::shared_ptr& key_factory, + 
std::unique_ptr appender); + + public: + static constexpr char DEFAULT_CHUNK_SIZE[] = "16kb"; + + private: + std::shared_ptr arrow_type_; + std::map options_; + std::shared_ptr pool_; + std::shared_ptr key_factory_; + std::unique_ptr appender_; +}; + +class RangeBitmapFileIndexReader final + : public FileIndexReader, + public std::enable_shared_from_this { + public: + static Result> Create( + const std::shared_ptr& arrow_type, int32_t start, int32_t length, + const std::shared_ptr& input_stream, const std::shared_ptr& pool); + + private: + explicit RangeBitmapFileIndexReader(std::unique_ptr range_bitmap); + + Result> VisitEqual(const Literal& literal) override; + Result> VisitNotEqual(const Literal& literal) override; + Result> VisitIn(const std::vector& literals) override; + Result> VisitNotIn( + const std::vector& literals) override; + Result> VisitIsNull() override; + Result> VisitIsNotNull() override; + Result> VisitGreaterThan(const Literal& literal) override; + Result> VisitLessThan(const Literal& literal) override; + Result> VisitGreaterOrEqual(const Literal& literal) override; + Result> VisitLessOrEqual(const Literal& literal) override; + + std::unique_ptr range_bitmap_; +}; + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.cpp new file mode 100644 index 00000000..c27736fd --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.cpp @@ -0,0 +1,32 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h" + +#include "paimon/common/file_index/rangebitmap/range_bitmap_file_index.h" + +namespace paimon { + +RangeBitmapFileIndexFactory::~RangeBitmapFileIndexFactory() = default; + +Result> RangeBitmapFileIndexFactory::Create( + const std::map& options) const { + return std::make_unique(options); +} + +REGISTER_PAIMON_FACTORY(RangeBitmapFileIndexFactory); + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h new file mode 100644 index 00000000..844bd5d2 --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h @@ -0,0 +1,41 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include "paimon/file_index/file_indexer.h" +#include "paimon/file_index/file_indexer_factory.h" +#include "paimon/result.h" + +namespace paimon { + +class PAIMON_EXPORT RangeBitmapFileIndexFactory final : public FileIndexerFactory { + public: + const char* Identifier() const override { + return "range-bitmap"; + } + + ~RangeBitmapFileIndexFactory() override; + + Result> Create( + const std::map& options) const override; +}; + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_test.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_test.cpp new file mode 100644 index 00000000..8aa81dae --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_test.cpp @@ -0,0 +1,482 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "paimon/common/file_index/rangebitmap/range_bitmap_file_index.h" + +#include + +#include + +#include "arrow/api.h" +#include "arrow/c/bridge.h" +#include "paimon/file_index/bitmap_index_result.h" +#include "paimon/file_index/file_index_format.h" +#include "paimon/file_index/file_indexer_factory.h" +#include "paimon/fs/file_system.h" +#include "paimon/fs/local/local_file_system.h" +#include "paimon/io/byte_array_input_stream.h" +#include "paimon/memory/memory_pool.h" +#include "paimon/predicate/literal.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { + +class RangeBitmapFileIndexTest : public ::testing::Test { + public: + void SetUp() override { + pool_ = GetDefaultPool(); + fs_ = std::make_shared(); + } + + void TearDown() override { + index_buffer_.reset(); + pool_.reset(); + fs_.reset(); + } + + static void CheckResult(const std::shared_ptr& result, + const std::vector& expected) { + const auto typed_result = std::dynamic_pointer_cast(result); + ASSERT_TRUE(typed_result); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap32* bitmap, typed_result->GetBitmap()); + ASSERT_TRUE(bitmap); + const RoaringBitmap32 expected_bitmap = RoaringBitmap32::From(expected); + ASSERT_EQ(*bitmap, expected_bitmap) + << "result=" << bitmap->ToString() << ", expected=" << expected_bitmap.ToString(); + } + + protected: + std::shared_ptr pool_; + + private: + std::shared_ptr fs_; + std::shared_ptr index_buffer_; +}; + +// Helper function to create writer, serialize, and create reader +template +Result> CreateReaderForTest( + RangeBitmapFileIndexTest* test, const std::shared_ptr& arrow_type, + const std::vector& test_data, PAIMON_UNIQUE_PTR* serialized_bytes_out) { + return CreateReaderForTest(test, arrow_type, test_data, {}, + serialized_bytes_out); +} + +// Overload with options to exercise writer configuration such as chunk size. 
+template +Result> CreateReaderForTest( + RangeBitmapFileIndexTest* test, const std::shared_ptr& arrow_type, + const std::vector& test_data, const std::map& options, + PAIMON_UNIQUE_PTR* serialized_bytes_out) { + // Create Arrow array from test data + auto builder = std::make_shared(); + auto status = builder->AppendValues(test_data); + if (!status.ok()) { + return Status::Invalid(fmt::format("Failed to append values: {}", status.ToString())); + } + std::shared_ptr arrow_array; + status = builder->Finish(&arrow_array); + if (!status.ok()) { + return Status::Invalid(fmt::format("Failed to finish builder: {}", status.ToString())); + } + auto c_array = std::make_unique<::ArrowArray>(); + status = arrow::ExportArray(*arrow_array, c_array.get()); + if (!status.ok()) { + return Status::Invalid(fmt::format("Failed to export array: {}", status.ToString())); + } + // Create schema for the field + const auto schema = arrow::schema({arrow::field("test_field", arrow_type)}); + // Create writer + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr writer, + RangeBitmapFileIndexWriter::Create(schema, "test_field", options, test->pool_)); + // Add the batch + PAIMON_RETURN_NOT_OK(writer->AddBatch(c_array.get())); + // Get serialized payload + PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR serialized_bytes, writer->SerializedBytes()); + if (!serialized_bytes || serialized_bytes->size() == 0) { + return Status::Invalid("Serialized bytes is empty"); + } + *serialized_bytes_out = std::move(serialized_bytes); + const auto input_stream = std::make_shared( + (*serialized_bytes_out)->data(), (*serialized_bytes_out)->size()); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr reader, + RangeBitmapFileIndexReader::Create( + arrow_type, 0, static_cast((*serialized_bytes_out)->size()), + input_stream, test->pool_)); + return reader; +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexMultiChunk) { + // Use many distinct values and a very small chunk size to force multiple + // dictionary chunks 
when writing the range bitmap index. + std::vector test_data; + test_data.reserve(100); + for (int32_t i = 0; i < 100; ++i) { + test_data.push_back(i); + } + + const auto& arrow_type = arrow::int32(); + std::map options; + // Configure a very small chunk size in bytes so that the dictionary must + // be split into multiple chunks. + options[RangeBitmapFileIndex::CHUNK_SIZE] = "86b"; + + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, + (CreateReaderForTest( + this, arrow_type, test_data, options, &serialized_bytes))); + ASSERT_OK_AND_ASSIGN(auto eq_0_result, reader->VisitEqual(Literal(static_cast(0)))); + CheckResult(eq_0_result, {0}); + + ASSERT_OK_AND_ASSIGN(auto eq_50_result, reader->VisitEqual(Literal(static_cast(50)))); + CheckResult(eq_50_result, {50}); + ASSERT_OK_AND_ASSIGN(auto eq_51_result, reader->VisitEqual(Literal(static_cast(51)))); + CheckResult(eq_51_result, {51}); + ASSERT_OK_AND_ASSIGN(auto eq_99_result, reader->VisitEqual(Literal(static_cast(99)))); + CheckResult(eq_99_result, {99}); + + ASSERT_OK_AND_ASSIGN(auto gt_49_result, + reader->VisitGreaterThan(Literal(static_cast(49)))); + // Positions 50..99 + std::vector expected_gt_49; + expected_gt_49.reserve(50); + for (int32_t i = 50; i < 100; ++i) { + expected_gt_49.push_back(i); + } + CheckResult(gt_49_result, expected_gt_49); + + ASSERT_OK_AND_ASSIGN(auto lt_10_result, + reader->VisitLessThan(Literal(static_cast(10)))); + // Positions 0..9 + std::vector expected_lt_10; + expected_lt_10.reserve(10); + for (int32_t i = 0; i < 10; ++i) { + expected_lt_10.push_back(i); + } + CheckResult(lt_10_result, expected_lt_10); + + // is_not_null should cover all rows. 
+ std::vector all_positions(100); + for (int32_t i = 0; i < 100; ++i) { + all_positions[i] = i; + } + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexBigInt) { + std::vector test_data = {10, 20, 10, 30, 20, 40, 50}; + const auto& arrow_type = arrow::int64(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + this, arrow_type, test_data, &serialized_bytes))); + + // Test equality queries + ASSERT_OK_AND_ASSIGN(auto eq_10_result, reader->VisitEqual(Literal(static_cast(10)))); + CheckResult(eq_10_result, {0, 2}); // positions 0 and 2 have value 10 + ASSERT_OK_AND_ASSIGN(auto eq_20_result, reader->VisitEqual(Literal(static_cast(20)))); + CheckResult(eq_20_result, {1, 4}); // positions 1 and 4 have value 20 + ASSERT_OK_AND_ASSIGN(auto eq_30_result, reader->VisitEqual(Literal(static_cast(30)))); + CheckResult(eq_30_result, {3}); // position 3 has value 30 + ASSERT_OK_AND_ASSIGN(auto eq_40_result, reader->VisitEqual(Literal(static_cast(40)))); + CheckResult(eq_40_result, {5}); // position 5 has value 40 + ASSERT_OK_AND_ASSIGN(auto eq_50_result, reader->VisitEqual(Literal(static_cast(50)))); + CheckResult(eq_50_result, {6}); // position 6 has value 50 + + // Test range queries + ASSERT_OK_AND_ASSIGN(auto gt_25_result, + reader->VisitGreaterThan(Literal(static_cast(25)))); + CheckResult(gt_25_result, {3, 5, 6}); // values > 25: 30, 40, 50 + + ASSERT_OK_AND_ASSIGN(auto lt_35_result, + reader->VisitLessThan(Literal(static_cast(35)))); + CheckResult(lt_35_result, {0, 1, 2, 3, 4}); // values < 35: 10, 20, 10, 30, 20 + ASSERT_OK_AND_ASSIGN(auto gte_20_result, + reader->VisitGreaterOrEqual(Literal(static_cast(20)))); + CheckResult(gte_20_result, {1, 3, 4, 5, 6}); // values >= 20: 20, 30, 20, 40, 50 + ASSERT_OK_AND_ASSIGN(auto lte_40_result, + 
reader->VisitLessOrEqual(Literal(static_cast(40)))); + CheckResult(lte_40_result, {0, 1, 2, 3, 4, 5}); // values <= 40: 10, 20, 10, 30, 20, 40 + + // Test IN queries + std::vector in_values = {Literal(static_cast(10)), + Literal(static_cast(30))}; + ASSERT_OK_AND_ASSIGN(auto in_result, reader->VisitIn(in_values)); + CheckResult(in_result, {0, 2, 3}); // positions with values 10 or 30 + ASSERT_OK_AND_ASSIGN(auto not_in_result, reader->VisitNotIn(in_values)); + CheckResult(not_in_result, {1, 4, 5, 6}); // positions with values NOT 10 or 30 + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); + CheckResult(is_null_result, {}); // no null values + std::vector all_positions = {0, 1, 2, 3, 4, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); // all positions are not null +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexInt) { + std::vector test_data = {10, 20, 10, 30, 20, 40, 50}; + const auto& arrow_type = arrow::int32(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + this, arrow_type, test_data, &serialized_bytes))); + + // Test equality queries + ASSERT_OK_AND_ASSIGN(auto eq_10_result, reader->VisitEqual(Literal(static_cast(10)))); + CheckResult(eq_10_result, {0, 2}); + ASSERT_OK_AND_ASSIGN(auto eq_20_result, reader->VisitEqual(Literal(static_cast(20)))); + CheckResult(eq_20_result, {1, 4}); + ASSERT_OK_AND_ASSIGN(auto eq_30_result, reader->VisitEqual(Literal(static_cast(30)))); + CheckResult(eq_30_result, {3}); + + // Test range queries + ASSERT_OK_AND_ASSIGN(auto gt_25_result, + reader->VisitGreaterThan(Literal(static_cast(25)))); + CheckResult(gt_25_result, {3, 5, 6}); // values > 25: 30, 40, 50 + ASSERT_OK_AND_ASSIGN(auto lt_35_result, + reader->VisitLessThan(Literal(static_cast(35)))); + CheckResult(lt_35_result, {0, 1, 2, 3, 4}); // values < 35 + ASSERT_OK_AND_ASSIGN(auto is_null_result, 
reader->VisitIsNull()); + CheckResult(is_null_result, {}); + std::vector all_positions = {0, 1, 2, 3, 4, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); + ASSERT_OK_AND_ASSIGN(auto gte_20_result, + reader->VisitGreaterOrEqual(Literal(static_cast(20)))); + CheckResult(gte_20_result, {1, 3, 4, 5, 6}); + ASSERT_OK_AND_ASSIGN(auto lte_40_result, + reader->VisitLessOrEqual(Literal(static_cast(40)))) + CheckResult(lte_40_result, {0, 1, 2, 3, 4, 5}); + + // Test empty result cases for INT values that don't exist + ASSERT_OK_AND_ASSIGN(auto eq_nonexistent_int_result, + reader->VisitEqual(Literal(static_cast(25)))); + CheckResult(eq_nonexistent_int_result, {}); // 25 doesn't exist in data {10,20,30,40,50} + + ASSERT_OK_AND_ASSIGN(auto eq_out_of_range_high_int_result, + reader->VisitEqual(Literal(static_cast(100)))); + CheckResult(eq_out_of_range_high_int_result, {}); // Value above maximum (50) + + ASSERT_OK_AND_ASSIGN(auto eq_out_of_range_low_int_result, + reader->VisitEqual(Literal(static_cast(5)))); + CheckResult(eq_out_of_range_low_int_result, {}); // Value below minimum (10) + + // Test NotEqual operations + ASSERT_OK_AND_ASSIGN(auto ne_10_result, + reader->VisitNotEqual(Literal(static_cast(10)))); + CheckResult(ne_10_result, {1, 3, 4, 5, 6}); // All positions except {0, 2} where 10 appears + + ASSERT_OK_AND_ASSIGN(auto ne_nonexistent_result, + reader->VisitNotEqual(Literal(static_cast(99)))); + CheckResult(ne_nonexistent_result, {0, 1, 2, 3, 4, 5, 6}); // All positions (non-empty result) + + // Test NotIn operations + ASSERT_OK_AND_ASSIGN(auto not_in_single_result, + reader->VisitNotIn({Literal(static_cast(10))})); + CheckResult(not_in_single_result, {1, 3, 4, 5, 6}); // All positions except where 10 appears + + ASSERT_OK_AND_ASSIGN( + auto not_in_multiple_result, + reader->VisitNotIn({Literal(static_cast(10)), Literal(static_cast(20))})); + CheckResult(not_in_multiple_result, {3, 5, 
6}); // Positions not containing 10 or 20 + + ASSERT_OK_AND_ASSIGN(auto not_in_nonexistent_result, + reader->VisitNotIn({Literal(static_cast(99))})); + CheckResult(not_in_nonexistent_result, + {0, 1, 2, 3, 4, 5, 6}); // All positions (non-empty result) + + // Test NotIn with empty result - all values are NOT IN the complete set + std::vector all_values = { + Literal(static_cast(10)), Literal(static_cast(20)), + Literal(static_cast(30)), Literal(static_cast(40)), + Literal(static_cast(50))}; + ASSERT_OK_AND_ASSIGN(auto not_in_all_result, reader->VisitNotIn(all_values)); + CheckResult(not_in_all_result, + {}); // Empty result - no positions left when excluding all existing values +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexSmallInt) { + std::vector test_data = {10, 20, 10, 30, 20, 40, 50}; + const auto& arrow_type = arrow::int16(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + this, arrow_type, test_data, &serialized_bytes))); + ASSERT_OK_AND_ASSIGN(auto eq_10_result, reader->VisitEqual(Literal(static_cast(10)))); + CheckResult(eq_10_result, {0, 2}); + ASSERT_OK_AND_ASSIGN(auto eq_20_result, reader->VisitEqual(Literal(static_cast(20)))); + CheckResult(eq_20_result, {1, 4}); + ASSERT_OK_AND_ASSIGN(auto eq_30_result, reader->VisitEqual(Literal(static_cast(30)))); + CheckResult(eq_30_result, {3}); + ASSERT_OK_AND_ASSIGN(auto gt_25_result, + reader->VisitGreaterThan(Literal(static_cast(25)))); + CheckResult(gt_25_result, {3, 5, 6}); // values > 25: 30, 40, 50 + ASSERT_OK_AND_ASSIGN(auto lt_35_result, + reader->VisitLessThan(Literal(static_cast(35)))); + CheckResult(lt_35_result, {0, 1, 2, 3, 4}); // values < 35 + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); + CheckResult(is_null_result, {}); + std::vector all_positions = {0, 1, 2, 3, 4, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); +} + 
+TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexTinyInt) { + std::vector test_data = {10, 20, 10, 30, 20, 40, 50}; + const auto& arrow_type = arrow::int8(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + this, arrow_type, test_data, &serialized_bytes))); + ASSERT_OK_AND_ASSIGN(auto eq_10_result, reader->VisitEqual(Literal(static_cast(10)))); + CheckResult(eq_10_result, {0, 2}); + ASSERT_OK_AND_ASSIGN(auto eq_20_result, reader->VisitEqual(Literal(static_cast(20)))); + CheckResult(eq_20_result, {1, 4}); + ASSERT_OK_AND_ASSIGN(auto eq_30_result, reader->VisitEqual(Literal(static_cast(30)))); + CheckResult(eq_30_result, {3}); + ASSERT_OK_AND_ASSIGN(auto gt_25_result, + reader->VisitGreaterThan(Literal(static_cast(25)))); + CheckResult(gt_25_result, {3, 5, 6}); // values > 25: 30, 40, 50 + ASSERT_OK_AND_ASSIGN(auto lt_35_result, + reader->VisitLessThan(Literal(static_cast(35)))); + CheckResult(lt_35_result, {0, 1, 2, 3, 4}); // values < 35 + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); + CheckResult(is_null_result, {}); + std::vector all_positions = {0, 1, 2, 3, 4, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexBoolean) { + std::vector test_data = {true, false, true, true, false, true, false}; + const auto& arrow_type = arrow::boolean(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + this, arrow_type, test_data, &serialized_bytes))); + ASSERT_OK_AND_ASSIGN(auto eq_true_result, reader->VisitEqual(Literal(true))); + CheckResult(eq_true_result, {0, 2, 3, 5}); // positions with value true + ASSERT_OK_AND_ASSIGN(auto eq_false_result, reader->VisitEqual(Literal(false))); + CheckResult(eq_false_result, {1, 4, 6}); // positions with value false + ASSERT_OK_AND_ASSIGN(auto 
is_null_result, reader->VisitIsNull()); + CheckResult(is_null_result, {}); + std::vector all_positions = {0, 1, 2, 3, 4, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexFloat) { + std::vector test_data = {10.5f, 20.3f, 10.5f, 30.7f, 20.3f, 40.1f, 50.9f}; + const auto& arrow_type = arrow::float32(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + this, arrow_type, test_data, &serialized_bytes))); + ASSERT_OK_AND_ASSIGN(auto eq_10_5_result, reader->VisitEqual(Literal(10.5f))); + CheckResult(eq_10_5_result, {0, 2}); // positions with value 10.5 + ASSERT_OK_AND_ASSIGN(auto eq_20_3_result, reader->VisitEqual(Literal(20.3f))); + CheckResult(eq_20_3_result, {1, 4}); // positions with value 20.3 + ASSERT_OK_AND_ASSIGN(auto eq_30_7_result, reader->VisitEqual(Literal(30.7f))); + CheckResult(eq_30_7_result, {3}); // position with value 30.7 + ASSERT_OK_AND_ASSIGN(auto gt_24_9_result, reader->VisitGreaterThan(Literal(24.9f))); + CheckResult(gt_24_9_result, {3, 5, 6}); // values > 25.0: 30.7, 40.1, 50.9 + ASSERT_OK_AND_ASSIGN(auto lt_35_result, reader->VisitLessThan(Literal(35.0f))); + CheckResult(lt_35_result, {0, 1, 2, 3, 4}); // values < 35.0 + + // Test empty result cases for float values that don't exist + ASSERT_OK_AND_ASSIGN(auto eq_nonexistent_float_result, reader->VisitEqual(Literal(25.0f))); + CheckResult(eq_nonexistent_float_result, {}); // 25.0 doesn't exist in data + + ASSERT_OK_AND_ASSIGN(auto eq_out_of_range_high_result, reader->VisitEqual(Literal(100.0f))); + CheckResult(eq_out_of_range_high_result, {}); // Value above maximum + + ASSERT_OK_AND_ASSIGN(auto eq_out_of_range_low_result, reader->VisitEqual(Literal(5.0f))); + CheckResult(eq_out_of_range_low_result, {}); // Value below minimum + + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); + 
CheckResult(is_null_result, {}); + std::vector all_positions = {0, 1, 2, 3, 4, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexDouble) { + std::vector test_data = {10.5, 20.3, 10.5, 30.7, 20.3, 40.1, 50.9}; + const auto& arrow_type = arrow::float64(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + this, arrow_type, test_data, &serialized_bytes))); + ASSERT_OK_AND_ASSIGN(auto eq_10_5_result, reader->VisitEqual(Literal(10.5))); + CheckResult(eq_10_5_result, {0, 2}); // positions with value 10.5 + ASSERT_OK_AND_ASSIGN(auto eq_20_3_result, reader->VisitEqual(Literal(20.3))); + CheckResult(eq_20_3_result, {1, 4}); // positions with value 20.3 + ASSERT_OK_AND_ASSIGN(auto eq_30_7_result, reader->VisitEqual(Literal(30.7))); + CheckResult(eq_30_7_result, {3}); // position with value 30.7 + ASSERT_OK_AND_ASSIGN(auto gt_24_9_result, reader->VisitGreaterThan(Literal(24.9))); + CheckResult(gt_24_9_result, {3, 5, 6}); // values > 24.9: 30.7, 40.1, 50.9 + ASSERT_OK_AND_ASSIGN(auto lt_35_result, reader->VisitLessThan(Literal(35.0))); + CheckResult(lt_35_result, {0, 1, 2, 3, 4}); // values < 35.0 + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); + CheckResult(is_null_result, {}); + std::vector all_positions = {0, 1, 2, 3, 4, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexDate) { + std::vector test_data = {42432, 24649, 42432, 38001, 24649, 50000, 12000}; + const auto& arrow_type = arrow::date32(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + this, arrow_type, test_data, &serialized_bytes))); + ASSERT_OK_AND_ASSIGN(auto eq_42432_result, 
reader->VisitEqual(Literal(FieldType::DATE, 42432))); + CheckResult(eq_42432_result, {0, 2}); + ASSERT_OK_AND_ASSIGN(auto eq_24649_result, reader->VisitEqual(Literal(FieldType::DATE, 24649))); + CheckResult(eq_24649_result, {1, 4}); + ASSERT_OK_AND_ASSIGN(auto eq_38001_result, reader->VisitEqual(Literal(FieldType::DATE, 38001))); + CheckResult(eq_38001_result, {3}); + ASSERT_OK_AND_ASSIGN(auto gt_result, + reader->VisitGreaterOrEqual(Literal(FieldType::DATE, 30000))); + CheckResult(gt_result, {0, 2, 3, 5}); // 42432, 38001, 50000 + + ASSERT_OK_AND_ASSIGN(auto lt_result, reader->VisitLessThan(Literal(FieldType::DATE, 40000))); + CheckResult(lt_result, {1, 3, 4, 6}); // 24649, 38001, 12000 + + // Test empty result cases - values that don't exist in the data + ASSERT_OK_AND_ASSIGN(auto eq_nonexistent_low_result, + reader->VisitEqual(Literal(FieldType::DATE, 47432))); + CheckResult(eq_nonexistent_low_result, {}); + + ASSERT_OK_AND_ASSIGN(auto eq_nonexistent_mid_result, + reader->VisitEqual(Literal(FieldType::DATE, 30000))); + CheckResult(eq_nonexistent_mid_result, {}); // Value in middle range but doesn't exist + + ASSERT_OK_AND_ASSIGN(auto eq_nonexistent_high_result, + reader->VisitEqual(Literal(FieldType::DATE, 60000))); + CheckResult(eq_nonexistent_high_result, {}); // Value above maximum (50000) + + // Test range queries that should return empty results + ASSERT_OK_AND_ASSIGN(auto gt_all_result, + reader->VisitGreaterOrEqual(Literal(FieldType::DATE, 60000))); + CheckResult(gt_all_result, {}); // Greater than maximum should return empty + + ASSERT_OK_AND_ASSIGN(auto lt_all_result, + reader->VisitLessThan(Literal(FieldType::DATE, 10000))); + CheckResult(lt_all_result, {}); // Less than minimum should return empty + + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); + CheckResult(is_null_result, {}); + std::vector all_positions = {0, 1, 2, 3, 4, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + 
CheckResult(is_not_null_result, all_positions); +} + +} // namespace paimon::test diff --git a/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.cpp b/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.cpp new file mode 100644 index 00000000..bada5916 --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.cpp @@ -0,0 +1,220 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h" + +#include "fmt/format.h" +#include "paimon/common/utils/field_type_utils.h" +#include "paimon/memory/bytes.h" + +namespace paimon { + +Result LiteralSerDeUtils::CreateValueWriter( + const FieldType field_type) { + switch (field_type) { + case FieldType::BOOLEAN: + return LiteralSerDeUtils::Serializer( + [](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { + output_stream->WriteValue(literal.GetValue()); + return Status::OK(); + }); + case FieldType::TINYINT: + return LiteralSerDeUtils::Serializer( + [](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { + output_stream->WriteValue(literal.GetValue()); + return Status::OK(); + }); + case FieldType::SMALLINT: + return LiteralSerDeUtils::Serializer( + [](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { + output_stream->WriteValue(literal.GetValue()); + return Status::OK(); + }); + case FieldType::DATE: + case FieldType::INT: + return LiteralSerDeUtils::Serializer( + [](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { + output_stream->WriteValue(literal.GetValue()); + return Status::OK(); + }); + case FieldType::BIGINT: + return LiteralSerDeUtils::Serializer( + [](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { + output_stream->WriteValue(literal.GetValue()); + return Status::OK(); + }); + case FieldType::FLOAT: + return LiteralSerDeUtils::Serializer( + [](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { + output_stream->WriteValue(literal.GetValue()); + return Status::OK(); + }); + case FieldType::DOUBLE: + return LiteralSerDeUtils::Serializer( + [](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { + output_stream->WriteValue(literal.GetValue()); + return Status::OK(); + }); + case FieldType::STRING: { + return LiteralSerDeUtils::Serializer( + 
[](const std::shared_ptr& output_stream, + const Literal& literal) -> Status { + const auto value = literal.GetValue(); + output_stream->WriteValue(static_cast(value.size())); + output_stream->Write(value.data(), value.size()); + return Status::OK(); + }); + } + default: + return Status::Invalid( + fmt::format("Unsupported field type for literal serialization: {}", + FieldTypeUtils::FieldTypeToString(field_type))); + } +} + +Result LiteralSerDeUtils::CreateValueReader(FieldType field_type) { + switch (field_type) { + case FieldType::BOOLEAN: { + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(bool value, input_stream->ReadValue()); + return Literal(value); + }); + } + case FieldType::TINYINT: { + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(int8_t value, input_stream->ReadValue()); + return Literal(value); + }); + } + case FieldType::SMALLINT: { + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(int16_t value, input_stream->ReadValue()); + return Literal(value); + }); + } + case FieldType::DATE: { + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(int32_t value, input_stream->ReadValue()); + return Literal(FieldType::DATE, value); + }); + } + case FieldType::INT: { + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(int32_t value, input_stream->ReadValue()); + return Literal(value); + }); + } + case FieldType::BIGINT: { + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(int64_t value, input_stream->ReadValue()); + return 
Literal(value); + }); + } + case FieldType::FLOAT: { + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(float value, input_stream->ReadValue()); + return Literal(value); + }); + } + case FieldType::DOUBLE: { + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(double value, input_stream->ReadValue()); + return Literal(value); + }); + } + case FieldType::STRING: { + return LiteralSerDeUtils::Deserializer( + [](const std::shared_ptr& input_stream, + MemoryPool* pool) -> Result { + PAIMON_ASSIGN_OR_RAISE(int32_t length, input_stream->ReadValue()); + auto bytes = Bytes::AllocateBytes(length, pool); + PAIMON_RETURN_NOT_OK(input_stream->ReadBytes(bytes.get())); + return Literal(FieldType::STRING, bytes->data(), bytes->size()); + }); + } + default: + return Status::Invalid( + fmt::format("Unsupported field type for literal deserialization: {}", + FieldTypeUtils::FieldTypeToString(field_type))); + } +} + +Result LiteralSerDeUtils::GetFixedFieldSize(const FieldType& field_type) { + switch (field_type) { + case FieldType::BOOLEAN: + case FieldType::TINYINT: + return sizeof(int8_t); + case FieldType::SMALLINT: + return sizeof(int16_t); + case FieldType::DATE: + case FieldType::INT: + return sizeof(int32_t); + case FieldType::BIGINT: + return sizeof(int64_t); + case FieldType::FLOAT: + return sizeof(float); + case FieldType::DOUBLE: + return sizeof(double); + default: + return Status::Invalid(fmt::format("Unsupported field type for GetFixedFieldSize: {}", + FieldTypeUtils::FieldTypeToString(field_type))); + } +} + +Result LiteralSerDeUtils::GetSerializedSizeInBytes(const Literal& literal) { + switch (literal.GetType()) { + case FieldType::BOOLEAN: + case FieldType::TINYINT: + case FieldType::SMALLINT: + case FieldType::DATE: + case FieldType::INT: + case FieldType::BIGINT: + case FieldType::DOUBLE: + 
case FieldType::FLOAT: + return GetFixedFieldSize(literal.GetType()); + case FieldType::STRING: + return static_cast(sizeof(int32_t) + literal.GetValue().size()); + default: + return Status::Invalid( + fmt::format("Unsupported field type for GetSerializedSizeInBytes: {}", + FieldTypeUtils::FieldTypeToString(literal.GetType()))); + } +} + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h b/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h new file mode 100644 index 00000000..f7e7bbb5 --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h @@ -0,0 +1,50 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include "paimon/common/io/memory_segment_output_stream.h" +#include "paimon/io/data_input_stream.h" +#include "paimon/predicate/literal.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon { + +class LiteralSerDeUtils { + public: + LiteralSerDeUtils() = delete; + + ~LiteralSerDeUtils() = delete; + + using Serializer = + std::function&, const Literal&)>; + using Deserializer = std::function( + const std::shared_ptr& input_stream, MemoryPool* pool)>; + + static Result CreateValueReader(FieldType field_type); + + static Result CreateValueWriter(FieldType field_type); + + static Result GetFixedFieldSize(const FieldType& field_type); + + static Result GetSerializedSizeInBytes(const Literal& literal); +}; + +} // namespace paimon diff --git a/src/paimon/common/io/data_input_stream.cpp b/src/paimon/common/io/data_input_stream.cpp index 32df0254..44fa2f5c 100644 --- a/src/paimon/common/io/data_input_stream.cpp +++ b/src/paimon/common/io/data_input_stream.cpp @@ -122,4 +122,5 @@ template Result DataInputStream::ReadValue() const; template Result DataInputStream::ReadValue() const; template Result DataInputStream::ReadValue() const; template Result DataInputStream::ReadValue() const; +template Result DataInputStream::ReadValue() const; } // namespace paimon diff --git a/test/inte/read_inte_with_index_test.cpp b/test/inte/read_inte_with_index_test.cpp index bd26f187..957d62b9 100644 --- a/test/inte/read_inte_with_index_test.cpp +++ b/test/inte/read_inte_with_index_test.cpp @@ -45,6 +45,7 @@ #include "paimon/data/timestamp.h" #include "paimon/defs.h" #include "paimon/factories/factory_creator.h" +#include "paimon/fs/local/local_file_system.h" #include "paimon/memory/bytes.h" #include "paimon/memory/memory_pool.h" #include "paimon/metrics.h" @@ -372,6 +373,295 @@ class ReadInteWithIndexTest : public testing::Test, } } + void CheckResultForRangeBitmap(const std::string& path, + const std::shared_ptr& 
arrow_data_type, + const std::shared_ptr& split) const { + { + // test with no predicate - return all 8 rows + std::shared_ptr expected_array; + auto array_status = arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, + { + R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 3, 200, 2.2, 22.22, 19725, "row1"] +])", + R"([ +[0, 5, 300, 3.3, 33.33, 19727, "row2"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"] +])", + R"([ +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, null, null, null, null, null, null] +])", + R"([ +[0, null, null, null, null, null, "null_row"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + std::cout << array_status.message() << std::endl; + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, /*predicate=*/nullptr, expected_array); + } + { + // Test equal predicate: f0 = 17 -> row 0 + auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(17)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test less than predicate: f0 < 10 -> rows 1,2,3,4 (values 3,5,7,9) + auto predicate = PredicateBuilder::LessThan(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(10)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 3, 200, 2.2, 22.22, 19725, "row1"], +[0, 5, 300, 3.3, 33.33, 19727, "row2"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test greater than predicate: f0 > 5 -> rows 0,3,4,7 (values 17,7,9,10) + auto predicate = 
PredicateBuilder::GreaterThan(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(5)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test is null predicate on f0 -> rows 5, 6 + auto predicate = + PredicateBuilder::IsNull(/*field_index=*/0, /*field_name=*/"f0", FieldType::INT); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, null, null, null, null, null, null], +[0, null, null, null, null, null, "null_row"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test is not null predicate on f0 -> rows 0,1,2,3,4,7 + auto predicate = + PredicateBuilder::IsNotNull(/*field_index=*/0, /*field_name=*/"f0", FieldType::INT); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 3, 200, 2.2, 22.22, 19725, "row1"], +[0, 5, 300, 3.3, 33.33, 19727, "row2"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test in predicate: f0 in (3, 7) -> rows 1, 3 + auto predicate = PredicateBuilder::In( + /*field_index=*/0, /*field_name=*/"f0", FieldType::INT, {Literal(3), Literal(7)}); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 3, 
200, 2.2, 22.22, 19725, "row1"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test not in predicate: f0 not in (3, 7) -> rows 0,2,4,7 (excluding null rows 5,6) + auto predicate = PredicateBuilder::NotIn( + /*field_index=*/0, /*field_name=*/"f0", FieldType::INT, {Literal(3), Literal(7)}); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 5, 300, 3.3, 33.33, 19727, "row2"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] + ])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + + // Test f1 (BIGINT) predicates + { + // Test greater than predicate: f1 > 300 -> rows 3,4,7 (values 400,500,600) + auto predicate = PredicateBuilder::GreaterThan(/*field_index=*/1, /*field_name=*/"f1", + FieldType::BIGINT, Literal(300L)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + + // Test f2 (FLOAT) predicates + { + // Test less than predicate: f2 < 4.0 -> rows 0,1,2 (values 1.1,2.2,3.3) + auto predicate = PredicateBuilder::LessThan(/*field_index=*/2, /*field_name=*/"f2", + FieldType::FLOAT, Literal(4.0f)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 3, 200, 2.2, 22.22, 19725, "row1"], +[0, 5, 300, 3.3, 33.33, 19727, "row2"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); 
+ CheckResult(path, {split}, predicate, expected_array); + } + // Test date type + { + // Test less or equal predicate: f4 <= 19725 -> row 1 (value 19725) + auto predicate = + PredicateBuilder::LessOrEqual(/*field_index=*/4, /*field_name=*/"f4", + FieldType::DATE, Literal(FieldType::DATE, 19725)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 3, 200, 2.2, 22.22, 19725, "row1"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + + // Test f3 (DOUBLE) predicates + { + // Test greater or equal predicate: f3 >= 44.44 -> rows 3,4,7 (values 44.44,55.55,66.66) + auto predicate = PredicateBuilder::GreaterOrEqual( + /*field_index=*/3, /*field_name=*/"f3", FieldType::DOUBLE, Literal(44.44)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + + // Test BETWEEN predicate on f1 (BIGINT) + { + // Test f1 BETWEEN 200 AND 500 -> rows 1,2,3,4 (values 200,300,400,500) + auto predicate = + PredicateBuilder::Between(/*field_index=*/1, /*field_name=*/"f1", FieldType::BIGINT, + Literal(200L), Literal(500L)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 3, 200, 2.2, 22.22, 19725, "row1"], +[0, 5, 300, 3.3, 33.33, 19727, "row2"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + + // Test IN predicate on f2 (FLOAT) + { + // Test f2 IN (1.1, 4.4, 6.6) -> rows 0,3,7 
(values 1.1,4.4,6.6) + auto predicate = + PredicateBuilder::In(/*field_index=*/2, /*field_name=*/"f2", FieldType::FLOAT, + {Literal(1.1f), Literal(4.4f), Literal(6.6f)}); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test nested composite: (f0 = 3 OR f0 = 17) AND f1 < 200 + // (f0 = 3 OR f0 = 17): matches rows 0,1 + // f1 < 200: matches row 0 (f1=100) + // Combined AND: matches row 0 + auto predicate1 = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(3)); + auto predicate2 = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(17)); + ASSERT_OK_AND_ASSIGN(auto or_predicate, PredicateBuilder::Or({predicate1, predicate2})); + + auto predicate3 = PredicateBuilder::LessThan(/*field_index=*/1, /*field_name=*/"f1", + FieldType::BIGINT, Literal(200L)); + ASSERT_OK_AND_ASSIGN(auto and_predicate, + PredicateBuilder::And({or_predicate, predicate3})); + + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, and_predicate, expected_array); + } + { + // Test AND predicate with mixed types: f0 > 5 AND f1 > 100 + // f0 > 5: matches rows 0,3,4,7 + // f1 > 100: matches rows 1,2,3,4,7 + // Combined AND: matches rows 3,4,7 + auto predicate1 = PredicateBuilder::GreaterThan(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(5)); + auto predicate2 = PredicateBuilder::GreaterThan(/*field_index=*/1, /*field_name=*/"f1", + FieldType::BIGINT, Literal(100L)); + 
ASSERT_OK_AND_ASSIGN(auto and_predicate, + PredicateBuilder::And({predicate1, predicate2})); + + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, and_predicate, expected_array); + } + } + void CheckResultForBsi(const std::string& path, const std::shared_ptr& arrow_data_type, const std::shared_ptr split) const { @@ -2072,6 +2362,95 @@ TEST_P(ReadInteWithIndexTest, TestWithIndexWithoutRegistered) { } } +TEST_P(ReadInteWithIndexTest, TestRangeBitmapIndex) { + auto [file_format, enable_prefetch] = GetParam(); + std::string path = + GetDataDir() + file_format + "/append_with_rangebitmap.db/append_with_rangebitmap/"; + std::string file_name; + if (file_format == "orc") { + file_name = "data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc"; + } else if (file_format == "parquet") { + file_name = "data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet"; + } + + std::vector read_fields = {SpecialFields::ValueKind(), + DataField(0, arrow::field("f0", arrow::int32())), + DataField(1, arrow::field("f1", arrow::int64())), + DataField(2, arrow::field("f2", arrow::float32())), + DataField(3, arrow::field("f3", arrow::float64())), + DataField(4, arrow::field("f4", arrow::date32())), + DataField(5, arrow::field("f5", arrow::utf8()))}; + std::shared_ptr arrow_data_type = + DataField::ConvertDataFieldsToArrowStructType(read_fields); + + auto data_file_meta = std::make_shared( + file_name, /*file_size=*/1288, + /*row_count=*/8, /*min_key=*/BinaryRow::EmptyRow(), + /*max_key=*/BinaryRow::EmptyRow(), /*key_stats=*/SimpleStats::EmptyStats(), + /*value_stats=*/SimpleStats::EmptyStats(), /*min_sequence_number=*/0, + /*max_sequence_number=*/7, /*schema_id=*/0, + /*level=*/0, + /*extra_files=*/ + 
std::vector>({file_name + ".index"}), + /*creation_time=*/Timestamp(0ll, 0), /*delete_row_count=*/0, + /*embedded_index=*/nullptr, FileSource::Append(), + /*value_stats_cols=*/std::nullopt, + /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); + + DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, + /*bucket_path=*/path + "bucket-0/", {data_file_meta}); + ASSERT_OK_AND_ASSIGN(auto split, + builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); + + // Run comprehensive range bitmap index tests + CheckResultForRangeBitmap(path, arrow_data_type, split); +} + +TEST_P(ReadInteWithIndexTest, TestRangeBitmapIndexMultiChunk) { + auto [file_format, enable_prefetch] = GetParam(); + std::string path = + GetDataDir() + file_format + + "/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/"; + std::string file_name; + if (file_format == "orc") { + file_name = "data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc"; + } else if (file_format == "parquet") { + file_name = "data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet"; + } + + std::vector read_fields = {SpecialFields::ValueKind(), + DataField(0, arrow::field("f0", arrow::int32())), + DataField(1, arrow::field("f1", arrow::int64())), + DataField(2, arrow::field("f2", arrow::float32())), + DataField(3, arrow::field("f3", arrow::float64())), + DataField(4, arrow::field("f4", arrow::date32())), + DataField(5, arrow::field("f5", arrow::utf8()))}; + std::shared_ptr arrow_data_type = + DataField::ConvertDataFieldsToArrowStructType(read_fields); + + auto data_file_meta = std::make_shared( + file_name, /*file_size=*/1413, + /*row_count=*/8, /*min_key=*/BinaryRow::EmptyRow(), + /*max_key=*/BinaryRow::EmptyRow(), /*key_stats=*/SimpleStats::EmptyStats(), + /*value_stats=*/SimpleStats::EmptyStats(), /*min_sequence_number=*/0, + /*max_sequence_number=*/7, /*schema_id=*/0, + /*level=*/0, + /*extra_files=*/ + std::vector>({file_name + ".index"}), 
+ /*creation_time=*/Timestamp(0ll, 0), /*delete_row_count=*/0, + /*embedded_index=*/nullptr, FileSource::Append(), + /*value_stats_cols=*/std::nullopt, + /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); + + DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, + /*bucket_path=*/path + "bucket-0/", {data_file_meta}); + ASSERT_OK_AND_ASSIGN(auto split, + builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); + + // Run range bitmap index tests with multi-chunk test data + CheckResultForRangeBitmap(path, arrow_data_type, split); +} + TEST_P(ReadInteWithIndexTest, TestWithIOException) { auto [file_format, enable_prefetch] = GetParam(); std::string path = GetDataDir() + "/" + file_format + diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/README b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/README new file mode 100644 index 00000000..52eb2755 --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/README @@ -0,0 +1,16 @@ +f0:int (nullable), f1:long (nullable), f2:float (nullable), f3:double (nullable), f4:date (nullable), f5:string (nullable) +no partition key +no bucket key +bucket count: -1 +range-bitmap index: f0,f1,f2,f3,f4 + +Rows (snapshot-1): +Add: [17, 100L, 1.1f, 11.11, date('2024-01-17'), 'row0'] +Add: [3, 200L, 2.2f, 22.22, date('2024-01-03'), 'row1'] +Add: [5, 300L, 3.3f, 33.33, date('2024-01-05'), 'row2'] +Add: [7, 400L, 4.4f, 44.44, date('2024-01-07'), 'row3'] +Add: [9, 500L, 5.5f, 55.55, date('2024-01-09'), 'row4'] +Add: [null, null, null, null, null, null] +Add: [null, null, null, null, null, 'null_row'] +Add: [10, 600L, 6.6f, 66.66, date('2024-01-10'), 'row7'] +NoCompact \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc 
b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc new file mode 100644 index 00000000..fe2d4db1 Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc differ diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc.index b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc.index new file mode 100644 index 00000000..18ea5e60 Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc.index differ diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-642b5c6e-a8d6-46d0-bd3b-686478c89f6b-0 b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-642b5c6e-a8d6-46d0-bd3b-686478c89f6b-0 new file mode 100644 index 00000000..854ee21b Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-642b5c6e-a8d6-46d0-bd3b-686478c89f6b-0 differ diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-0 b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-0 new file mode 100644 index 00000000..4786f9ea Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-0 differ diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-1 
b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-1 new file mode 100644 index 00000000..a8d36f53 Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-1 differ diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 new file mode 100644 index 00000000..0d24cba7 --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 @@ -0,0 +1,39 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "f0", + "type" : "INT" + }, { + "id" : 1, + "name" : "f1", + "type" : "BIGINT" + }, { + "id" : 2, + "name" : "f2", + "type" : "FLOAT" + }, { + "id" : 3, + "name" : "f3", + "type" : "DOUBLE" + }, { + "id" : 4, + "name" : "f4", + "type" : "DATE" + }, { + "id" : 5, + "name" : "f5", + "type" : "STRING" + } ], + "highestFieldId" : 5, + "partitionKeys" : [ ], + "primaryKeys" : [ ], + "options" : { + "file-index.range-bitmap.columns" : "f0,f1,f2,f3,f4", + "owner" : "xiaoheng", + "file.format" : "orc", + "file-index.in-manifest-threshold" : "1B" + }, + "timeMillis" : 1772177550729 +} \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST new file mode 100644 index 
00000000..56a6051c --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 new file mode 100644 index 00000000..e30127af --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-64353b80-fb7c-470e-972a-07d0717af717-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-64353b80-fb7c-470e-972a-07d0717af717-1", + "deltaManifestListSize" : 1113, + "commitUser" : "67cff790-9276-4301-aa3c-469a17418ac9", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1772177555995, + "totalRecordCount" : 8, + "deltaRecordCount" : 8, + "nextRowId" : 0 +} \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README new file mode 100644 index 00000000..f308976e --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README @@ -0,0 +1,17 @@ +f0:int (nullable), f1:long (nullable), f2:float (nullable), f3:double (nullable), f4:date (nullable), f5:string (nullable) +no partition key +no bucket key +bucket count: -1 +range-bitmap index: f0,f1,f2,f3,f4 +range-bitmap index chunk-size: 16B + +Rows (snapshot-1): +Add: [17, 100L, 1.1f, 11.11, date('2024-01-17'), 'row0'] +Add: [3, 200L, 2.2f, 22.22, date('2024-01-03'), 'row1'] +Add: [5, 300L, 3.3f, 33.33, date('2024-01-05'), 'row2'] +Add: [7, 400L, 4.4f, 44.44, date('2024-01-07'), 'row3'] +Add: [9, 500L, 5.5f, 
55.55, date('2024-01-09'), 'row4'] +Add: [null, null, null, null, null, null] +Add: [null, null, null, null, null, 'null_row'] +Add: [10, 600L, 6.6f, 66.66, date('2024-01-10'), 'row7'] +NoCompact \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc new file mode 100644 index 00000000..fe2d4db1 Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc differ diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc.index b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc.index new file mode 100644 index 00000000..343734e7 Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc.index differ diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-71437504-f8ad-4b7d-be04-28203480227d-0 b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-71437504-f8ad-4b7d-be04-28203480227d-0 new file mode 100644 index 00000000..a78f32bf Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-71437504-f8ad-4b7d-be04-28203480227d-0 differ diff --git 
a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-0 b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-0 new file mode 100644 index 00000000..6e7d09e9 Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-0 differ diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-1 b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-1 new file mode 100644 index 00000000..156d6840 Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-1 differ diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 new file mode 100644 index 00000000..096f0300 --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 @@ -0,0 +1,44 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "f0", + "type" : "INT" + }, { + "id" : 1, + "name" : "f1", + "type" : "BIGINT" + }, { + "id" : 2, + "name" : "f2", + "type" : "FLOAT" + }, { + "id" : 3, + "name" : "f3", + "type" : "DOUBLE" + }, { + "id" : 4, + "name" : "f4", + "type" : "DATE" + }, { + "id" : 5, + "name" : "f5", + "type" : "STRING" + } ], + "highestFieldId" : 5, + "partitionKeys" : [ ], + "primaryKeys" : [ ], + 
"options" : { + "file-index.range-bitmap.columns" : "f0,f1,f2,f3,f4", + "owner" : "xiaoheng", + "file-index.range-bitmap.f1.chunk-size" : "16B", + "file-index.range-bitmap.f2.chunk-size" : "16B", + "file-index.range-bitmap.f3.chunk-size" : "16B", + "file-index.in-manifest-threshold" : "1B", + "file-index.range-bitmap.f4.chunk-size" : "16B", + "file.format" : "orc", + "file-index.range-bitmap.f0.chunk-size" : "16B" + }, + "timeMillis" : 1772188734852 +} \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 new file mode 100644 index 00000000..1e8a9f72 --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-0", + "baseManifestListSize" : 
1006, + "deltaManifestList" : "manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-1", + "deltaManifestListSize" : 1106, + "commitUser" : "162120aa-5242-438d-bb0e-96ee933b3313", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1772188737678, + "totalRecordCount" : 8, + "deltaRecordCount" : 8, + "nextRowId" : 0 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/README b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/README new file mode 100644 index 00000000..52eb2755 --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/README @@ -0,0 +1,16 @@ +f0:int (nullable), f1:long (nullable), f2:float (nullable), f3:double (nullable), f4:date (nullable), f5:string (nullable) +no partition key +no bucket key +bucket count: -1 +range-bitmap index: f0,f1,f2,f3,f4 + +Rows (snapshot-1): +Add: [17, 100L, 1.1f, 11.11, date('2024-01-17'), 'row0'] +Add: [3, 200L, 2.2f, 22.22, date('2024-01-03'), 'row1'] +Add: [5, 300L, 3.3f, 33.33, date('2024-01-05'), 'row2'] +Add: [7, 400L, 4.4f, 44.44, date('2024-01-07'), 'row3'] +Add: [9, 500L, 5.5f, 55.55, date('2024-01-09'), 'row4'] +Add: [null, null, null, null, null, null] +Add: [null, null, null, null, null, 'null_row'] +Add: [10, 600L, 6.6f, 66.66, date('2024-01-10'), 'row7'] +NoCompact \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet new file mode 100644 index 00000000..19fabdca Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet differ diff --git 
a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet.index b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet.index new file mode 100644 index 00000000..18ea5e60 Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet.index differ diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-75f07296-b729-48db-aadd-17826f0aadf9-0 b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-75f07296-b729-48db-aadd-17826f0aadf9-0 new file mode 100644 index 00000000..c9299ed9 Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-75f07296-b729-48db-aadd-17826f0aadf9-0 differ diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-0 b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-0 new file mode 100644 index 00000000..a22d455b Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-0 differ diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-1 b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-1 new file mode 100644 index 00000000..49e0cd75 Binary files /dev/null and 
b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-1 differ diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 new file mode 100644 index 00000000..cdeb25d1 --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 @@ -0,0 +1,38 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "f0", + "type" : "INT" + }, { + "id" : 1, + "name" : "f1", + "type" : "BIGINT" + }, { + "id" : 2, + "name" : "f2", + "type" : "FLOAT" + }, { + "id" : 3, + "name" : "f3", + "type" : "DOUBLE" + }, { + "id" : 4, + "name" : "f4", + "type" : "DATE" + }, { + "id" : 5, + "name" : "f5", + "type" : "STRING" + } ], + "highestFieldId" : 5, + "partitionKeys" : [ ], + "primaryKeys" : [ ], + "options" : { + "file-index.range-bitmap.columns" : "f0,f1,f2,f3,f4", + "owner" : "xiaoheng", + "file-index.in-manifest-threshold" : "1B" + }, + "timeMillis" : 1772163669686 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git 
a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 new file mode 100644 index 00000000..fee41027 --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-1", + "deltaManifestListSize" : 1108, + "commitUser" : "95859ce1-495d-4176-8f68-f7fbd595554c", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1772163672630, + "totalRecordCount" : 8, + "deltaRecordCount" : 8, + "nextRowId" : 0 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README new file mode 100644 index 00000000..f308976e --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README @@ -0,0 +1,17 @@ +f0:int (nullable), f1:long (nullable), f2:float (nullable), f3:double (nullable), f4:date (nullable), f5:string (nullable) +no partition key +no bucket key +bucket count: -1 +range-bitmap index: f0,f1,f2,f3,f4 +range-bitmap index chunk-size: 16B + +Rows (snapshot-1): +Add: [17, 100L, 1.1f, 11.11, date('2024-01-17'), 'row0'] +Add: [3, 200L, 2.2f, 22.22, date('2024-01-03'), 'row1'] +Add: [5, 300L, 3.3f, 33.33, date('2024-01-05'), 'row2'] +Add: [7, 400L, 4.4f, 44.44, date('2024-01-07'), 'row3'] +Add: [9, 500L, 5.5f, 55.55, date('2024-01-09'), 'row4'] +Add: [null, null, null, null, null, null] +Add: [null, null, null, null, null, 'null_row'] +Add: [10, 600L, 6.6f, 66.66, 
date('2024-01-10'), 'row7'] +NoCompact \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet new file mode 100644 index 00000000..67d9aea9 Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet differ diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet.index b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet.index new file mode 100644 index 00000000..343734e7 Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet.index differ diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-90122746-0b4c-4328-8a04-576ee6b4cb83-0 b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-90122746-0b4c-4328-8a04-576ee6b4cb83-0 new file mode 100644 index 00000000..37270a28 Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-90122746-0b4c-4328-8a04-576ee6b4cb83-0 differ diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-0 
b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-0 new file mode 100644 index 00000000..808abfa5 Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-0 differ diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-1 b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-1 new file mode 100644 index 00000000..d367884c Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-1 differ diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 new file mode 100644 index 00000000..d4ca2df4 --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 @@ -0,0 +1,43 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "f0", + "type" : "INT" + }, { + "id" : 1, + "name" : "f1", + "type" : "BIGINT" + }, { + "id" : 2, + "name" : "f2", + "type" : "FLOAT" + }, { + "id" : 3, + "name" : "f3", + "type" : "DOUBLE" + }, { + "id" : 4, + "name" : "f4", + "type" : "DATE" + }, { + "id" : 5, + "name" : "f5", + "type" : "STRING" + } ], + "highestFieldId" : 5, + "partitionKeys" : [ ], + "primaryKeys" : [ ], + "options" : { + "file-index.range-bitmap.columns" : "f0,f1,f2,f3,f4", + "owner" : "xiaoheng", + 
"file-index.range-bitmap.f1.chunk-size" : "16B", + "file-index.range-bitmap.f2.chunk-size" : "16B", + "file-index.range-bitmap.f3.chunk-size" : "16B", + "file-index.in-manifest-threshold" : "1B", + "file-index.range-bitmap.f4.chunk-size" : "16B", + "file-index.range-bitmap.f0.chunk-size" : "16B" + }, + "timeMillis" : 1772188209180 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 new file mode 100644 index 00000000..4e78d6b5 --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-1", 
+ "deltaManifestListSize" : 1108, + "commitUser" : "9385bcac-276c-4639-b825-52623beb2a6d", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1772188213862, + "totalRecordCount" : 8, + "deltaRecordCount" : 8, + "nextRowId" : 0 +} \ No newline at end of file