From b307c9765c6fa8d88d8c2fcf9de44eec5a7765d6 Mon Sep 17 00:00:00 2001 From: "yinzefeng.yzf" Date: Mon, 2 Feb 2026 22:25:18 +0800 Subject: [PATCH 01/28] add buffer pool & spsc_queue --- src/ailego/buffer/buffer_manager.cc | 1 + src/include/zvec/ailego/buffer/buffer_pool.h | 311 +++++++++++++++++++ 2 files changed, 312 insertions(+) create mode 100644 src/include/zvec/ailego/buffer/buffer_pool.h diff --git a/src/ailego/buffer/buffer_manager.cc b/src/ailego/buffer/buffer_manager.cc index ac2945b0..307e80ce 100644 --- a/src/ailego/buffer/buffer_manager.cc +++ b/src/ailego/buffer/buffer_manager.cc @@ -20,6 +20,7 @@ #include #include #include +#include #ifdef __clang__ #pragma clang diagnostic push diff --git a/src/include/zvec/ailego/buffer/buffer_pool.h b/src/include/zvec/ailego/buffer/buffer_pool.h new file mode 100644 index 00000000..5a09abfa --- /dev/null +++ b/src/include/zvec/ailego/buffer/buffer_pool.h @@ -0,0 +1,311 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using block_id_t = int; + +#define BLOCK_SIZE (4 * 1024 * 1024) // 2 MB +#define BLOCK_MASK (BLOCK_SIZE - 1) +#define BLOCK_ID(offset) (offset >> 22) +#define BLOCK_OFFSET(offset) (offset & BLOCK_MASK) + +class LRUCache { + boost::lockfree::spsc_queue> q; +}; + +class LPMap { + struct Entry { + std::atomic ref_count; + char* buffer; + }; + + public: + LPMap() : entry_num_(0), entries_(nullptr) {} + ~LPMap() { + delete[] entries_; + } + + void init(size_t entry_num) { + if (entries_) { + delete[] entries_; + } + entry_num_ = entry_num; + entries_ = new Entry[entry_num_]; + for (size_t i = 0; i < entry_num_; i++) { + // entries_[i].ref_count.store(0); + entries_[i].ref_count.store(std::numeric_limits::min()); + entries_[i].buffer = nullptr; + } + } + + char* acquire_block(block_id_t block_id) { + assert(block_id < entry_num_); + Entry& entry = entries_[block_id]; 
+ int rc = entry.ref_count.fetch_add(1); + if (rc < 0) { + return nullptr; + } + return entry.buffer; + } + + void release_block(block_id_t block_id) { + assert(block_id < entry_num_); + Entry& entry = entries_[block_id]; + int rc = entry.ref_count.fetch_sub(1); + assert(rc > 0); + } + + // need be called under lock + char* evict_block(block_id_t block_id) { + assert(block_id < entry_num_); + Entry& entry = entries_[block_id]; + int expected = 0; + if (entry.ref_count.compare_exchange_strong(expected, std::numeric_limits::min())) { + char* buffer = entry.buffer; + entry.buffer = nullptr; + return buffer; + } else { + return nullptr; + } + } + + // need be called under lock + char* set_block_acquired(block_id_t block_id, char* buffer) { + // std::cout << "Set block " << block_id << std::endl; + assert(block_id < entry_num_); + Entry& entry = entries_[block_id]; + if (entry.ref_count.load() >= 0) { + entry.ref_count.fetch_add(1); + return entry.buffer; + } + entry.buffer = buffer; + entry.ref_count.store(1); + return buffer; + } + + // need be called under lock + void recycle(std::queue& free_buffers) { + for (size_t i = 0; i < entry_num_; i++) { + Entry& entry = entries_[i]; + if (entry.ref_count.load() == 0) { + char* buffer = evict_block(i); + if (buffer) { + free_buffers.push(buffer); + } + } + } + } + + size_t entry_num() const { + return entry_num_; + } + + private: + Entry* entries_; + size_t entry_num_; +}; + +class BufferPool; + +struct BufferPoolHandle { + BufferPoolHandle(BufferPool& pool); + BufferPoolHandle(BufferPoolHandle&& other) : pool(other.pool), local_cache(std::move(other.local_cache)), hit_num_(other.hit_num_) { + other.local_cache.clear(); + other.hit_num_ = 0; + } + ~BufferPoolHandle(); + + char* get_block(size_t offset, size_t size); + + void release_all(); + + BufferPool& pool; +#ifdef USE_LOCAL_CACHE + // std::unordered_map local_cache; + phmap::flat_hash_map local_cache; +#else + std::vector local_cache; +#endif + int hit_num_; +}; + 
+class BufferPool { + public: + BufferPool(const std::string& filename, size_t pool_capacity) : pool_capacity_(pool_capacity){ + fd_ = open(filename.c_str(), O_RDONLY); + if (fd_ < 0) { + throw std::runtime_error("Failed to open file: " + filename); + } + struct stat st; + if (fstat(fd_, &st) < 0) { + throw std::runtime_error("Failed to stat file: " + filename); + } + file_size_ = st.st_size; + lp_map_.init((file_size_ + BLOCK_SIZE - 1) / BLOCK_SIZE); + + size_t buffer_num = pool_capacity_ / BLOCK_SIZE; + for (size_t i = 0; i < buffer_num; i++) { + char* buffer = (char*)aligned_alloc(64, BLOCK_SIZE); + free_buffers_.push(buffer); + } + std::cout << "buffer_num: " << buffer_num << std::endl; + std::cout << "entry_num: " << lp_map_.entry_num() << std::endl; + } + ~BufferPool() { + close(fd_); + } + + BufferPoolHandle get_handle() { + return BufferPoolHandle(*this); + } + + char* acquire_buffer(block_id_t block_id, int retry = 0) { + char* buffer = lp_map_.acquire_block(block_id); + if (buffer) { + return buffer; + } + { + std::lock_guard lock(mutex_); + if (free_buffers_.empty()) { + for (int i = 0; i < retry; i++) { + lp_map_.recycle(free_buffers_); + if (!free_buffers_.empty()) { + break; + } + } + } + if (free_buffers_.empty()) { + return nullptr; + } + buffer = free_buffers_.front(); + free_buffers_.pop(); + } + size_t read_offset = static_cast(block_id) * BLOCK_SIZE; + size_t to_read = std::min(BLOCK_SIZE, file_size_ - read_offset); + + ssize_t read_bytes = pread(fd_, buffer, to_read, read_offset); + if (read_bytes != static_cast(to_read)) { + std::cerr << "Failed to read file at offset " << read_offset << std::endl; + exit(-1); + } + + { + std::lock_guard lock(mutex_); + char* placed_buffer = lp_map_.set_block_acquired(block_id, buffer); + if (placed_buffer != buffer) { + // another thread has set the block + free_buffers_.push(buffer); + } + return placed_buffer; + } + } + + size_t file_size() const { + return file_size_; + } + + private: + int fd_; + size_t 
file_size_; + size_t pool_capacity_; + + public: + LPMap lp_map_; + + private: + std::mutex mutex_; + std::queue free_buffers_; +}; + + +struct Counter { + ~Counter() = default; + + static Counter& get_instance() { + static Counter instance; + return instance; + } + + void record(const std::string& name, int64_t value) { + auto it = static_counters.find(name); + if (it == static_counters.end()) { + auto counter = std::make_unique>(0); + it = static_counters.emplace(name, std::move(counter)).first; + } + it->second->fetch_add(value); + } + + void display() { + for (const auto& pair : static_counters) { + std::cout << pair.first << ": " << pair.second->load() << std::endl; + } + } + + void clear() { + static_counters.clear(); + } + + private: + Counter() {} + std::map>> static_counters; +}; + +BufferPoolHandle::BufferPoolHandle(BufferPool& pool) : pool(pool), hit_num_(0) {} +BufferPoolHandle::~BufferPoolHandle() { + Counter::get_instance().record("buffer_pool_handle_hit_num", hit_num_); + release_all(); +} + +char* BufferPoolHandle::get_block(size_t offset, size_t size) { + block_id_t block_id = BLOCK_ID(offset); + assert(block_id == BLOCK_ID(offset + size - 1)); +#ifdef USE_LOCAL_CACHE + auto it = local_cache.find(block_id); + if (it != local_cache.end()) { + hit_num_++; + return it->second + BLOCK_OFFSET(offset); + } +#endif + + char* buffer = pool.acquire_buffer(block_id, 3); + if (buffer) { +#ifdef USE_LOCAL_CACHE + local_cache[block_id] = buffer; +#else + local_cache.push_back(block_id); +#endif + return buffer + BLOCK_OFFSET(offset); + } + + return nullptr; +} + +void BufferPoolHandle::release_all() { +#ifdef USE_LOCAL_CACHE + Counter::get_instance().record("buffer_pool_handle_release_call", local_cache.size()); + for (const auto& pair : local_cache) { + pool.lp_map_.release_block(pair.first); + } +#else + for (block_id_t block_id : local_cache) { + pool.lp_map_.release_block(block_id); + } +#endif + local_cache.clear(); +} \ No newline at end of file From 
a96e684d276767ff4fc0c6019cf923d9f2707080 Mon Sep 17 00:00:00 2001 From: "yinzefeng.yzf" Date: Wed, 4 Feb 2026 16:09:11 +0800 Subject: [PATCH 02/28] add buffer pool & open buffer storage ut --- src/ailego/buffer/buffer_manager.cc | 1 - src/core/utility/buffer1_storage.cc | 438 ++ src/core/utility/buffer_storage.cc | 2 +- src/include/zvec/ailego/buffer/buffer_pool.h | 520 ++- .../zvec/ailego/buffer/concurrentqueue.h | 3747 +++++++++++++++++ ..._test.cpp => flat_streamer_buffer_test.cc} | 0 ....cpp => flat_streamer_buffer_time_test.cc} | 0 7 files changed, 4474 insertions(+), 234 deletions(-) create mode 100644 src/core/utility/buffer1_storage.cc create mode 100644 src/include/zvec/ailego/buffer/concurrentqueue.h rename tests/core/algorithm/flat/{flat_streamer_buffer_test.cpp => flat_streamer_buffer_test.cc} (100%) rename tests/core/algorithm/flat/{flat_streamer_buffer_time_test.cpp => flat_streamer_buffer_time_test.cc} (100%) diff --git a/src/ailego/buffer/buffer_manager.cc b/src/ailego/buffer/buffer_manager.cc index 307e80ce..ac2945b0 100644 --- a/src/ailego/buffer/buffer_manager.cc +++ b/src/ailego/buffer/buffer_manager.cc @@ -20,7 +20,6 @@ #include #include #include -#include #ifdef __clang__ #pragma clang diagnostic push diff --git a/src/core/utility/buffer1_storage.cc b/src/core/utility/buffer1_storage.cc new file mode 100644 index 00000000..0ea591d9 --- /dev/null +++ b/src/core/utility/buffer1_storage.cc @@ -0,0 +1,438 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +// #include +#include +#include +#include +#include +#include +#include "utility_params.h" + +#include + +namespace zvec { +namespace core { + +/*! MMap File Storage + */ +class Buffer1Storage : public IndexStorage { + public: + /*! Index Storage Segment + */ + class Segment : public IndexStorage::Segment, + public std::enable_shared_from_this { + public: + //! Index Storage Pointer + typedef std::shared_ptr Pointer; + + //! Constructor + Segment(Buffer1Storage *owner, IndexMapping::Segment *segment, size_t segment_id) + : segment_(segment), + owner_(owner), + segment_id_(segment_id), + capacity_(static_cast(segment->meta()->data_size + + segment->meta()->padding_size)) {} + + //! Destructor + virtual ~Segment(void) {} + + //! Retrieve size of data + size_t data_size(void) const override { + return static_cast(segment_->meta()->data_size); + } + + //! Retrieve crc of data + uint32_t data_crc(void) const override { + return segment_->meta()->data_crc; + } + + //! Retrieve size of padding + size_t padding_size(void) const override { + return static_cast(segment_->meta()->padding_size); + } + + //! Retrieve capacity of segment + size_t capacity(void) const override { + return capacity_; + } + + //! Fetch data from segment (with own buffer) + size_t fetch(size_t offset, void *buf, size_t len) const override { + if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { + auto meta = segment_->meta(); + if (offset > meta->data_size) { + offset = meta->data_size; + } + len = meta->data_size - offset; + } + memmove(buf, (const uint8_t *)(owner_->get_buffer(offset, len, segment_id_)) + offset, + len); + return len; + } + + //! 
Read data from segment + size_t read(size_t offset, const void **data, size_t len) override { + + if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { + auto meta = segment_->meta(); + if (offset > meta->data_size) { + offset = meta->data_size; + } + len = meta->data_size - offset; + } + size_t segment_offset = segment_->meta()->data_index + owner_->get_context_offset(); + *data = owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset; + return len; + } + + size_t read(size_t offset, MemoryBlock &data, size_t len) override { + if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { + auto meta = segment_->meta(); + if (offset > meta->data_size) { + offset = meta->data_size; + } + len = meta->data_size - offset; + } + size_t segment_offset = segment_->meta()->data_index + owner_->get_context_offset(); + data.reset(owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset); + if (data.data()) { + return len; + } else { + LOG_ERROR("read error."); + return -1; + } + } + + //! Write data into the storage with offset + size_t write(size_t /*offset*/, const void * /*data*/, + size_t len) override { + return len; + } + + //! Resize size of data + size_t resize(size_t /*size*/) override { + return 0; + } + + //! Update crc of data + void update_data_crc(uint32_t /*crc*/) override {} + + //! Clone the segment + IndexStorage::Segment::Pointer clone(void) override { + return shared_from_this(); + } + + private: + IndexMapping::Segment *segment_{}; + Buffer1Storage *owner_{nullptr}; + size_t capacity_{}; + size_t segment_id_{}; + }; + + //! Destructor + virtual ~Buffer1Storage(void) { + this->cleanup(); + } + + //! Initialize storage + int init(const ailego::Params & /*params*/) override { + return 0; + } + + //! Cleanup storage + int cleanup(void) override { + this->close_index(); + return 0; + } + + //! 
Open storage + int open(const std::string &path, bool /*create*/) override { + LOG_INFO("open buffer storage 1"); + file_name_ = path; + buffer_pool_ = std::make_unique(path, 10u * 1024 * 1024 * 1024, 2490368 * 2); + buffer_pool_handle_ = + std::make_unique(buffer_pool_->get_handle()); + int ret = ParseToMapping(); + LOG_ERROR("segment count: %lu, max_segment_size: %lu", segments_.size(), max_segment_size_); + if(ret != 0) { + return ret; + } + return 0; + } + + char *get_buffer(size_t offset, size_t length, size_t block_id) { + return buffer_pool_handle_->get_block(offset, length, block_id); + } + + int get_meta(size_t offset, size_t length, char *out) { + return buffer_pool_handle_->get_meta(offset, length, out); + } + + int ParseHeader(size_t offset) { + char *buffer = new char[sizeof(header_)]; + get_meta(offset, sizeof(header_), buffer); + uint8_t *header_ptr = reinterpret_cast(buffer); + memcpy(&header_, header_ptr, sizeof(header_)); + delete[] buffer; + if (header_.meta_header_size != sizeof(IndexFormat::MetaHeader)) { + LOG_ERROR("Header meta size is invalid."); + return IndexError_InvalidLength; + } + if (ailego::Crc32c::Hash(&header_, sizeof(header_), header_.header_crc) != + header_.header_crc) { + LOG_ERROR("Header meta checksum is invalid."); + return IndexError_InvalidChecksum; + } + return 0; + } + + int ParseFooter(size_t offset) { + char *buffer = new char[sizeof(footer_)]; + get_meta(offset, sizeof(footer_), buffer); + uint8_t *footer_ptr = reinterpret_cast(buffer); + memcpy(&footer_, footer_ptr, sizeof(footer_)); + delete[] buffer; + if (offset < (size_t)footer_.segments_meta_size) { + LOG_ERROR("Footer meta size is invalid."); + return IndexError_InvalidLength; + } + if (ailego::Crc32c::Hash(&footer_, sizeof(footer_), footer_.footer_crc) != + footer_.footer_crc) { + LOG_ERROR("Footer meta checksum is invalid."); + return IndexError_InvalidChecksum; + } + return 0; + } + + int ParseSegment(size_t offset) { + segment_buffer_ = 
std::make_unique(footer_.segments_meta_size); + get_meta(offset, footer_.segments_meta_size, segment_buffer_.get()); + if (ailego::Crc32c::Hash(segment_buffer_.get(), footer_.segments_meta_size, 0u) != + footer_.segments_meta_crc) { + LOG_ERROR("Index segments meta checksum is invalid."); + return IndexError_InvalidChecksum; + } + IndexFormat::SegmentMeta *segment_start = + reinterpret_cast(segment_buffer_.get()); + uint32_t segment_ids_offset = footer_.segments_meta_size; + for (IndexFormat::SegmentMeta *iter = segment_start, + *end = segment_start + footer_.segment_count; + iter != end; ++iter) { + if (iter->segment_id_offset > footer_.segments_meta_size) { + return IndexError_InvalidValue; + } + if (iter->data_index > footer_.content_size) { + return IndexError_InvalidValue; + } + if (iter->data_index + iter->data_size > footer_.content_size) { + return IndexError_InvalidLength; + } + + if (iter->segment_id_offset < segment_ids_offset) { + segment_ids_offset = iter->segment_id_offset; + } + id_hash_.emplace( + std::string(reinterpret_cast(segment_start) + + iter->segment_id_offset), + segments_.size()); + segments_.emplace( + std::string(reinterpret_cast(segment_start) + + iter->segment_id_offset), + iter); + max_segment_size_ = std::max(max_segment_size_, iter->data_size + iter->padding_size); + if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count > + footer_.segments_meta_size) { + return IndexError_InvalidLength; + } + } + return 0; + } + + int ParseToMapping() { + ParseHeader(0); + // Unpack footer + if (header_.meta_footer_size != sizeof(IndexFormat::MetaFooter)) { + return IndexError_InvalidLength; + } + if ((int32_t)header_.meta_footer_offset < 0) { + return IndexError_Unsupported; + } + size_t footer_offset = header_.meta_footer_offset; + ParseFooter(footer_offset); + + // Unpack segment table + if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count > + footer_.segments_meta_size) { + return IndexError_InvalidLength; + } + const size_t 
segment_start_offset = footer_offset - footer_.segments_meta_size; + ParseSegment(segment_start_offset); + return 0; + } + + //! Flush storage + int flush(void) override { + return this->flush_index(); + } + + //! Close storage + int close(void) override { + this->close_index(); + return 0; + } + + //! Append a segment into storage + int append(const std::string &id, size_t size) override { + return this->append_segment(id, size); + } + + //! Refresh meta information (checksum, update time, etc.) + void refresh(uint64_t chkp) override { + this->refresh_index(chkp); + } + + //! Retrieve check point of storage + uint64_t check_point(void) const override { + return footer_.check_point; + } + + //! Retrieve a segment by id + IndexStorage::Segment::Pointer get(const std::string &id, int) override { + IndexMapping::Segment *segment = this->get_segment(id); + if (!segment) { + return Buffer1Storage::Segment::Pointer(); + } + return std::make_shared(this, segment, + id_hash_[id]); + } + + //! Test if it a segment exists + bool has(const std::string &id) const override { + return this->has_segment(id); + } + + //! Retrieve magic number of index + uint32_t magic(void) const override { + return header_.magic; + } + + uint32_t get_context_offset() { + return header_.content_offset; + } + + protected: + //! 
Initialize index version segment + int init_version_segment(void) { + size_t data_size = std::strlen(IndexVersion::Details()); + int error_code = + this->append_segment(INDEX_VERSION_SEGMENT_NAME, data_size); + if (error_code != 0) { + return error_code; + } + + IndexMapping::Segment *segment = get_segment(INDEX_VERSION_SEGMENT_NAME); + if (!segment) { + return IndexError_MMapFile; + } + auto meta = segment->meta(); + size_t capacity = static_cast(meta->padding_size + meta->data_size); + memcpy(segment->data(), IndexVersion::Details(), data_size); + segment->set_dirty(); + meta->data_crc = ailego::Crc32c::Hash(segment->data(), data_size, 0); + meta->data_size = data_size; + meta->padding_size = capacity - data_size; + return 0; + } + + //! Initialize index file + int init_index(const std::string &path) { + // Add index version + int error_code = this->init_version_segment(); + if (error_code != 0) { + return error_code; + } + + // Refresh mapping + this->refresh_index(0); + return 0; + } + + //! Set the index file as dirty + void set_as_dirty(void) { + index_dirty_ = true; + } + + //! Refresh meta information (checksum, update time, etc.) + void refresh_index(uint64_t /*chkp*/) {} + + //! Flush index storage + int flush_index(void) { + return 0; + } + + //! Close index storage + void close_index(void) { + std::lock_guard latch(mapping_mutex_); + file_name_.clear(); + segments_.clear(); + memset(&header_, 0, sizeof(header_)); + memset(&footer_, 0, sizeof(footer_)); + segment_buffer_.release(); + } + + //! Append a segment into storage + int append_segment(const std::string & /*id*/, size_t /*size*/) { + return 0; + } + + //! Test if a segment exists + bool has_segment(const std::string &id) const { + std::lock_guard latch(mapping_mutex_); + return (segments_.find(id) != segments_.end()); + } + + //! 
Get a segment from storage + IndexMapping::Segment *get_segment(const std::string &id) { + std::lock_guard latch(mapping_mutex_); + auto iter = segments_.find(id); + if (iter == segments_.end()) { + return nullptr; + } + IndexMapping::Segment *item = &iter->second; + return item; + } + + private: + bool index_dirty_{false}; + mutable std::mutex mapping_mutex_{}; + + // buffer manager + std::string file_name_; + IndexFormat::MetaHeader header_; + IndexFormat::MetaFooter footer_; + std::map segments_{}; + std::map id_hash_{}; + size_t max_segment_size_{0}; + std::unique_ptr segment_buffer_{nullptr}; + + std::unique_ptr buffer_pool_{nullptr}; + std::unique_ptr buffer_pool_handle_{nullptr}; +}; + +INDEX_FACTORY_REGISTER_STORAGE_ALIAS(BufferStorage, Buffer1Storage); + +} // namespace core +} // namespace zvec \ No newline at end of file diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 4ac3c6b3..d4b23c87 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -436,7 +436,7 @@ class BufferStorage : public IndexStorage { std::map segments_{}; }; -INDEX_FACTORY_REGISTER_STORAGE(BufferStorage); +// INDEX_FACTORY_REGISTER_STORAGE(BufferStorage); } // namespace core } // namespace zvec diff --git a/src/include/zvec/ailego/buffer/buffer_pool.h b/src/include/zvec/ailego/buffer/buffer_pool.h index 5a09abfa..d86cffec 100644 --- a/src/include/zvec/ailego/buffer/buffer_pool.h +++ b/src/include/zvec/ailego/buffer/buffer_pool.h @@ -1,311 +1,367 @@ #pragma once +#include +#include +#include #include #include #include #include #include +#include +#include +#include #include #include +#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include "concurrentqueue.h" using block_id_t = int; - -#define BLOCK_SIZE (4 * 1024 * 1024) // 2 MB -#define BLOCK_MASK (BLOCK_SIZE - 1) -#define BLOCK_ID(offset) (offset >> 22) -#define BLOCK_OFFSET(offset) (offset & 
BLOCK_MASK) +using version_t = int; class LRUCache { - boost::lockfree::spsc_queue> q; -}; - -class LPMap { - struct Entry { - std::atomic ref_count; - char* buffer; - }; - public: - LPMap() : entry_num_(0), entries_(nullptr) {} - ~LPMap() { - delete[] entries_; + typedef std::pair BlockType; + typedef moodycamel::ConcurrentQueue ConcurrentQueue; + + int init(size_t block_size) { + for(int i = 0; i < CATCH_QUEUE_NUM; i++) { + queues_.push_back(ConcurrentQueue(block_size)); + } + return 0; } - void init(size_t entry_num) { - if (entries_) { - delete[] entries_; - } - entry_num_ = entry_num; - entries_ = new Entry[entry_num_]; - for (size_t i = 0; i < entry_num_; i++) { - // entries_[i].ref_count.store(0); - entries_[i].ref_count.store(std::numeric_limits::min()); - entries_[i].buffer = nullptr; + BlockType evict_single_block() { + BlockType item; + for(int i = 0; i < CATCH_QUEUE_NUM; i++) { + bool found = queues_[i].try_dequeue(item); + if(found) { + break; } + } + return item; } - char* acquire_block(block_id_t block_id) { - assert(block_id < entry_num_); - Entry& entry = entries_[block_id]; - int rc = entry.ref_count.fetch_add(1); - if (rc < 0) { - return nullptr; - } - return entry.buffer; + bool add_single_block(const BlockType &block, int block_type) { + std::cout << "in LRU: " << block.first << ", " << block.second << std::endl; + return queues_[block_type].try_enqueue(block); } - void release_block(block_id_t block_id) { - assert(block_id < entry_num_); - Entry& entry = entries_[block_id]; - int rc = entry.ref_count.fetch_sub(1); - assert(rc > 0); - } + private: + constexpr static size_t CATCH_QUEUE_NUM = 3; + std::vector queues_; +}; - // need be called under lock - char* evict_block(block_id_t block_id) { - assert(block_id < entry_num_); - Entry& entry = entries_[block_id]; - int expected = 0; - if (entry.ref_count.compare_exchange_strong(expected, std::numeric_limits::min())) { - char* buffer = entry.buffer; - entry.buffer = nullptr; - return buffer; - } 
else { - return nullptr; - } +class LPMap { + struct Entry { + std::atomic ref_count; + std::atomic load_count; + char *buffer; + }; + + public: + LPMap() : entry_num_(0), entries_(nullptr) {} + ~LPMap() { + delete[] entries_; + } + + void init(size_t entry_num) { + if (entries_) { + delete[] entries_; } - - // need be called under lock - char* set_block_acquired(block_id_t block_id, char* buffer) { - // std::cout << "Set block " << block_id << std::endl; - assert(block_id < entry_num_); - Entry& entry = entries_[block_id]; - if (entry.ref_count.load() >= 0) { - entry.ref_count.fetch_add(1); - return entry.buffer; - } - entry.buffer = buffer; - entry.ref_count.store(1); - return buffer; + entry_num_ = entry_num; + entries_ = new Entry[entry_num_]; + for (size_t i = 0; i < entry_num_; i++) { + entries_[i].ref_count.store(std::numeric_limits::min()); + entries_[i].load_count.store(0); + entries_[i].buffer = nullptr; } - - // need be called under lock - void recycle(std::queue& free_buffers) { - for (size_t i = 0; i < entry_num_; i++) { - Entry& entry = entries_[i]; - if (entry.ref_count.load() == 0) { - char* buffer = evict_block(i); - if (buffer) { - free_buffers.push(buffer); - } - } - } + } + + char *acquire_block(block_id_t block_id) { + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + int rc = entry.ref_count.fetch_add(1); + if (rc < 0) { + return nullptr; } - - size_t entry_num() const { - return entry_num_; + return entry.buffer; + } + + void release_block(block_id_t block_id) { + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + int rc = entry.ref_count.fetch_sub(1); + assert(rc >= 0); + if(rc == 0) { + LRUCache::BlockType block; + block.first = block_id; + block.second = entry.load_count.load(); + cache_.add_single_block(block, 0); + } + } + + // need be called under lock + char *evict_block(block_id_t block_id) { + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + int expected = 0; + if 
(entry.ref_count.compare_exchange_strong( + expected, std::numeric_limits::min())) { + char *buffer = entry.buffer; + entry.buffer = nullptr; + return buffer; + } else { + return nullptr; } + } + + // need be called under lock + char *set_block_acquired(block_id_t block_id, char *buffer) { + // std::cout << "Set block " << block_id << std::endl; + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + if (entry.ref_count.load() >= 0) { + entry.ref_count.fetch_add(1); + return entry.buffer; + } + entry.buffer = buffer; + entry.ref_count.store(1); + entry.load_count.fetch_add(1); + return buffer; + } + + // need be called under lock + void recycle(std::queue &free_buffers) { + LRUCache::BlockType block; + do { + block = cache_.evict_single_block(); + } while(isDeadBlock(block)); + char *buffer = evict_block(block.first); + if (buffer) { + free_buffers.push(buffer); + } + } - private: - Entry* entries_; - size_t entry_num_; + size_t entry_num() const { + return entry_num_; + } + + private: + Entry *entries_; + size_t entry_num_; + LRUCache cache_; + + bool isDeadBlock(LRUCache::BlockType block) { + Entry &entry = entries_[block.first]; + return block.second == entry.load_count.load(); + } }; -class BufferPool; +class VecBufferPool; -struct BufferPoolHandle { - BufferPoolHandle(BufferPool& pool); - BufferPoolHandle(BufferPoolHandle&& other) : pool(other.pool), local_cache(std::move(other.local_cache)), hit_num_(other.hit_num_) { - other.local_cache.clear(); - other.hit_num_ = 0; - } - ~BufferPoolHandle(); +struct VecBufferPoolHandle { + VecBufferPoolHandle(VecBufferPool &pool); + VecBufferPoolHandle(VecBufferPoolHandle &&other) + : pool(other.pool), + local_cache(std::move(other.local_cache)), + hit_num_(other.hit_num_) { + other.local_cache.clear(); + other.hit_num_ = 0; + } + ~VecBufferPoolHandle(); - char* get_block(size_t offset, size_t size); + char *get_block(size_t offset, size_t size, size_t block_id); - void release_all(); + int get_meta(size_t 
offset, size_t length, char *buffer); - BufferPool& pool; + void release_all(); + + VecBufferPool &pool; #ifdef USE_LOCAL_CACHE - // std::unordered_map local_cache; - phmap::flat_hash_map local_cache; + // std::unordered_map local_cache; + phmap::flat_hash_map local_cache; #else - std::vector local_cache; + std::vector local_cache; #endif - int hit_num_; + int hit_num_; }; -class BufferPool { - public: - BufferPool(const std::string& filename, size_t pool_capacity) : pool_capacity_(pool_capacity){ - fd_ = open(filename.c_str(), O_RDONLY); - if (fd_ < 0) { - throw std::runtime_error("Failed to open file: " + filename); - } - struct stat st; - if (fstat(fd_, &st) < 0) { - throw std::runtime_error("Failed to stat file: " + filename); - } - file_size_ = st.st_size; - lp_map_.init((file_size_ + BLOCK_SIZE - 1) / BLOCK_SIZE); - - size_t buffer_num = pool_capacity_ / BLOCK_SIZE; - for (size_t i = 0; i < buffer_num; i++) { - char* buffer = (char*)aligned_alloc(64, BLOCK_SIZE); - free_buffers_.push(buffer); - } - std::cout << "buffer_num: " << buffer_num << std::endl; - std::cout << "entry_num: " << lp_map_.entry_num() << std::endl; +class VecBufferPool { + public: + VecBufferPool(const std::string &filename, size_t pool_capacity, size_t block_size) + : pool_capacity_(pool_capacity) { + fd_ = open(filename.c_str(), O_RDONLY); + if (fd_ < 0) { + throw std::runtime_error("Failed to open file: " + filename); } - ~BufferPool() { - close(fd_); + struct stat st; + if (fstat(fd_, &st) < 0) { + throw std::runtime_error("Failed to stat file: " + filename); } + file_size_ = st.st_size; - BufferPoolHandle get_handle() { - return BufferPoolHandle(*this); + size_t buffer_num = pool_capacity_ / block_size; + lp_map_.init(buffer_num); + for (size_t i = 0; i < buffer_num; i++) { + char *buffer = (char *)aligned_alloc(64, block_size); + free_buffers_.push(buffer); } - - char* acquire_buffer(block_id_t block_id, int retry = 0) { - char* buffer = lp_map_.acquire_block(block_id); - if (buffer) 
{ - return buffer; - } - { - std::lock_guard lock(mutex_); - if (free_buffers_.empty()) { - for (int i = 0; i < retry; i++) { - lp_map_.recycle(free_buffers_); - if (!free_buffers_.empty()) { - break; - } - } - } - if (free_buffers_.empty()) { - return nullptr; - } - buffer = free_buffers_.front(); - free_buffers_.pop(); + std::cout << "buffer_num: " << buffer_num << std::endl; + std::cout << "entry_num: " << lp_map_.entry_num() << std::endl; + } + ~VecBufferPool() { + close(fd_); + } + + VecBufferPoolHandle get_handle() { + return VecBufferPoolHandle(*this); + } + + char *acquire_buffer(block_id_t block_id, size_t offset, size_t size, int retry = 0) { + char *buffer = lp_map_.acquire_block(block_id); + if (buffer) { + return buffer; + } + { + std::lock_guard lock(mutex_); + if (free_buffers_.empty()) { + for (int i = 0; i < retry; i++) { + lp_map_.recycle(free_buffers_); + if (!free_buffers_.empty()) { + break; + } } - size_t read_offset = static_cast(block_id) * BLOCK_SIZE; - size_t to_read = std::min(BLOCK_SIZE, file_size_ - read_offset); + } + if (free_buffers_.empty()) { + return nullptr; + } + buffer = free_buffers_.front(); + free_buffers_.pop(); + } - ssize_t read_bytes = pread(fd_, buffer, to_read, read_offset); - if (read_bytes != static_cast(to_read)) { - std::cerr << "Failed to read file at offset " << read_offset << std::endl; - exit(-1); - } + ssize_t read_bytes = pread(fd_, buffer, size, offset); + if (read_bytes != static_cast(size)) { + std::cerr << "Failed to read file at offset " << offset << std::endl; + exit(-1); + } - { - std::lock_guard lock(mutex_); - char* placed_buffer = lp_map_.set_block_acquired(block_id, buffer); - if (placed_buffer != buffer) { - // another thread has set the block - free_buffers_.push(buffer); - } - return placed_buffer; - } + { + std::lock_guard lock(mutex_); + char *placed_buffer = lp_map_.set_block_acquired(block_id, buffer); + if (placed_buffer != buffer) { + // another thread has set the block + 
free_buffers_.push(buffer); + } + return placed_buffer; } + } - size_t file_size() const { - return file_size_; + int get_meta(size_t offset, size_t length, char *buffer) { + ssize_t read_bytes = pread(fd_, buffer, length, offset); + if (read_bytes != static_cast(length)) { + std::cerr << "Failed to read file at offset " << offset << std::endl; + exit(-1); } + return 0; + } - private: - int fd_; - size_t file_size_; - size_t pool_capacity_; + size_t file_size() const { + return file_size_; + } - public: - LPMap lp_map_; + private: + int fd_; + size_t file_size_; + size_t pool_capacity_; - private: - std::mutex mutex_; - std::queue free_buffers_; + public: + LPMap lp_map_; + + private: + std::mutex mutex_; + std::queue free_buffers_; }; struct Counter { - ~Counter() = default; - - static Counter& get_instance() { - static Counter instance; - return instance; + ~Counter() = default; + + static Counter &get_instance() { + static Counter instance; + return instance; + } + + void record(const std::string &name, int64_t value) { + auto it = static_counters.find(name); + if (it == static_counters.end()) { + auto counter = std::make_unique>(0); + it = static_counters.emplace(name, std::move(counter)).first; } + it->second->fetch_add(value); + } - void record(const std::string& name, int64_t value) { - auto it = static_counters.find(name); - if (it == static_counters.end()) { - auto counter = std::make_unique>(0); - it = static_counters.emplace(name, std::move(counter)).first; - } - it->second->fetch_add(value); + void display() { + for (const auto &pair : static_counters) { + std::cout << pair.first << ": " << pair.second->load() << std::endl; } + } - void display() { - for (const auto& pair : static_counters) { - std::cout << pair.first << ": " << pair.second->load() << std::endl; - } - } - - void clear() { - static_counters.clear(); - } + void clear() { + static_counters.clear(); + } - private: - Counter() {} - std::map>> static_counters; + private: + Counter() {} + 
std::map>> static_counters; }; -BufferPoolHandle::BufferPoolHandle(BufferPool& pool) : pool(pool), hit_num_(0) {} -BufferPoolHandle::~BufferPoolHandle() { - Counter::get_instance().record("buffer_pool_handle_hit_num", hit_num_); - release_all(); +VecBufferPoolHandle::VecBufferPoolHandle(VecBufferPool &pool) + : pool(pool), hit_num_(0) {} +VecBufferPoolHandle::~VecBufferPoolHandle() { + Counter::get_instance().record("buffer_pool_handle_hit_num", hit_num_); + release_all(); } -char* BufferPoolHandle::get_block(size_t offset, size_t size) { - block_id_t block_id = BLOCK_ID(offset); - assert(block_id == BLOCK_ID(offset + size - 1)); +char *VecBufferPoolHandle::get_block(size_t offset, size_t size, size_t block_id) { #ifdef USE_LOCAL_CACHE - auto it = local_cache.find(block_id); - if (it != local_cache.end()) { - hit_num_++; - return it->second + BLOCK_OFFSET(offset); - } + auto it = local_cache.find(block_id); + if (it != local_cache.end()) { + hit_num_++; + return it->second; + } #endif - char* buffer = pool.acquire_buffer(block_id, 3); - if (buffer) { + char *buffer = pool.acquire_buffer(block_id, offset, size, 3); + if (buffer) { #ifdef USE_LOCAL_CACHE - local_cache[block_id] = buffer; + local_cache[block_id] = buffer; #else - local_cache.push_back(block_id); + local_cache.push_back(block_id); #endif - return buffer + BLOCK_OFFSET(offset); - } + return buffer; + } - return nullptr; + return nullptr; } -void BufferPoolHandle::release_all() { +int VecBufferPoolHandle::get_meta(size_t offset, size_t length, char *out) { + return pool.get_meta(offset, length, out); +} + +void VecBufferPoolHandle::release_all() { #ifdef USE_LOCAL_CACHE - Counter::get_instance().record("buffer_pool_handle_release_call", local_cache.size()); - for (const auto& pair : local_cache) { - pool.lp_map_.release_block(pair.first); - } + Counter::get_instance().record("buffer_pool_handle_release_call", + local_cache.size()); + for (const auto &pair : local_cache) { + 
pool.lp_map_.release_block(pair.first); + } #else - for (block_id_t block_id : local_cache) { - pool.lp_map_.release_block(block_id); - } + for (block_id_t block_id : local_cache) { + pool.lp_map_.release_block(block_id); + } #endif - local_cache.clear(); + local_cache.clear(); } \ No newline at end of file diff --git a/src/include/zvec/ailego/buffer/concurrentqueue.h b/src/include/zvec/ailego/buffer/concurrentqueue.h new file mode 100644 index 00000000..db4835b1 --- /dev/null +++ b/src/include/zvec/ailego/buffer/concurrentqueue.h @@ -0,0 +1,3747 @@ +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. +// An overview, including benchmark results, is provided here: +// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ +// The full design is also described in excruciating detail at: +// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue + +// Simplified BSD license: +// Copyright (c) 2013-2020, Cameron Desrochers. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this list of +// conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this list of +// conditions and the following disclaimer in the documentation and/or other materials +// provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL +// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Also dual-licensed under the Boost Software License (see LICENSE.md) + +#pragma once + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings +// upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" + +#ifdef MCDBGQ_USE_RELACY +#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" +#endif +#endif + +#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) +// VS2019 with /W4 warns about constant conditional expressions but unless /std=c++17 or higher +// does not support `if constexpr`, so we have no choice but to simply disable the warning +#pragma warning(push) +#pragma warning(disable: 4127) // conditional expression is constant +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#ifdef MCDBGQ_USE_RELACY +#include "relacy/relacy_std.hpp" +#include "relacy_shims.h" +// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations. +// We'll override the default trait malloc ourselves without a macro. +#undef new +#undef delete +#undef malloc +#undef free +#else +#include // Requires C++11. Sorry VS2010. 
+#include +#endif +#include // for max_align_t +#include +#include +#include +#include +#include +#include +#include // for CHAR_BIT +#include +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading +#include // used for thread exit synchronization + +// Platform-specific definitions of a numeric thread ID type and an invalid value +namespace moodycamel { namespace details { + template struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const& x) { return x; } + }; +} } +#if defined(MCDBGQ_USE_RELACY) +namespace moodycamel { namespace details { + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; + static inline thread_id_t thread_id() { return rl::thread_index(); } +} } +#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) +// No sense pulling in windows.h in a header, we'll manually declare the function +// we use and rely on backwards-compatibility for this not to break +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); +namespace moodycamel { namespace details { + static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. 
+ static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } +} } +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || defined(MOODYCAMEL_NO_THREAD_LOCAL) +namespace moodycamel { namespace details { + static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); + + typedef std::thread::id thread_id_t; + static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + + // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's + // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't + // be. + static inline thread_id_t thread_id() { return std::this_thread::get_id(); } + + template struct thread_id_size { }; + template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; + template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; + + template<> struct thread_id_converter { + typedef thread_id_size::numeric_t thread_id_numeric_size_t; +#ifndef __APPLE__ + typedef std::size_t thread_id_hash_t; +#else + typedef thread_id_numeric_size_t thread_id_hash_t; +#endif + + static thread_id_hash_t prehash(thread_id_t const& x) + { +#ifndef __APPLE__ + return std::hash()(x); +#else + return *reinterpret_cast(&x); +#endif + } + }; +} } +#else +// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 +// In order to get a numeric thread ID in a platform-independent way, we use a thread-local +// static variable's address as a thread identifier :-) +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +#define MOODYCAMEL_THREADLOCAL __thread +#elif defined(_MSC_VER) +#define MOODYCAMEL_THREADLOCAL __declspec(thread) +#else +// Assume C++11 compliant compiler +#define MOODYCAMEL_THREADLOCAL thread_local +#endif +namespace moodycamel { namespace details { + typedef 
std::uintptr_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr + static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. + inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } +} } +#endif + +// Constexpr if +#ifndef MOODYCAMEL_CONSTEXPR_IF +#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || __cplusplus > 201402L +#define MOODYCAMEL_CONSTEXPR_IF if constexpr +#define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]] +#else +#define MOODYCAMEL_CONSTEXPR_IF if +#define MOODYCAMEL_MAYBE_UNUSED +#endif +#endif + +// Exceptions +#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#define MOODYCAMEL_EXCEPTIONS_ENABLED +#endif +#endif +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw (expr) +#else +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true) +#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF (false) +#define MOODYCAMEL_RETHROW +#define MOODYCAMEL_THROW(expr) +#endif + +#ifndef MOODYCAMEL_NOEXCEPT +#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) +#define MOODYCAMEL_NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 +// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( +// We have to assume *all* non-trivial constructors may throw on VS2012! +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? 
std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#else +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) +#endif +#endif + +#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#else +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 +// g++ <=4.7 doesn't support thread_local either. 
+// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__) +// Assume `thread_local` is fully supported in all other C++11 compilers/platforms +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; years ago several users report having problems with it on +#endif +#endif +#endif + +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. +#ifndef MOODYCAMEL_DELETE_FUNCTION +#if defined(_MSC_VER) && _MSC_VER < 1800 +#define MOODYCAMEL_DELETE_FUNCTION +#else +#define MOODYCAMEL_DELETE_FUNCTION = delete +#endif +#endif + +namespace moodycamel { namespace details { +#ifndef MOODYCAMEL_ALIGNAS +// VS2013 doesn't support alignas or alignof, and align() requires a constant literal +#if defined(_MSC_VER) && _MSC_VER <= 1800 +#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) +#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned::value, T>::type + template struct Vs2013Aligned { }; // default, unsupported alignment + template struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; }; + template struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; }; + template struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; }; + template struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; }; + template struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; }; + template struct 
Vs2013Aligned<32, T> { typedef __declspec(align(32)) T type; }; + template struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; }; + template struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; }; + template struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; }; +#else + template struct identity { typedef T type; }; +#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) +#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity::type +#endif +#endif +} } + + +// TSAN can false report races in lock-free code. To enable TSAN to be used from projects that use this one, +// we can apply per-function compile-time suppression. +// See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer +#define MOODYCAMEL_NO_TSAN +#if defined(__has_feature) + #if __has_feature(thread_sanitizer) + #undef MOODYCAMEL_NO_TSAN + #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) + #endif // TSAN +#endif // TSAN + +// Compiler-specific likely/unlikely hints +namespace moodycamel { namespace details { +#if defined(__GNUC__) + static inline bool (likely)(bool x) { return __builtin_expect((x), true); } + static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } +#else + static inline bool (likely)(bool x) { return x; } + static inline bool (unlikely)(bool x) { return x; } +#endif +} } + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG +#include "internal/concurrentqueue_internal_debug.h" +#endif + +namespace moodycamel { +namespace details { + template + struct const_numeric_max { + static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits::is_signed + ? 
(static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) + : static_cast(-1); + }; + +#if defined(__GLIBCXX__) + typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +#else + typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +#endif + + // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting + // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. + typedef union { + std_max_align_t x; + long long y; + void* z; + } max_align_t; +} + +// Default traits for the ConcurrentQueue. To change some of the +// traits without re-implementing all of them, inherit from this +// struct and shadow the declarations you wish to be different; +// since the traits are used as a template type parameter, the +// shadowed declarations will be used where defined, and the defaults +// otherwise. +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. 
+ typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit producers. + // Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit production + // (using the enqueue methods without an explicit producer token) is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. 
+ static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. Note + // that this limit is enforced at the block level (for performance reasons), i.e. + // it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; + + // The number of times to spin before sleeping when waiting on a semaphore. + // Recommended values are on the order of 1000-10000 unless the number of + // consumer threads exceeds the number of idle cores (in which case try 0-100). + // Only affects instances of the BlockingConcurrentQueue. + static const int MAX_SEMA_SPINS = 10000; + + // Whether to recycle dynamically-allocated blocks into an internal free list or + // not. If false, only pre-allocated blocks (controlled by the constructor + // arguments) will be recycled, and all others will be `free`d back to the heap. + // Note that blocks consumed by explicit producers are only freed on destruction + // of the queue (not following destruction of the token) regardless of this trait. + static const bool RECYCLE_ALLOCATED_BLOCKS = false; + + +#ifndef MCDBGQ_USE_RELACY + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. +#if defined(malloc) || defined(free) + // Gah, this is 2015, stop defining macros that break standard code already! 
+ // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } +#else + static inline void* malloc(size_t size) { return std::malloc(size); } + static inline void free(void* ptr) { return std::free(ptr); } +#endif +#else + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } + static inline void free(void* ptr) { return rl::rl_free(ptr, $); } +#endif +}; + + +// When producing or consuming many elements, the most efficient way is to: +// 1) Use one of the bulk-operation methods of the queue with a token +// 2) Failing that, use the bulk-operation methods without a token +// 3) Failing that, create a token and use that with the single-item methods +// 4) Failing that, use the single-parameter methods of the queue +// Having said that, don't create tokens willy-nilly -- ideally there should be +// a maximum of one token per thread (of each kind). 
+struct ProducerToken; +struct ConsumerToken; + +template class ConcurrentQueue; +template class BlockingConcurrentQueue; +class ConcurrentQueueTests; + + +namespace details +{ + struct ConcurrentQueueProducerTypelessBase + { + ConcurrentQueueProducerTypelessBase* next; + std::atomic inactive; + ProducerToken* token; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) + { + } + }; + + template struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) + { + // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is propagate that + // uniqueness evenly across all the bits, so that we can use a subset of the bits while + // reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } + }; + template<> struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } + }; + template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; + + static inline size_t hash_thread_id(thread_id_t id) + { + static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( + thread_id_converter::prehash(id))); + } + + template + static inline bool circular_less_than(T a, T b) + { + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast(a - b) > static_cast(static_cast(1) << (static_cast(sizeof(T) * CHAR_BIT - 1))); + // Note: extra parens around rhs of operator<< is MSVC bug: https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 + // silencing 
the bug requires #pragma warning(disable: 4554) around the calling code and has no effect when done here. + } + + template + static inline char* align_for(char* ptr) + { + const std::size_t alignment = std::alignment_of::value; + return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; + } + + template + static inline T ceil_to_pow_2(T x) + { + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; + } + + template + static inline void swap_relaxed(std::atomic& left, std::atomic& right) + { + T temp = left.load(std::memory_order_relaxed); + left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); + right.store(temp, std::memory_order_relaxed); + } + + template + static inline T const& nomove(T const& x) + { + return x; + } + + template + struct nomove_if + { + template + static inline T const& eval(T const& x) + { + return x; + } + }; + + template<> + struct nomove_if + { + template + static inline auto eval(U&& x) + -> decltype(std::forward(x)) + { + return std::forward(x); + } + }; + + template + static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) + { + return *it; + } + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + template struct is_trivially_destructible : std::is_trivially_destructible { }; +#else + template struct is_trivially_destructible : std::has_trivial_destructor { }; +#endif + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY + typedef RelacyThreadExitListener ThreadExitListener; + typedef RelacyThreadExitNotifier ThreadExitNotifier; +#else + class ThreadExitNotifier; + + struct 
ThreadExitListener + { + typedef void (*callback_t)(void*); + callback_t callback; + void* userData; + + ThreadExitListener* next; // reserved for use by the ThreadExitNotifier + ThreadExitNotifier* chain; // reserved for use by the ThreadExitNotifier + }; + + class ThreadExitNotifier + { + public: + static void subscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + std::lock_guard guard(mutex()); + listener->next = tlsInst.tail; + listener->chain = &tlsInst; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener* listener) + { + std::lock_guard guard(mutex()); + if (!listener->chain) { + return; // race with ~ThreadExitNotifier + } + auto& tlsInst = *listener->chain; + listener->chain = nullptr; + ThreadExitListener** prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } + + private: + ThreadExitNotifier() : tail(nullptr) { } + ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() + { + // This thread is about to exit, let everyone know! + assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + std::lock_guard guard(mutex()); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->chain = nullptr; + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier& instance() + { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + static inline std::mutex& mutex() + { + // Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called + static std::mutex mutex; + return mutex; + } + + private: + ThreadExitListener* tail; + }; +#endif +#endif + + template struct static_is_lock_free_num { enum { value = 0 }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; + template struct static_is_lock_free : static_is_lock_free_num::type> { }; + template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; + template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +} + + +struct ProducerToken +{ + template + explicit ProducerToken(ConcurrentQueue& queue); + + template + explicit ProducerToken(BlockingConcurrentQueue& queue); + + ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if 
(producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. + inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() + { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +protected: + details::ConcurrentQueueProducerTypelessBase* producer; +}; + + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue& q); + + template + explicit ConsumerToken(BlockingConcurrentQueue& q); + + ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + 
std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + details::ConcurrentQueueProducerTypelessBase* desiredProducer; +}; + +// Need to forward-declare this swap because it's in a namespace. +// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; + + +template +class ConcurrentQueue +{ +public: + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4307) // + integral constant overflow (that's 
what the ternary expression is for!) +#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#endif + static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit 
enqueueing)"); + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. 
+ ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. + ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + 
destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). + ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.explicitProducers.store(nullptr, 
std::memory_order_relaxed); + implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + ConcurrentQueue& swap_internal(ConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); +#endif + + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. 
Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. 
+ // Thread-safe. + template + bool enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. 
Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U& item) + { + // Instead of simply trying each producer in turn (which could cause needless contention on the first + // producer), we score them heuristically. 
+ size_t nonEmptyCount = 0; + ProducerBase* best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // This differs from the try_dequeue(item) method in that this one does + // not attempt to reduce contention by interleaving the order that producer + // streams are dequeued from. So, using this method can reduce overall throughput + // under contention, but will give more predictable results in single-threaded + // consumer scenarios. This is mostly only useful for internal unit tests. + // Never allocates. Thread-safe. + template + bool try_dequeue_non_interleaved(U& item) + { + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->dequeue(item)) { + return true; + } + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
+ template + bool try_dequeue(consumer_token_t& token, U& item) + { + // The idea is roughly as follows: + // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less + // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place + // If there's no items where you're supposed to be, keep moving until you find a producer with some items + // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it + + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return false; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (static_cast(token.currentProducer)->dequeue(item)) { + if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return true; + } + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + if (ptr->dequeue(item)) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 1; + return true; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. 
+ // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + count += ptr->dequeue_bulk(itemFirst, max - count); + if (count == max) { + break; + } + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = 
ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) + { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner queue. + // Returns the number of items actually dequeued. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns 0 if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) + { + return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. 
+ size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. + static constexpr bool is_lock_free() + { + return + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; + } + + +private: + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const& token, U&& element) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue(U&& element) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? 
false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. 
Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) { } + FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! 
+ add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. 
to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). + auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_acq_rel) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugMutex mutex; +#endif + }; + + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; 
+ + struct Block + { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), dynamicallyAllocated(true) + { +#ifdef MCDBGQ_TRACKMEM + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + template + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). + // Returns true if the block is now empty (does not apply in explicit context). 
+ template + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_acq_rel); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + + private: + static_assert(std::alignment_of::value <= sizeof(T), "The queue does not support types with an alignment greater than their 
size at this time"); + MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? BLOCK_SIZE : 1]; + public: + std::atomic freeListRefs; + std::atomic freeListNext; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + +#ifdef MCDBGQ_TRACKMEM + void* owner; +#endif + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + + +#ifdef MCDBGQ_TRACKMEM +public: + struct MemStats; +private: +#endif + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : + tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + isExplicit(isExplicit_), + parent(parent_) + { + } + + virtual ~ProducerBase() { } + + template + inline bool dequeue(U& element) + { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } + else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + else { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + } + + inline ProducerBase* next_prod() const { return static_cast(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? 
static_cast(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + bool isExplicit; + ConcurrentQueue* parent; + + protected: +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* parent_) : + ProducerBase(parent_, true), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
+ if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index + auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? 
BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + this->parent->add_block_to_free_list(block); + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the + // last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. 
+ } + else { + // Whatever head value we see here is >= the last value we saw here (relatively), + // and <= its current value. Since we have the most recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) + || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough leeway -- the + // tail could surpass the head by the time the block fills up! (Or we'll exceed + // the size limit, if the second part of the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. + + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index(pr_blockIndexSlotsUsed)) { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + // The constructor may throw. 
We want the element not to appear in the queue in + // that case (without corrupting the queue): + MOODYCAMEL_TRY { + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) { + // Revert change to the current block, but leave the new block available + // for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock; + MOODYCAMEL_RETHROW; + } + } + else { + (void)startBlock; + (void)originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the common case when the queue is + // empty and the values are eventually consistent -- we may enter here spuriously. 
+ + // Note that whatever the values of overcommit and tail are, they are not going to change (unless we + // change them) and must be the same value at this point (inside the if) as when the if condition was + // evaluated. + + // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. + // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in + // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). + // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all + // read-modify-write operations are guaranteed to work on the latest value in the modification order), but + // unfortunately that can't be shown to be correct using only the C++11 standard. + // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever + // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now + // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon + // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. + // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) + // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. 
+ + // Note that we reload tail here in case it changed; it will be the same value as before or greater, since + // this load is sequenced after (happens after) the earlier load above. This is supported by read-read + // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be at least one element, this + // will never exceed tail. We need to do an acquire-release fence here since it's possible + // that whatever condition got us to this point was for an earlier enqueued element (that + // we already see the memory effects for), but that by the time we increment somebody else + // has incremented it, and we need to see the memory effects for *that* element, which is + // in such a case is necessarily visible on the thread that incremented it in the first + // place with the more current condition (they must have acquired a tail that is at least + // as recent). + auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + + // Determine which block the element is in + + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing because of index wrap-around. 
+ // When an index wraps, we need to preserve the sign of the offset when dividing it by the + // block size (in order to get a correct signed block count offset in all cases): + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); + auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; + + // Dequeue + auto& el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + // Make sure the element is still fully dequeued and destroyed even if the assignment + // throws + struct Guard { + Block* block; + index_t index; + + ~Guard() + { + (*block)[index]->~T(); + block->ConcurrentQueue::Block::template set_empty(index); + } + } guard = { block, index }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty(index); + } + + return true; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write + } + } + + return false; + } + + template + bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). 
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block* firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need to + // update our fallback value too (since we keep the new index even if we + // later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. 
Reset each block's emptiness before we fill them up, and + // publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. 
+ // Sorry about the horrible templated next line, but it was the only way + // to disable moving *at compile time*, which is important because a type + // may only define a (noexcept) move constructor, and so calls to the + // cctor will not compile, even if they are in an if branch that will never + // be executed + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll keep + // any allocated blocks in our linked list for later, though). + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + if (firstAllocatedBlock != nullptr) + 
blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. 
+ auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) 
{ + // It's too late to revert the dequeue, but we can make sure that all + // the dequeued objects are properly destroyed and the block index + // (and empty count) are properly updated before we propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return 
false; + } + + auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer only -- consumer must use the ones in referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ExplicitProducer* nextExplicitProducer; + private: +#endif + +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase + { + ImplicitProducer(ConcurrentQueue* parent_) : + ProducerBase(parent_, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) + { + new_block_index(); + } + + ~ImplicitProducer() + { + // Note that since we're in the destructor we can assume that all enqueue/dequeue operations + // completed already; this means that all undequeued elements are placed contiguously across + 
// contiguous blocks, and that only the first and last remaining blocks can be only partially + // empty (all other remaining blocks must be completely full). + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block* block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = index != tail; // If we enter the loop, then the last (tail) block will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the free list + // (unless the head index reached the end of it, in which case the tail will be poised + // to create a new block). 
+ if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) 
T(std::forward(element)))) { + // May throw, try to insert now before we publish the fact that we have this new block + MOODYCAMEL_TRY { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto& el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { +#ifdef 
MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + // Note: Acquiring the mutex with every dequeue instead of only when a block + // is released is very sub-optimal, but it is, after all, purely debug code. + debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block* block; + index_t index; + BlockIndexEntry* entry; + ConcurrentQueue* parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = { block, index, entry, this->parent }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty(index)) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + } + + return true; + } + else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4706) // assignment within conditional expression +#endif + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). + + // Note that the tailBlock we start off with may not be owned by us any more; + // this happens if it was filled up exactly to the top (setting tailIndex to + // the first index of the next block which is not yet allocated), then dequeued + // completely (putting it on the free list) before we enqueue again. 
+ + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block* firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell + Block* newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + + if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { + // Index allocation or block allocation failed; revert any other allocations + // and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index(currentTailIndex); + 
idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations fail, + // and so that we can find the blocks when we do the actual enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) 
T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < 
max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader* localBlockIndex; + auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) 
{ + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning that anybody who acquires the block + // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
+ entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry + { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader + { + size_t capacity; + std::atomic tail; + BlockIndexEntry* entries; + BlockIndexEntry** index; + BlockIndexHeader* prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the constructor + } + size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) { + + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! 
+ MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index()) { + return false; + } + else { + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + } + + inline void rewind_block_index_tail() + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); + } + + inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const + { + BlockIndexHeader* localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap around, causing a negative + // offset, whose negativity we want to preserve + auto offset = static_cast(static_cast::type>(index - tailBase) / static_cast::type>(BLOCK_SIZE)); + size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && 
localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); + if (raw == nullptr) { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast(details::align_for(reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + public: + details::ThreadExitListener threadExitListener; + private: +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ImplicitProducer* nextImplicitProducer; + private: +#endif + +#ifdef 
MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + mutable debug::DebugMutex mutex; +#endif +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { +#ifdef MCDBGQ_TRACKMEM + block->owner = nullptr; +#endif + if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { + destroy(block); + } + else { + freeList.add(block); + } + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + template + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) { + return create(); + } + else { + return nullptr; + } + } + + +#ifdef MCDBGQ_TRACKMEM + public: + struct MemStats { + 
size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue* q) + { + MemStats stats = { 0 }; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); + } + } + for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { + //auto block = prod->get_block_index_entry_for_index(head); 
+ ++stats.usedBlocks; + } + } + else { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast(index->prev); + } + } + } + + auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. 
+ MemStats getMemStats() + { + return MemStats::getFor(this); + } + private: + friend struct MemStats; +#endif + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer(bool isExplicit) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + return ptr; + } + } + } + + return add_producer(isExplicit ? static_cast(create(this)) : create(this)); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } + else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = prevTailImplicit; + } while 
(!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! + for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP + { + std::atomic key; + ImplicitProducer* value; // No need for atomicity since it's only read by the thread that sets it in the first place + + ImplicitProducerKVP() : value(nullptr) { } + + ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT + { + if (this != &other) { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP&, typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash + { + size_t capacity; + ImplicitProducerKVP* entries; + ImplicitProducerHash* prev; + }; + + inline void populate_initial_implicit_producer_hash() + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = 
&initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + } + + void swap_implicit_producer_hashes(ConcurrentQueue& other) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + // Swap (assumes our implicit producer hash is initialized) + initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { + implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer* get_or_add_implicit_producer() + { + // Note that since the 
data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash tables. + // If it's not found, it must not be in there yet, since this same thread would + // have added it previously to one of the tables that we traversed. + + // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free in the hash table + index &= hash->capacity - 1u; + + auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we should lazily add it + // to the current main hash table to avoid the extended search next time. + // Note there's guaranteed to be room in the current hash table since every subsequent + // table implicitly reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed) || + mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { +#else + if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when + // we reload implicitProducerHash it must be the most recent version (it only gets changed within this + // locked block). 
+ mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + size_t newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = static_cast(newCapacity); + newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } + else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table + // to finish being allocated by another thread (and if we just finished allocating above, the condition will + // always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + auto producer = static_cast(recycle_or_create_producer(false)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + 
details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); // already counted as a used slot + mainHash->entries[index].value = producer; + break; + } +#endif + if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a new one. + // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, + // we try to allocate ourselves). 
+ mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + void implicit_producer_thread_exited(ImplicitProducer* producer) + { + // Remove from hash +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert(hash != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't on the current one yet and are + // trying to add an entry thinking there's a free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) { + auto index = hashedId; + do { + index &= hash->capacity - 1u; + probedKey = id; + if (hash->entries[index].key.compare_exchange_strong(probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) { + break; + } + ++index; + } while (probedKey != details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void* userData) + { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline void* aligned_malloc(size_t size) + { + MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) + return (Traits::malloc)(size); + else { + size_t alignment = std::alignment_of::value; + void* raw = (Traits::malloc)(size + 
alignment - 1 + sizeof(void*)); + if (!raw) + return nullptr; + char* ptr = details::align_for(reinterpret_cast(raw) + sizeof(void*)); + *(reinterpret_cast(ptr) - 1) = raw; + return ptr; + } + } + + template + static inline void aligned_free(void* ptr) + { + MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) + return (Traits::free)(ptr); + else + (Traits::free)(ptr ? *(reinterpret_cast(ptr) - 1) : nullptr); + } + + template + static inline U* create_array(size_t count) + { + assert(count > 0); + U* p = static_cast(aligned_malloc(sizeof(U) * count)); + if (p == nullptr) + return nullptr; + + for (size_t i = 0; i != count; ++i) + new (p + i) U(); + return p; + } + + template + static inline void destroy_array(U* p, size_t count) + { + if (p != nullptr) { + assert(count > 0); + for (size_t i = count; i != 0; ) + (p + --i)->~U(); + } + aligned_free(p); + } + + template + static inline U* create() + { + void* p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U* create(A1&& a1) + { + void* p = aligned_malloc(sizeof(U)); + return p != nullptr ? 
new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) + p->~U(); + aligned_free(p); + } + +private: + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block* initialBlockPool; + size_t initialBlockPoolSize; + +#ifndef MCDBGQ_USEDEBUGFREELIST + FreeList freeList; +#else + debug::DebugFreeList freeList; +#endif + + std::atomic implicitProducerHash; + std::atomic implicitProducerHashCount; // Number of slots logically used + ImplicitProducerHash initialImplicitProducerHash; + std::array initialImplicitProducerHashEntries; + std::atomic_flag implicitProducerHashResizeInProgress; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugMutex implicitProdMutex; +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + std::atomic explicitProducers; + std::atomic implicitProducers; +#endif +}; + + +template +ProducerToken::ProducerToken(ConcurrentQueue& queue) + : producer(queue.recycle_or_create_producer(true)) +{ + if (producer != nullptr) { + producer->token = this; + } +} + +template +ProducerToken::ProducerToken(BlockingConcurrentQueue& queue) + : producer(reinterpret_cast*>(&queue)->recycle_or_create_producer(true)) +{ + if (producer != nullptr) { + producer->token = this; + } +} + +template +ConsumerToken::ConsumerToken(ConcurrentQueue& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + +template +ConsumerToken::ConsumerToken(BlockingConcurrentQueue& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = reinterpret_cast*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + 
+template +inline void swap(ConcurrentQueue& a, ConcurrentQueue& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} + +#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) +#pragma warning(pop) +#endif + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +#pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_test.cpp b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc similarity index 100% rename from tests/core/algorithm/flat/flat_streamer_buffer_test.cpp rename to tests/core/algorithm/flat/flat_streamer_buffer_test.cc diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cpp b/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc similarity index 100% rename from tests/core/algorithm/flat/flat_streamer_buffer_time_test.cpp rename to tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc From 03e4dbce5437ed29f3db5bc79a58186541a1935b Mon Sep 17 00:00:00 2001 From: "yinzefeng.yzf" Date: Fri, 6 Feb 2026 21:08:24 +0800 Subject: [PATCH 03/28] modify buffer pool --- src/ailego/buffer/buffer_pool.cc | 257 ++++++++++++++ src/core/algorithm/hnsw/hnsw_entity.h | 4 +- .../algorithm/hnsw/hnsw_streamer_entity.cc | 2 +- src/core/utility/buffer1_storage.cc | 14 +- src/include/zvec/ailego/buffer/buffer_pool.h | 331 +++--------------- .../zvec/core/framework/index_storage.h | 57 ++- .../flat/flat_streamer_buffer_test.cc | 176 +++++----- .../flat/flat_streamer_buffer_time_test.cc | 129 ++++++- 8 files changed, 554 insertions(+), 416 deletions(-) create mode 100644 src/ailego/buffer/buffer_pool.cc diff --git 
a/src/ailego/buffer/buffer_pool.cc b/src/ailego/buffer/buffer_pool.cc new file mode 100644 index 00000000..061ead37 --- /dev/null +++ b/src/ailego/buffer/buffer_pool.cc @@ -0,0 +1,257 @@ +#include + +namespace zvec { +namespace ailego { + +void Counter::record(const std::string &name, int64_t value) { + auto it = static_counters.find(name); + if (it == static_counters.end()) { + auto counter = std::make_unique>(0); + it = static_counters.emplace(name, std::move(counter)).first; + } + it->second->fetch_add(value); +} + +void Counter::display() { + for (const auto &pair : static_counters) { + std::cout << pair.first << ": " << pair.second->load() << std::endl; + } +} + +int LRUCache::init(size_t block_size) { + block_size_ = block_size; + for(size_t i = 0; i < CATCH_QUEUE_NUM; i++) { + queues_.push_back(ConcurrentQueue(block_size)); + } + return 0; +} + +bool LRUCache::evict_single_block(BlockType &item) { + // std::cerr << "dequeue: " << item.first << std::endl; + bool found = false; + for(size_t i = 0; i < CATCH_QUEUE_NUM; i++) { + found = queues_[i].try_dequeue(item); + // std::cerr << "dequeue: " << found << std::endl; + if(found) { + break; + } + } + return found; +} + +bool LRUCache::add_single_block(const LPMap *lp_map, const BlockType &block, int block_type) { + bool ok = queues_[block_type].try_enqueue(block); + if(++evict_queue_insertions_ % block_size_ == 0) { + this->clear_dead_node(lp_map); + } + return ok; +} + +void LRUCache::clear_dead_node(const LPMap *lp_map) { + for(int i = 0; i < CATCH_QUEUE_NUM; i++) { + int clear_count = 0; + ConcurrentQueue tmp(block_size_); + BlockType item; + while(queues_[i].try_dequeue(item) && (clear_count++ < block_size_)) { + if(!lp_map->isDeadBlock(item)) { + tmp.try_enqueue(item); + } + } + while(tmp.try_dequeue(item)) { + if(!lp_map->isDeadBlock(item)) { + queues_[i].try_enqueue(item); + } + } + } +} + +void LPMap::init(size_t entry_num) { + if (entries_) { + delete[] entries_; + } + entry_num_ = entry_num; + entries_ 
= new Entry[entry_num_]; + for (size_t i = 0; i < entry_num_; i++) { + entries_[i].ref_count.store(std::numeric_limits::min()); + entries_[i].load_count.store(0); + entries_[i].buffer = nullptr; + } + cache_.init(entry_num); +} + +char* LPMap::acquire_block(block_id_t block_id) { + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + if (entry.ref_count.load() == 0) { + ++entry.load_count; + // std::cout << entry.load_count.load() << std::endl; + } + ++entry.ref_count; + // std::cout << entry.ref_count.load() << std::endl; + if (entry.ref_count.load() < 0) { + // std::cout << "acquire block failed: " << block_id << ", " << entry.ref_count.load() << std::endl; + return nullptr; + } + return entry.buffer; +} + +void LPMap::release_block(block_id_t block_id) { + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + int rc = entry.ref_count.fetch_sub(1); + // std::cout << "release block: " << block_id << ", " << entry.ref_count.load() << std::endl; + // assert(rc > 0); + if(entry.ref_count.load() == 0) { + LRUCache::BlockType block; + block.first = block_id; + block.second = entry.load_count.load(); + cache_.add_single_block(this, block, 0); + } +} + +char* LPMap::evict_block(block_id_t block_id) { + // std::cout << "evict block: " << block_id << std::endl; + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + int expected = 0; + if (entry.ref_count.compare_exchange_strong( + expected, std::numeric_limits::min())) { + char *buffer = entry.buffer; + entry.buffer = nullptr; + return buffer; + } else { + return nullptr; + } +} + +char* LPMap::set_block_acquired(block_id_t block_id, char *buffer) { + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + if (entry.ref_count.load() >= 0) { + entry.ref_count.fetch_add(1); + // std::cout << "Set block2 " << block_id << std::endl; + return entry.buffer; + } + // if (buffer == nullptr) std::cout << "Set block " << block_id << std::endl; + entry.buffer = buffer; + 
entry.ref_count.store(1); + entry.load_count.fetch_add(1); + return buffer; +} + +void LPMap::recycle(moodycamel::ConcurrentQueue &free_buffers) { + LRUCache::BlockType block; + do { + bool ok = cache_.evict_single_block(block); + if(!ok) { + return; + } + } while(isDeadBlock(block)); + // std::cout << "evict_block done: " << block.first << ", " << block.second << std::endl; + char *buffer = evict_block(block.first); + if (buffer) { + free_buffers.try_enqueue(buffer); + } +} + +VecBufferPool::VecBufferPool(const std::string &filename, size_t pool_capacity, size_t block_size) + : pool_capacity_(pool_capacity) { + fd_ = open(filename.c_str(), O_RDONLY); + if (fd_ < 0) { + throw std::runtime_error("Failed to open file: " + filename); + } + struct stat st; + if (fstat(fd_, &st) < 0) { + throw std::runtime_error("Failed to stat file: " + filename); + } + file_size_ = st.st_size; + + size_t buffer_num = pool_capacity_ / block_size; + size_t block_num = file_size_ / block_size + 500; + lp_map_.init(block_num); + for (size_t i = 0; i < buffer_num; i++) { + char *buffer = (char *)aligned_alloc(64, block_size); + if (buffer != nullptr) { + bool ok = free_buffers_.try_enqueue(buffer); + // if(!ok) std::cerr << i << std::endl; + } + } + std::cout << "buffer_num: " << buffer_num << std::endl; + std::cout << "entry_num: " << lp_map_.entry_num() << std::endl; +} + +VecBufferPoolHandle VecBufferPool::get_handle() { + return VecBufferPoolHandle(*this); +} + +char* VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset, size_t size, int retry) { + char *buffer = lp_map_.acquire_block(block_id); + if (buffer) { + return buffer; + } + { + // std::cerr << "block_id: " << block_id << ", offset: " << offset << ", size: " << size << std::endl; + // std::lock_guard lock(mutex_); + bool found = free_buffers_.try_dequeue(buffer); + // std::cerr << "dequeue: " << found << std::endl; + if (!found) { + for (int i = 0; i < retry; i++) { + lp_map_.recycle(free_buffers_); + found = 
free_buffers_.try_dequeue(buffer); + // std::cerr << "dequeue: " << i << std::endl; + if (found) { + break; + } + } + } + if (!found) { + std::cerr << "Failed to get free buffer " << std::endl; + return nullptr; + } + } + + ssize_t read_bytes = pread(fd_, buffer, size, offset); + if (read_bytes != static_cast(size)) { + std::cerr << "Failed to read file at offset " << offset << std::endl; + exit(-1); + } + char *placed_buffer = nullptr; + { + std::lock_guard lock(mutex_); + placed_buffer = lp_map_.set_block_acquired(block_id, buffer); + } + if (placed_buffer != buffer) { + // another thread has set the block + free_buffers_.try_enqueue(buffer); + } + return placed_buffer; +} + +int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) { + ssize_t read_bytes = pread(fd_, buffer, length, offset); + if (read_bytes != static_cast(length)) { + std::cerr << "Failed to read file at offset " << offset << std::endl; + exit(-1); + } + return 0; +} + +char* VecBufferPoolHandle::get_block(size_t offset, size_t size, size_t block_id) { + char *buffer = pool.acquire_buffer(block_id, offset, size, 5); + return buffer; +} + +int VecBufferPoolHandle::get_meta(size_t offset, size_t length, char *buffer) { + return pool.get_meta(offset, length, buffer); +} + +void VecBufferPoolHandle::release_one(block_id_t block_id) { + pool.lp_map_.release_block(block_id); +} + +void VecBufferPoolHandle::acquire_one(block_id_t block_id) { + pool.lp_map_.acquire_block(block_id); +} + +} // namespace ailego +} // namespace zvec \ No newline at end of file diff --git a/src/core/algorithm/hnsw/hnsw_entity.h b/src/core/algorithm/hnsw/hnsw_entity.h index 363a7252..65fdae9e 100644 --- a/src/core/algorithm/hnsw/hnsw_entity.h +++ b/src/core/algorithm/hnsw/hnsw_entity.h @@ -147,8 +147,8 @@ struct Neighbors { Neighbors(uint32_t cnt_in, const node_id_t *data_in) : cnt{cnt_in}, data{data_in} {} - Neighbors(IndexStorage::MemoryBlock &&mem_block) - : neighbor_block{std::move(mem_block)} { + 
Neighbors(IndexStorage::MemoryBlock &mem_block) + : neighbor_block{mem_block} { auto hd = reinterpret_cast(neighbor_block.data()); cnt = hd->neighbor_cnt; data = hd->neighbors; diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc index feafa573..734f11f1 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc +++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc @@ -127,7 +127,7 @@ const Neighbors HnswStreamerEntity::get_neighbors(level_t level, LOG_ERROR("Read neighbor header failed, ret=%zu", size); return Neighbors(); } - return Neighbors(std::move(neighbor_block)); + return Neighbors(neighbor_block); } //! Get vector data by key diff --git a/src/core/utility/buffer1_storage.cc b/src/core/utility/buffer1_storage.cc index 0ea591d9..1c582198 100644 --- a/src/core/utility/buffer1_storage.cc +++ b/src/core/utility/buffer1_storage.cc @@ -85,7 +85,6 @@ class Buffer1Storage : public IndexStorage { //! Read data from segment size_t read(size_t offset, const void **data, size_t len) override { - if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { auto meta = segment_->meta(); if (offset > meta->data_size) { @@ -107,7 +106,8 @@ class Buffer1Storage : public IndexStorage { len = meta->data_size - offset; } size_t segment_offset = segment_->meta()->data_index + owner_->get_context_offset(); - data.reset(owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset); + data.reset(owner_->buffer_pool_handle_.get(), segment_id_, owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset); + // data.reset(owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset); if (data.data()) { return len; } else { @@ -138,8 +138,8 @@ class Buffer1Storage : public IndexStorage { private: IndexMapping::Segment *segment_{}; Buffer1Storage *owner_{nullptr}; - size_t capacity_{}; size_t segment_id_{}; + size_t capacity_{}; }; //! 
Destructor @@ -162,9 +162,9 @@ class Buffer1Storage : public IndexStorage { int open(const std::string &path, bool /*create*/) override { LOG_INFO("open buffer storage 1"); file_name_ = path; - buffer_pool_ = std::make_unique(path, 10u * 1024 * 1024 * 1024, 2490368 * 2); + buffer_pool_ = std::make_shared(path, 10u * 1024 * 1024 * 1024, 2490368 * 2); buffer_pool_handle_ = - std::make_unique(buffer_pool_->get_handle()); + std::make_shared(buffer_pool_->get_handle()); int ret = ParseToMapping(); LOG_ERROR("segment count: %lu, max_segment_size: %lu", segments_.size(), max_segment_size_); if(ret != 0) { @@ -428,8 +428,8 @@ class Buffer1Storage : public IndexStorage { size_t max_segment_size_{0}; std::unique_ptr segment_buffer_{nullptr}; - std::unique_ptr buffer_pool_{nullptr}; - std::unique_ptr buffer_pool_handle_{nullptr}; + ailego::VecBufferPool::Pointer buffer_pool_{nullptr}; + ailego::VecBufferPoolHandle::Pointer buffer_pool_handle_{nullptr}; }; INDEX_FACTORY_REGISTER_STORAGE_ALIAS(BufferStorage, Buffer1Storage); diff --git a/src/include/zvec/ailego/buffer/buffer_pool.h b/src/include/zvec/ailego/buffer/buffer_pool.h index d86cffec..34c69d51 100644 --- a/src/include/zvec/ailego/buffer/buffer_pool.h +++ b/src/include/zvec/ailego/buffer/buffer_pool.h @@ -16,48 +16,41 @@ #include #include #include +#include #include "concurrentqueue.h" -using block_id_t = int; -using version_t = int; +namespace zvec { +namespace ailego { + +using block_id_t = size_t; +using version_t = size_t; + +class LPMap; class LRUCache { public: typedef std::pair BlockType; typedef moodycamel::ConcurrentQueue ConcurrentQueue; - int init(size_t block_size) { - for(int i = 0; i < CATCH_QUEUE_NUM; i++) { - queues_.push_back(ConcurrentQueue(block_size)); - } - return 0; - } - - BlockType evict_single_block() { - BlockType item; - for(int i = 0; i < CATCH_QUEUE_NUM; i++) { - bool found = queues_[i].try_dequeue(item); - if(found) { - break; - } - } - return item; - } - - bool add_single_block(const 
BlockType &block, int block_type) { - std::cout << "in LRU: " << block.first << ", " << block.second << std::endl; - return queues_[block_type].try_enqueue(block); - } + int init(size_t block_size); + + bool evict_single_block(BlockType &item); + + bool add_single_block(const LPMap *lp_map, const BlockType &block, int block_type); + + void clear_dead_node(const LPMap *lp_map); private: constexpr static size_t CATCH_QUEUE_NUM = 3; + int block_size_; std::vector queues_; + alignas(64) std::atomic evict_queue_insertions_{0}; }; class LPMap { struct Entry { - std::atomic ref_count; - std::atomic load_count; + alignas(64) std::atomic ref_count; + alignas(64) std::atomic load_count; char *buffer; }; @@ -67,206 +60,52 @@ class LPMap { delete[] entries_; } - void init(size_t entry_num) { - if (entries_) { - delete[] entries_; - } - entry_num_ = entry_num; - entries_ = new Entry[entry_num_]; - for (size_t i = 0; i < entry_num_; i++) { - entries_[i].ref_count.store(std::numeric_limits::min()); - entries_[i].load_count.store(0); - entries_[i].buffer = nullptr; - } - } + void init(size_t entry_num); - char *acquire_block(block_id_t block_id) { - assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; - int rc = entry.ref_count.fetch_add(1); - if (rc < 0) { - return nullptr; - } - return entry.buffer; - } + char *acquire_block(block_id_t block_id); - void release_block(block_id_t block_id) { - assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; - int rc = entry.ref_count.fetch_sub(1); - assert(rc >= 0); - if(rc == 0) { - LRUCache::BlockType block; - block.first = block_id; - block.second = entry.load_count.load(); - cache_.add_single_block(block, 0); - } - } + void release_block(block_id_t block_id); // need be called under lock - char *evict_block(block_id_t block_id) { - assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; - int expected = 0; - if (entry.ref_count.compare_exchange_strong( - expected, std::numeric_limits::min())) { - 
char *buffer = entry.buffer; - entry.buffer = nullptr; - return buffer; - } else { - return nullptr; - } - } + char *evict_block(block_id_t block_id); // need be called under lock - char *set_block_acquired(block_id_t block_id, char *buffer) { - // std::cout << "Set block " << block_id << std::endl; - assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; - if (entry.ref_count.load() >= 0) { - entry.ref_count.fetch_add(1); - return entry.buffer; - } - entry.buffer = buffer; - entry.ref_count.store(1); - entry.load_count.fetch_add(1); - return buffer; - } + char *set_block_acquired(block_id_t block_id, char *buffer); // need be called under lock - void recycle(std::queue &free_buffers) { - LRUCache::BlockType block; - do { - block = cache_.evict_single_block(); - } while(isDeadBlock(block)); - char *buffer = evict_block(block.first); - if (buffer) { - free_buffers.push(buffer); - } - } + void recycle(moodycamel::ConcurrentQueue &free_buffers); size_t entry_num() const { return entry_num_; } - private: - Entry *entries_; - size_t entry_num_; - LRUCache cache_; - - bool isDeadBlock(LRUCache::BlockType block) { + bool isDeadBlock(LRUCache::BlockType block) const { Entry &entry = entries_[block.first]; - return block.second == entry.load_count.load(); - } -}; - -class VecBufferPool; - -struct VecBufferPoolHandle { - VecBufferPoolHandle(VecBufferPool &pool); - VecBufferPoolHandle(VecBufferPoolHandle &&other) - : pool(other.pool), - local_cache(std::move(other.local_cache)), - hit_num_(other.hit_num_) { - other.local_cache.clear(); - other.hit_num_ = 0; + return block.second != entry.load_count.load(); } - ~VecBufferPoolHandle(); - char *get_block(size_t offset, size_t size, size_t block_id); - - int get_meta(size_t offset, size_t length, char *buffer); - - void release_all(); - - VecBufferPool &pool; -#ifdef USE_LOCAL_CACHE - // std::unordered_map local_cache; - phmap::flat_hash_map local_cache; -#else - std::vector local_cache; -#endif - int hit_num_; + 
private: + size_t entry_num_{0}; + Entry *entries_{nullptr}; + LRUCache cache_; }; +class VecBufferPoolHandle; + class VecBufferPool { public: - VecBufferPool(const std::string &filename, size_t pool_capacity, size_t block_size) - : pool_capacity_(pool_capacity) { - fd_ = open(filename.c_str(), O_RDONLY); - if (fd_ < 0) { - throw std::runtime_error("Failed to open file: " + filename); - } - struct stat st; - if (fstat(fd_, &st) < 0) { - throw std::runtime_error("Failed to stat file: " + filename); - } - file_size_ = st.st_size; - - size_t buffer_num = pool_capacity_ / block_size; - lp_map_.init(buffer_num); - for (size_t i = 0; i < buffer_num; i++) { - char *buffer = (char *)aligned_alloc(64, block_size); - free_buffers_.push(buffer); - } - std::cout << "buffer_num: " << buffer_num << std::endl; - std::cout << "entry_num: " << lp_map_.entry_num() << std::endl; - } + typedef std::shared_ptr Pointer; + + VecBufferPool(const std::string &filename, size_t pool_capacity, size_t block_size); ~VecBufferPool() { close(fd_); } - VecBufferPoolHandle get_handle() { - return VecBufferPoolHandle(*this); - } + VecBufferPoolHandle get_handle(); - char *acquire_buffer(block_id_t block_id, size_t offset, size_t size, int retry = 0) { - char *buffer = lp_map_.acquire_block(block_id); - if (buffer) { - return buffer; - } - { - std::lock_guard lock(mutex_); - if (free_buffers_.empty()) { - for (int i = 0; i < retry; i++) { - lp_map_.recycle(free_buffers_); - if (!free_buffers_.empty()) { - break; - } - } - } - if (free_buffers_.empty()) { - return nullptr; - } - buffer = free_buffers_.front(); - free_buffers_.pop(); - } - - ssize_t read_bytes = pread(fd_, buffer, size, offset); - if (read_bytes != static_cast(size)) { - std::cerr << "Failed to read file at offset " << offset << std::endl; - exit(-1); - } - - { - std::lock_guard lock(mutex_); - char *placed_buffer = lp_map_.set_block_acquired(block_id, buffer); - if (placed_buffer != buffer) { - // another thread has set the block - 
free_buffers_.push(buffer); - } - return placed_buffer; - } - } + char *acquire_buffer(block_id_t block_id, size_t offset, size_t size, int retry = 0); - int get_meta(size_t offset, size_t length, char *buffer) { - ssize_t read_bytes = pread(fd_, buffer, length, offset); - if (read_bytes != static_cast(length)) { - std::cerr << "Failed to read file at offset " << offset << std::endl; - exit(-1); - } - return 0; - } + int get_meta(size_t offset, size_t length, char *buffer); size_t file_size() const { return file_size_; @@ -282,86 +121,32 @@ class VecBufferPool { private: std::mutex mutex_; - std::queue free_buffers_; + moodycamel::ConcurrentQueue free_buffers_; }; - -struct Counter { - ~Counter() = default; - - static Counter &get_instance() { - static Counter instance; - return instance; - } - - void record(const std::string &name, int64_t value) { - auto it = static_counters.find(name); - if (it == static_counters.end()) { - auto counter = std::make_unique>(0); - it = static_counters.emplace(name, std::move(counter)).first; - } - it->second->fetch_add(value); +struct VecBufferPoolHandle { + VecBufferPoolHandle(VecBufferPool &pool) : pool(pool), hit_num_(0) {}; + VecBufferPoolHandle(VecBufferPoolHandle &&other) + : pool(other.pool), + hit_num_(other.hit_num_) { + other.hit_num_ = 0; } + + ~VecBufferPoolHandle() = default; - void display() { - for (const auto &pair : static_counters) { - std::cout << pair.first << ": " << pair.second->load() << std::endl; - } - } + typedef std::shared_ptr Pointer; - void clear() { - static_counters.clear(); - } + char *get_block(size_t offset, size_t size, size_t block_id); - private: - Counter() {} - std::map>> static_counters; -}; + int get_meta(size_t offset, size_t length, char *buffer); -VecBufferPoolHandle::VecBufferPoolHandle(VecBufferPool &pool) - : pool(pool), hit_num_(0) {} -VecBufferPoolHandle::~VecBufferPoolHandle() { - Counter::get_instance().record("buffer_pool_handle_hit_num", hit_num_); - release_all(); -} - -char 
*VecBufferPoolHandle::get_block(size_t offset, size_t size, size_t block_id) { -#ifdef USE_LOCAL_CACHE - auto it = local_cache.find(block_id); - if (it != local_cache.end()) { - hit_num_++; - return it->second; - } -#endif - - char *buffer = pool.acquire_buffer(block_id, offset, size, 3); - if (buffer) { -#ifdef USE_LOCAL_CACHE - local_cache[block_id] = buffer; -#else - local_cache.push_back(block_id); -#endif - return buffer; - } + void release_one(block_id_t block_id); - return nullptr; -} + void acquire_one(block_id_t block_id); -int VecBufferPoolHandle::get_meta(size_t offset, size_t length, char *out) { - return pool.get_meta(offset, length, out); -} + VecBufferPool &pool; + int hit_num_; +}; -void VecBufferPoolHandle::release_all() { -#ifdef USE_LOCAL_CACHE - Counter::get_instance().record("buffer_pool_handle_release_call", - local_cache.size()); - for (const auto &pair : local_cache) { - pool.lp_map_.release_block(pair.first); - } -#else - for (block_id_t block_id : local_cache) { - pool.lp_map_.release_block(block_id); - } -#endif - local_cache.clear(); -} \ No newline at end of file +} // namespace ailego +} // namespace zvec \ No newline at end of file diff --git a/src/include/zvec/core/framework/index_storage.h b/src/include/zvec/core/framework/index_storage.h index 8673d63e..346b8da4 100644 --- a/src/include/zvec/core/framework/index_storage.h +++ b/src/include/zvec/core/framework/index_storage.h @@ -14,7 +14,7 @@ #pragma once -#include +#include #include #include #include @@ -37,10 +37,11 @@ class IndexStorage : public IndexModule { }; MemoryBlock() {} - MemoryBlock(ailego::BufferHandle::Pointer &&buffer_handle) - : type_(MemoryBlockType::MBT_BUFFERPOOL), - buffer_handle_(std::move(buffer_handle)) { - data_ = buffer_handle_->pin_vector_data(); + MemoryBlock(ailego::VecBufferPoolHandle* buffer_pool_handle, int block_id, void *data) + : type_(MemoryBlockType::MBT_BUFFERPOOL) { + buffer_pool_handle_ = buffer_pool_handle; + buffer_block_id_ = block_id; + 
data_ = data; } MemoryBlock(void *data) : type_(MemoryBlockType::MBT_MMAP), data_(data) {} @@ -50,7 +51,8 @@ class IndexStorage : public IndexModule { this->reset(rhs.data_); break; case MemoryBlockType::MBT_BUFFERPOOL: - this->reset(rhs.buffer_handle_); + this->reset(rhs.buffer_pool_handle_, rhs.buffer_block_id_, rhs.data_); + buffer_pool_handle_->acquire_one(buffer_block_id_); break; default: break; @@ -63,7 +65,7 @@ class IndexStorage : public IndexModule { this->reset(std::move(rhs.data_)); break; case MemoryBlockType::MBT_BUFFERPOOL: - this->reset(std::move(rhs.buffer_handle_)); + this->reset(std::move(rhs.buffer_pool_handle_), std::move(rhs.buffer_block_id_), std::move(rhs.data_)); break; default: break; @@ -77,7 +79,8 @@ class IndexStorage : public IndexModule { this->reset(rhs.data_); break; case MemoryBlockType::MBT_BUFFERPOOL: - this->reset(rhs.buffer_handle_); + this->reset(rhs.buffer_pool_handle_, rhs.buffer_block_id_, rhs.data_); + buffer_pool_handle_->acquire_one(buffer_block_id_); break; default: break; @@ -93,7 +96,7 @@ class IndexStorage : public IndexModule { this->reset(std::move(rhs.data_)); break; case MemoryBlockType::MBT_BUFFERPOOL: - this->reset(std::move(rhs.buffer_handle_)); + this->reset(std::move(rhs.buffer_pool_handle_), std::move(rhs.buffer_block_id_), std::move(rhs.data_)); break; default: break; @@ -107,9 +110,8 @@ class IndexStorage : public IndexModule { case MemoryBlockType::MBT_MMAP: break; case MemoryBlockType::MBT_BUFFERPOOL: - if (buffer_handle_) { - buffer_handle_->unpin_vector_data(); - // buffer_handle_.reset(); + if (buffer_pool_handle_) { + buffer_pool_handle_->release_one(buffer_block_id_); } break; default: @@ -122,34 +124,20 @@ class IndexStorage : public IndexModule { return data_; } - void reset(ailego::BufferHandle::Pointer &buffer_handle) { + void reset(ailego::VecBufferPoolHandle* buffer_pool_handle, int block_id, void *data) { if (type_ == MemoryBlockType::MBT_BUFFERPOOL) { - buffer_handle_->unpin_vector_data(); 
- buffer_handle_.reset(); + buffer_pool_handle->release_one(buffer_block_id_); } type_ = MemoryBlockType::MBT_BUFFERPOOL; - if (buffer_handle) { - buffer_handle_.reset(buffer_handle.release()); - } - data_ = buffer_handle_->pin_vector_data(); - } - - void reset(ailego::BufferHandle::Pointer &&buffer_handle) { - if (type_ == MemoryBlockType::MBT_BUFFERPOOL) { - buffer_handle_->unpin_vector_data(); - buffer_handle_.reset(); - } - type_ = MemoryBlockType::MBT_BUFFERPOOL; - if (buffer_handle) { - buffer_handle_ = std::move(buffer_handle); - } - data_ = buffer_handle_->pin_vector_data(); + buffer_pool_handle_ = buffer_pool_handle; + buffer_block_id_ = block_id; + data_ = data; } void reset(void *data) { if (type_ == MemoryBlockType::MBT_BUFFERPOOL) { - buffer_handle_->unpin_vector_data(); - buffer_handle_.reset(); + buffer_pool_handle_->release_one(buffer_block_id_); + buffer_pool_handle_ = nullptr; } type_ = MemoryBlockType::MBT_MMAP; data_ = data; @@ -157,7 +145,8 @@ class IndexStorage : public IndexModule { MemoryBlockType type_{MBT_UNKNOWN}; void *data_{nullptr}; - mutable ailego::BufferHandle::Pointer buffer_handle_{nullptr}; + mutable ailego::VecBufferPoolHandle* buffer_pool_handle_; + int buffer_block_id_{0}; }; struct SegmentData { diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_test.cc b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc index 62b25e23..fbc404b4 100644 --- a/tests/core/algorithm/flat/flat_streamer_buffer_test.cc +++ b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc @@ -50,7 +50,6 @@ void FlatStreamerTest::TearDown(void) { } TEST_F(FlatStreamerTest, TestLinearSearch) { - BufferManager::Instance().init(300 * 1024 / 2 * 1024, 1); IndexStreamer::Pointer write_streamer = IndexFactory::CreateStreamer("FlatStreamer"); ASSERT_TRUE(write_streamer != nullptr); @@ -165,31 +164,33 @@ TEST_F(FlatStreamerTest, TestLinearSearch) { ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? 
i - 2 : i - 1), result2[2].key()); } + cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; read_streamer->close(); read_streamer.reset(); - cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; } -TEST_F(FlatStreamerTest, TestLinearSearchMMap) { - BufferManager::Instance().init(3 * 1024 / 2 * 1024, 1); +TEST_F(FlatStreamerTest, TestLinearSearchWithLRU) { + constexpr size_t static dim = 1600; IndexStreamer::Pointer write_streamer = IndexFactory::CreateStreamer("FlatStreamer"); ASSERT_TRUE(write_streamer != nullptr); Params params; - ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); + IndexMeta meta = IndexMeta(IndexMeta::DataType::DT_FP32, dim); + meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_EQ(0, write_streamer->init(meta, params)); auto storage = IndexFactory::CreateStorage("MMapFileStorage"); ASSERT_NE(nullptr, storage); Params stg_params; ASSERT_EQ(0, storage->init(stg_params)); - ASSERT_EQ(0, storage->open(dir_ + "/Test/LinearSearchMMap", true)); + ASSERT_EQ(0, storage->open(dir_ + "/Test/LinearSearchWithLRU", true)); ASSERT_EQ(0, write_streamer->open(storage)); auto ctx = write_streamer->create_context(); ASSERT_TRUE(!!ctx); - size_t cnt = 10000UL; + size_t cnt = 1000000UL; IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); for (size_t i = 0; i < cnt; i++) { NumericalVector vec(dim); @@ -202,18 +203,19 @@ TEST_F(FlatStreamerTest, TestLinearSearchMMap) { write_streamer->close(); write_streamer.reset(); - ElapsedTime elapsed_time; + IndexStreamer::Pointer read_streamer = IndexFactory::CreateStreamer("FlatStreamer"); - ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); - auto read_storage = IndexFactory::CreateStorage("MMapFileStorage"); + ASSERT_EQ(0, read_streamer->init(meta, params)); + auto read_storage = IndexFactory::CreateStorage("BufferStorage"); ASSERT_NE(nullptr, read_storage); ASSERT_EQ(0, read_storage->init(stg_params)); - ASSERT_EQ(0, read_storage->open(dir_ + 
"/Test/LinearSearchMMap", false)); + ASSERT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearchWithLRU", false)); ASSERT_EQ(0, read_streamer->open(read_storage)); size_t topk = 3; auto provider = read_streamer->create_provider(); - for (size_t i = 0; i < cnt; i += 1) { + ElapsedTime elapsed_time; + for (size_t i = 0; i < 10; i += 1) { NumericalVector vec(dim); for (size_t j = 0; j < dim; ++j) { vec[j] = i; @@ -241,122 +243,132 @@ TEST_F(FlatStreamerTest, TestLinearSearchMMap) { ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); } - - ctx->set_topk(100U); - NumericalVector vec(dim); - for (size_t j = 0; j < dim; ++j) { - vec[j] = 10.1f; - } - ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx)); - auto &result = ctx->result(); - ASSERT_EQ(100U, result.size()); - ASSERT_EQ(10, result[0].key()); - ASSERT_EQ(11, result[1].key()); - ASSERT_EQ(5, result[10].key()); - ASSERT_EQ(0, result[20].key()); - ASSERT_EQ(30, result[30].key()); - ASSERT_EQ(35, result[35].key()); - ASSERT_EQ(99, result[99].key()); + cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; read_streamer->close(); read_streamer.reset(); - cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; } -TEST_F(FlatStreamerTest, TestBufferStorage) { - BufferManager::Instance().init(10 * 1024 * 1024, 1); - IndexStreamer::Pointer streamer = +TEST_F(FlatStreamerTest, TestLinearSearchMMap) { + IndexStreamer::Pointer write_streamer = IndexFactory::CreateStreamer("FlatStreamer"); - ASSERT_TRUE(streamer != nullptr); - const int dim = 16; - IndexMeta meta = IndexMeta(IndexMeta::DT_FP32, dim); - meta.set_metric("SquaredEuclidean", 0, Params()); + ASSERT_TRUE(write_streamer != nullptr); Params params; - EXPECT_EQ(0, streamer->init(meta, params)); + ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); auto storage = IndexFactory::CreateStorage("MMapFileStorage"); 
ASSERT_NE(nullptr, storage); Params stg_params; - EXPECT_EQ(0, storage->init(stg_params)); - EXPECT_EQ(0, storage->open(dir_ + "/Test/LinearSearch", true)); - EXPECT_EQ(0, streamer->open(storage)); + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, storage->open(dir_ + "/Test/LinearSearchMMap", true)); + ASSERT_EQ(0, write_streamer->open(storage)); - auto ctx = streamer->create_context(); + auto ctx = write_streamer->create_context(); ASSERT_TRUE(!!ctx); - size_t cnt = 1000UL; + size_t cnt = 10000UL; IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); for (size_t i = 0; i < cnt; i++) { NumericalVector vec(dim); for (size_t j = 0; j < dim; ++j) { vec[j] = i; } - streamer->add_impl(i, vec.data(), qmeta, ctx); + write_streamer->add_impl(i, vec.data(), qmeta, ctx); } - streamer->flush(0UL); - streamer.reset(); + write_streamer->flush(0UL); + write_streamer->close(); + write_streamer.reset(); IndexStreamer::Pointer read_streamer = IndexFactory::CreateStreamer("FlatStreamer"); - ASSERT_TRUE(read_streamer != nullptr); - EXPECT_EQ(0, read_streamer->init(meta, params)); - auto read_storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); + auto read_storage = IndexFactory::CreateStorage("MMapFileStorage"); ASSERT_NE(nullptr, read_storage); - EXPECT_EQ(0, read_storage->init(stg_params)); - EXPECT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearch", false)); - EXPECT_EQ(0, read_streamer->open(read_storage)); - auto read_ctx = read_streamer->create_context(); - auto provider = read_streamer->create_provider(); - + ASSERT_EQ(0, read_storage->init(stg_params)); + ASSERT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearchMMap", false)); + ASSERT_EQ(0, read_streamer->open(read_storage)); size_t topk = 3; + auto provider = read_streamer->create_provider(); for (size_t i = 0; i < cnt; i += 1) { NumericalVector vec(dim); for (size_t j = 0; j < dim; ++j) { vec[j] = i; } - read_ctx->set_topk(topk); - EXPECT_EQ(0, 
read_streamer->search_impl(vec.data(), qmeta, read_ctx)); - auto &result1 = read_ctx->result(); - EXPECT_EQ(topk, result1.size()); + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); for (size_t j = 0; j < dim; ++j) { - const float *data = (float *)provider->get_vector(result1[0].key()); - EXPECT_EQ(data[j], i); + ASSERT_EQ(data[j], i); } - EXPECT_EQ(i, result1[0].key()); + ASSERT_EQ(i, result1[0].key()); for (size_t j = 0; j < dim; ++j) { vec[j] = i + 0.1f; } - read_ctx->set_topk(topk); - EXPECT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, read_ctx)); - auto &result2 = read_ctx->result(); - EXPECT_EQ(topk, result2.size()); - EXPECT_EQ(i, result2[0].key()); - EXPECT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); - EXPECT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? 
i - 2 : i - 1), result2[2].key()); } - read_ctx->set_topk(100U); + ctx->set_topk(100U); NumericalVector vec(dim); for (size_t j = 0; j < dim; ++j) { vec[j] = 10.1f; } - EXPECT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, read_ctx)); - auto &result = read_ctx->result(); - EXPECT_EQ(100U, result.size()); - EXPECT_EQ(10, result[0].key()); - EXPECT_EQ(11, result[1].key()); - EXPECT_EQ(5, result[10].key()); - EXPECT_EQ(0, result[20].key()); - EXPECT_EQ(30, result[30].key()); - EXPECT_EQ(35, result[35].key()); - EXPECT_EQ(99, result[99].key()); - - read_streamer->flush(0UL); + ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx)); + auto &result = ctx->result(); + ASSERT_EQ(100U, result.size()); + ASSERT_EQ(10, result[0].key()); + ASSERT_EQ(11, result[1].key()); + ASSERT_EQ(5, result[10].key()); + ASSERT_EQ(0, result[20].key()); + ASSERT_EQ(30, result[30].key()); + ASSERT_EQ(35, result[35].key()); + ASSERT_EQ(99, result[99].key()); + + ElapsedTime elapsed_time; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? 
i - 2 : i - 1), result2[2].key()); + } + + read_streamer->close(); read_streamer.reset(); + cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; } - #if defined(__GNUC__) || defined(__GNUG__) #pragma GCC diagnostic pop #endif \ No newline at end of file diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc b/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc index c919e9fe..435ecccc 100644 --- a/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc +++ b/tests/core/algorithm/flat/flat_streamer_buffer_time_test.cc @@ -83,7 +83,7 @@ TEST_F(FlatStreamerTest, TestLinearSearchMMap) { IndexStreamer::Pointer read_streamer = IndexFactory::CreateStreamer("FlatStreamer"); ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); - auto read_storage = IndexFactory::CreateStorage("BufferStorage"); + auto read_storage = IndexFactory::CreateStorage("MMapFileStorage"); ASSERT_NE(nullptr, read_storage); ASSERT_EQ(0, read_storage->init(stg_params)); ASSERT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearchMMap", false)); @@ -113,26 +113,121 @@ TEST_F(FlatStreamerTest, TestLinearSearchMMap) { // ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? 
i - 2 : i - 1), result2[2].key()); } cout << "Elapsed time: " << elapsed_time.micro_seconds() << " us" << endl; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + // auto &result1 = ctx->result(); + // ASSERT_EQ(topk, result1.size()); + // ASSERT_EQ(i, result1[0].key()); - // ctx->set_topk(100U); - // NumericalVector vec(dim); - // for (size_t j = 0; j < dim; ++j) { - // vec[j] = 10.1f; - // } - // ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx)); - // auto &result = ctx->result(); - // ASSERT_EQ(100U, result.size()); - // ASSERT_EQ(10, result[0].key()); - // ASSERT_EQ(11, result[1].key()); - // ASSERT_EQ(5, result[10].key()); - // ASSERT_EQ(0, result[20].key()); - // ASSERT_EQ(30, result[30].key()); - // ASSERT_EQ(35, result[35].key()); - // ASSERT_EQ(99, result[99].key()); + // for (size_t j = 0; j < dim; ++j) { + // vec[j] = i + 0.1f; + // } + // ctx->set_topk(topk); + // ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + // auto &result2 = ctx->result(); + // ASSERT_EQ(topk, result2.size()); + // ASSERT_EQ(i, result2[0].key()); + // ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + // ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? 
i - 2 : i - 1), result2[2].key()); + } + cout << "Elapsed time: " << elapsed_time.micro_seconds() << " us" << endl; + read_streamer->close(); + read_streamer.reset(); +} +TEST_F(FlatStreamerTest, TestLinearSearchBuffer) { + IndexStreamer::Pointer write_streamer = + IndexFactory::CreateStreamer("FlatStreamer"); + ASSERT_TRUE(write_streamer != nullptr); + + Params params; + ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); + auto storage = IndexFactory::CreateStorage("MMapFileStorage"); + ASSERT_NE(nullptr, storage); + Params stg_params; + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, storage->open(dir_ + "/Test/LinearSearchBuffer", true)); + ASSERT_EQ(0, write_streamer->open(storage)); + + auto ctx = write_streamer->create_context(); + ASSERT_TRUE(!!ctx); + + size_t data_cnt = 300000UL, cnt = 500UL; + IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); + for (size_t i = 0; i < data_cnt; i++) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + write_streamer->add_impl(i, vec.data(), qmeta, ctx); + } + write_streamer->flush(0UL); + write_streamer->close(); + write_streamer.reset(); + + IndexStreamer::Pointer read_streamer = + IndexFactory::CreateStreamer("FlatStreamer"); + ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); + auto read_storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_NE(nullptr, read_storage); + ASSERT_EQ(0, read_storage->init(stg_params)); + ASSERT_EQ(0, read_storage->open(dir_ + "/Test/LinearSearchBuffer", false)); + ASSERT_EQ(0, read_streamer->open(read_storage)); + size_t topk = 30; + ElapsedTime elapsed_time; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + // auto &result1 = ctx->result(); + // ASSERT_EQ(topk, result1.size()); + // ASSERT_EQ(i, result1[0].key()); + + // for (size_t j = 0; j < dim; ++j) { + 
// vec[j] = i + 0.1f; + // } + // ctx->set_topk(topk); + // ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + // auto &result2 = ctx->result(); + // ASSERT_EQ(topk, result2.size()); + // ASSERT_EQ(i, result2[0].key()); + // ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + // ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + cout << "Elapsed time: " << elapsed_time.micro_seconds() << " us" << endl; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + // auto &result1 = ctx->result(); + // ASSERT_EQ(topk, result1.size()); + // ASSERT_EQ(i, result1[0].key()); + + // for (size_t j = 0; j < dim; ++j) { + // vec[j] = i + 0.1f; + // } + // ctx->set_topk(topk); + // ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + // auto &result2 = ctx->result(); + // ASSERT_EQ(topk, result2.size()); + // ASSERT_EQ(i, result2[0].key()); + // ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + // ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? 
i - 2 : i - 1), result2[2].key()); + } + cout << "Elapsed time: " << elapsed_time.micro_seconds() << " us" << endl; read_streamer->close(); read_streamer.reset(); - // cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; } #if defined(__GNUC__) || defined(__GNUG__) From 7df2716d2ac969e665422b2d4a85ae51cc3d47cf Mon Sep 17 00:00:00 2001 From: "yinzefeng.yzf" Date: Mon, 9 Feb 2026 15:33:34 +0800 Subject: [PATCH 04/28] upd buffer pool --- src/ailego/buffer/buffer_pool.cc | 75 +-- src/core/utility/buffer1_storage.cc | 438 ------------------ src/core/utility/buffer_storage.cc | 130 +++--- ..._test.cpp => hnsw_streamer_buffer_test.cc} | 0 4 files changed, 91 insertions(+), 552 deletions(-) delete mode 100644 src/core/utility/buffer1_storage.cc rename tests/core/algorithm/hnsw/{hnsw_streamer_buffer_test.cpp => hnsw_streamer_buffer_test.cc} (100%) diff --git a/src/ailego/buffer/buffer_pool.cc b/src/ailego/buffer/buffer_pool.cc index 061ead37..3ed461c1 100644 --- a/src/ailego/buffer/buffer_pool.cc +++ b/src/ailego/buffer/buffer_pool.cc @@ -1,23 +1,9 @@ #include +#include namespace zvec { namespace ailego { -void Counter::record(const std::string &name, int64_t value) { - auto it = static_counters.find(name); - if (it == static_counters.end()) { - auto counter = std::make_unique>(0); - it = static_counters.emplace(name, std::move(counter)).first; - } - it->second->fetch_add(value); -} - -void Counter::display() { - for (const auto &pair : static_counters) { - std::cout << pair.first << ": " << pair.second->load() << std::endl; - } -} - int LRUCache::init(size_t block_size) { block_size_ = block_size; for(size_t i = 0; i < CATCH_QUEUE_NUM; i++) { @@ -27,11 +13,9 @@ int LRUCache::init(size_t block_size) { } bool LRUCache::evict_single_block(BlockType &item) { - // std::cerr << "dequeue: " << item.first << std::endl; bool found = false; for(size_t i = 0; i < CATCH_QUEUE_NUM; i++) { found = queues_[i].try_dequeue(item); - // std::cerr << "dequeue: " << found 
<< std::endl; if(found) { break; } @@ -41,7 +25,8 @@ bool LRUCache::evict_single_block(BlockType &item) { bool LRUCache::add_single_block(const LPMap *lp_map, const BlockType &block, int block_type) { bool ok = queues_[block_type].try_enqueue(block); - if(++evict_queue_insertions_ % block_size_ == 0) { + evict_queue_insertions_.fetch_add(1, std::memory_order_relaxed); + if(evict_queue_insertions_ % block_size_ == 0) { this->clear_dead_node(lp_map); } return ok; @@ -49,10 +34,14 @@ bool LRUCache::add_single_block(const LPMap *lp_map, const BlockType &block, int void LRUCache::clear_dead_node(const LPMap *lp_map) { for(int i = 0; i < CATCH_QUEUE_NUM; i++) { + int clear_size = block_size_ * 2; + if (queues_[i].size_approx() < clear_size * 4) { + continue; + } int clear_count = 0; ConcurrentQueue tmp(block_size_); BlockType item; - while(queues_[i].try_dequeue(item) && (clear_count++ < block_size_)) { + while(queues_[i].try_dequeue(item) && (clear_count++ < clear_size)) { if(!lp_map->isDeadBlock(item)) { tmp.try_enqueue(item); } @@ -82,14 +71,11 @@ void LPMap::init(size_t entry_num) { char* LPMap::acquire_block(block_id_t block_id) { assert(block_id < entry_num_); Entry &entry = entries_[block_id]; - if (entry.ref_count.load() == 0) { - ++entry.load_count; - // std::cout << entry.load_count.load() << std::endl; + if (entry.ref_count.load(std::memory_order_relaxed) == 0) { + entry.load_count.fetch_add(1, std::memory_order_relaxed); } - ++entry.ref_count; - // std::cout << entry.ref_count.load() << std::endl; - if (entry.ref_count.load() < 0) { - // std::cout << "acquire block failed: " << block_id << ", " << entry.ref_count.load() << std::endl; + entry.ref_count.fetch_add(1, std::memory_order_relaxed); + if (entry.ref_count.load(std::memory_order_relaxed) < 0) { return nullptr; } return entry.buffer; @@ -98,10 +84,9 @@ char* LPMap::acquire_block(block_id_t block_id) { void LPMap::release_block(block_id_t block_id) { assert(block_id < entry_num_); Entry &entry = 
entries_[block_id]; - int rc = entry.ref_count.fetch_sub(1); - // std::cout << "release block: " << block_id << ", " << entry.ref_count.load() << std::endl; - // assert(rc > 0); - if(entry.ref_count.load() == 0) { + + if (entry.ref_count.fetch_sub(1, std::memory_order_release) == 1) { + std::atomic_thread_fence(std::memory_order_acquire); LRUCache::BlockType block; block.first = block_id; block.second = entry.load_count.load(); @@ -110,7 +95,6 @@ void LPMap::release_block(block_id_t block_id) { } char* LPMap::evict_block(block_id_t block_id) { - // std::cout << "evict block: " << block_id << std::endl; assert(block_id < entry_num_); Entry &entry = entries_[block_id]; int expected = 0; @@ -127,15 +111,13 @@ char* LPMap::evict_block(block_id_t block_id) { char* LPMap::set_block_acquired(block_id_t block_id, char *buffer) { assert(block_id < entry_num_); Entry &entry = entries_[block_id]; - if (entry.ref_count.load() >= 0) { - entry.ref_count.fetch_add(1); - // std::cout << "Set block2 " << block_id << std::endl; + if (entry.ref_count.load(std::memory_order_relaxed) >= 0) { + entry.ref_count.fetch_add(1, std::memory_order_relaxed); return entry.buffer; } - // if (buffer == nullptr) std::cout << "Set block " << block_id << std::endl; entry.buffer = buffer; - entry.ref_count.store(1); - entry.load_count.fetch_add(1); + entry.ref_count.store(1, std::memory_order_relaxed); + entry.load_count.fetch_add(1, std::memory_order_relaxed); return buffer; } @@ -147,7 +129,6 @@ void LPMap::recycle(moodycamel::ConcurrentQueue &free_buffers) { return; } } while(isDeadBlock(block)); - // std::cout << "evict_block done: " << block.first << ", " << block.second << std::endl; char *buffer = evict_block(block.first); if (buffer) { free_buffers.try_enqueue(buffer); @@ -173,11 +154,9 @@ VecBufferPool::VecBufferPool(const std::string &filename, size_t pool_capacity, char *buffer = (char *)aligned_alloc(64, block_size); if (buffer != nullptr) { bool ok = free_buffers_.try_enqueue(buffer); - 
// if(!ok) std::cerr << i << std::endl; } } - std::cout << "buffer_num: " << buffer_num << std::endl; - std::cout << "entry_num: " << lp_map_.entry_num() << std::endl; + LOG_DEBUG("Buffer pool num: %zu, entry num: %zu", buffer_num, lp_map_.entry_num()); } VecBufferPoolHandle VecBufferPool::get_handle() { @@ -190,30 +169,26 @@ char* VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset, size_t s return buffer; } { - // std::cerr << "block_id: " << block_id << ", offset: " << offset << ", size: " << size << std::endl; - // std::lock_guard lock(mutex_); bool found = free_buffers_.try_dequeue(buffer); - // std::cerr << "dequeue: " << found << std::endl; if (!found) { for (int i = 0; i < retry; i++) { lp_map_.recycle(free_buffers_); found = free_buffers_.try_dequeue(buffer); - // std::cerr << "dequeue: " << i << std::endl; if (found) { break; } } } if (!found) { - std::cerr << "Failed to get free buffer " << std::endl; + LOG_ERROR("Buffer pool failed to get free buffer"); return nullptr; } } ssize_t read_bytes = pread(fd_, buffer, size, offset); if (read_bytes != static_cast(size)) { - std::cerr << "Failed to read file at offset " << offset << std::endl; - exit(-1); + LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset); + return nullptr; } char *placed_buffer = nullptr; { @@ -230,8 +205,8 @@ char* VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset, size_t s int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) { ssize_t read_bytes = pread(fd_, buffer, length, offset); if (read_bytes != static_cast(length)) { - std::cerr << "Failed to read file at offset " << offset << std::endl; - exit(-1); + LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset);LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset); + return -1; } return 0; } diff --git a/src/core/utility/buffer1_storage.cc b/src/core/utility/buffer1_storage.cc deleted file mode 100644 index 1c582198..00000000 --- 
a/src/core/utility/buffer1_storage.cc +++ /dev/null @@ -1,438 +0,0 @@ -// Copyright 2025-present the zvec project -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -// #include -#include -#include -#include -#include -#include -#include "utility_params.h" - -#include - -namespace zvec { -namespace core { - -/*! MMap File Storage - */ -class Buffer1Storage : public IndexStorage { - public: - /*! Index Storage Segment - */ - class Segment : public IndexStorage::Segment, - public std::enable_shared_from_this { - public: - //! Index Storage Pointer - typedef std::shared_ptr Pointer; - - //! Constructor - Segment(Buffer1Storage *owner, IndexMapping::Segment *segment, size_t segment_id) - : segment_(segment), - owner_(owner), - segment_id_(segment_id), - capacity_(static_cast(segment->meta()->data_size + - segment->meta()->padding_size)) {} - - //! Destructor - virtual ~Segment(void) {} - - //! Retrieve size of data - size_t data_size(void) const override { - return static_cast(segment_->meta()->data_size); - } - - //! Retrieve crc of data - uint32_t data_crc(void) const override { - return segment_->meta()->data_crc; - } - - //! Retrieve size of padding - size_t padding_size(void) const override { - return static_cast(segment_->meta()->padding_size); - } - - //! Retrieve capacity of segment - size_t capacity(void) const override { - return capacity_; - } - - //! 
Fetch data from segment (with own buffer) - size_t fetch(size_t offset, void *buf, size_t len) const override { - if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { - auto meta = segment_->meta(); - if (offset > meta->data_size) { - offset = meta->data_size; - } - len = meta->data_size - offset; - } - memmove(buf, (const uint8_t *)(owner_->get_buffer(offset, len, segment_id_)) + offset, - len); - return len; - } - - //! Read data from segment - size_t read(size_t offset, const void **data, size_t len) override { - if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { - auto meta = segment_->meta(); - if (offset > meta->data_size) { - offset = meta->data_size; - } - len = meta->data_size - offset; - } - size_t segment_offset = segment_->meta()->data_index + owner_->get_context_offset(); - *data = owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset; - return len; - } - - size_t read(size_t offset, MemoryBlock &data, size_t len) override { - if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { - auto meta = segment_->meta(); - if (offset > meta->data_size) { - offset = meta->data_size; - } - len = meta->data_size - offset; - } - size_t segment_offset = segment_->meta()->data_index + owner_->get_context_offset(); - data.reset(owner_->buffer_pool_handle_.get(), segment_id_, owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset); - // data.reset(owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset); - if (data.data()) { - return len; - } else { - LOG_ERROR("read error."); - return -1; - } - } - - //! Write data into the storage with offset - size_t write(size_t /*offset*/, const void * /*data*/, - size_t len) override { - return len; - } - - //! Resize size of data - size_t resize(size_t /*size*/) override { - return 0; - } - - //! Update crc of data - void update_data_crc(uint32_t /*crc*/) override {} - - //! 
Clone the segment - IndexStorage::Segment::Pointer clone(void) override { - return shared_from_this(); - } - - private: - IndexMapping::Segment *segment_{}; - Buffer1Storage *owner_{nullptr}; - size_t segment_id_{}; - size_t capacity_{}; - }; - - //! Destructor - virtual ~Buffer1Storage(void) { - this->cleanup(); - } - - //! Initialize storage - int init(const ailego::Params & /*params*/) override { - return 0; - } - - //! Cleanup storage - int cleanup(void) override { - this->close_index(); - return 0; - } - - //! Open storage - int open(const std::string &path, bool /*create*/) override { - LOG_INFO("open buffer storage 1"); - file_name_ = path; - buffer_pool_ = std::make_shared(path, 10u * 1024 * 1024 * 1024, 2490368 * 2); - buffer_pool_handle_ = - std::make_shared(buffer_pool_->get_handle()); - int ret = ParseToMapping(); - LOG_ERROR("segment count: %lu, max_segment_size: %lu", segments_.size(), max_segment_size_); - if(ret != 0) { - return ret; - } - return 0; - } - - char *get_buffer(size_t offset, size_t length, size_t block_id) { - return buffer_pool_handle_->get_block(offset, length, block_id); - } - - int get_meta(size_t offset, size_t length, char *out) { - return buffer_pool_handle_->get_meta(offset, length, out); - } - - int ParseHeader(size_t offset) { - char *buffer = new char[sizeof(header_)]; - get_meta(offset, sizeof(header_), buffer); - uint8_t *header_ptr = reinterpret_cast(buffer); - memcpy(&header_, header_ptr, sizeof(header_)); - delete[] buffer; - if (header_.meta_header_size != sizeof(IndexFormat::MetaHeader)) { - LOG_ERROR("Header meta size is invalid."); - return IndexError_InvalidLength; - } - if (ailego::Crc32c::Hash(&header_, sizeof(header_), header_.header_crc) != - header_.header_crc) { - LOG_ERROR("Header meta checksum is invalid."); - return IndexError_InvalidChecksum; - } - return 0; - } - - int ParseFooter(size_t offset) { - char *buffer = new char[sizeof(footer_)]; - get_meta(offset, sizeof(footer_), buffer); - uint8_t 
*footer_ptr = reinterpret_cast(buffer); - memcpy(&footer_, footer_ptr, sizeof(footer_)); - delete[] buffer; - if (offset < (size_t)footer_.segments_meta_size) { - LOG_ERROR("Footer meta size is invalid."); - return IndexError_InvalidLength; - } - if (ailego::Crc32c::Hash(&footer_, sizeof(footer_), footer_.footer_crc) != - footer_.footer_crc) { - LOG_ERROR("Footer meta checksum is invalid."); - return IndexError_InvalidChecksum; - } - return 0; - } - - int ParseSegment(size_t offset) { - segment_buffer_ = std::make_unique(footer_.segments_meta_size); - get_meta(offset, footer_.segments_meta_size, segment_buffer_.get()); - if (ailego::Crc32c::Hash(segment_buffer_.get(), footer_.segments_meta_size, 0u) != - footer_.segments_meta_crc) { - LOG_ERROR("Index segments meta checksum is invalid."); - return IndexError_InvalidChecksum; - } - IndexFormat::SegmentMeta *segment_start = - reinterpret_cast(segment_buffer_.get()); - uint32_t segment_ids_offset = footer_.segments_meta_size; - for (IndexFormat::SegmentMeta *iter = segment_start, - *end = segment_start + footer_.segment_count; - iter != end; ++iter) { - if (iter->segment_id_offset > footer_.segments_meta_size) { - return IndexError_InvalidValue; - } - if (iter->data_index > footer_.content_size) { - return IndexError_InvalidValue; - } - if (iter->data_index + iter->data_size > footer_.content_size) { - return IndexError_InvalidLength; - } - - if (iter->segment_id_offset < segment_ids_offset) { - segment_ids_offset = iter->segment_id_offset; - } - id_hash_.emplace( - std::string(reinterpret_cast(segment_start) + - iter->segment_id_offset), - segments_.size()); - segments_.emplace( - std::string(reinterpret_cast(segment_start) + - iter->segment_id_offset), - iter); - max_segment_size_ = std::max(max_segment_size_, iter->data_size + iter->padding_size); - if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count > - footer_.segments_meta_size) { - return IndexError_InvalidLength; - } - } - return 0; - } - - int 
ParseToMapping() { - ParseHeader(0); - // Unpack footer - if (header_.meta_footer_size != sizeof(IndexFormat::MetaFooter)) { - return IndexError_InvalidLength; - } - if ((int32_t)header_.meta_footer_offset < 0) { - return IndexError_Unsupported; - } - size_t footer_offset = header_.meta_footer_offset; - ParseFooter(footer_offset); - - // Unpack segment table - if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count > - footer_.segments_meta_size) { - return IndexError_InvalidLength; - } - const size_t segment_start_offset = footer_offset - footer_.segments_meta_size; - ParseSegment(segment_start_offset); - return 0; - } - - //! Flush storage - int flush(void) override { - return this->flush_index(); - } - - //! Close storage - int close(void) override { - this->close_index(); - return 0; - } - - //! Append a segment into storage - int append(const std::string &id, size_t size) override { - return this->append_segment(id, size); - } - - //! Refresh meta information (checksum, update time, etc.) - void refresh(uint64_t chkp) override { - this->refresh_index(chkp); - } - - //! Retrieve check point of storage - uint64_t check_point(void) const override { - return footer_.check_point; - } - - //! Retrieve a segment by id - IndexStorage::Segment::Pointer get(const std::string &id, int) override { - IndexMapping::Segment *segment = this->get_segment(id); - if (!segment) { - return Buffer1Storage::Segment::Pointer(); - } - return std::make_shared(this, segment, - id_hash_[id]); - } - - //! Test if it a segment exists - bool has(const std::string &id) const override { - return this->has_segment(id); - } - - //! Retrieve magic number of index - uint32_t magic(void) const override { - return header_.magic; - } - - uint32_t get_context_offset() { - return header_.content_offset; - } - - protected: - //! 
Initialize index version segment - int init_version_segment(void) { - size_t data_size = std::strlen(IndexVersion::Details()); - int error_code = - this->append_segment(INDEX_VERSION_SEGMENT_NAME, data_size); - if (error_code != 0) { - return error_code; - } - - IndexMapping::Segment *segment = get_segment(INDEX_VERSION_SEGMENT_NAME); - if (!segment) { - return IndexError_MMapFile; - } - auto meta = segment->meta(); - size_t capacity = static_cast(meta->padding_size + meta->data_size); - memcpy(segment->data(), IndexVersion::Details(), data_size); - segment->set_dirty(); - meta->data_crc = ailego::Crc32c::Hash(segment->data(), data_size, 0); - meta->data_size = data_size; - meta->padding_size = capacity - data_size; - return 0; - } - - //! Initialize index file - int init_index(const std::string &path) { - // Add index version - int error_code = this->init_version_segment(); - if (error_code != 0) { - return error_code; - } - - // Refresh mapping - this->refresh_index(0); - return 0; - } - - //! Set the index file as dirty - void set_as_dirty(void) { - index_dirty_ = true; - } - - //! Refresh meta information (checksum, update time, etc.) - void refresh_index(uint64_t /*chkp*/) {} - - //! Flush index storage - int flush_index(void) { - return 0; - } - - //! Close index storage - void close_index(void) { - std::lock_guard latch(mapping_mutex_); - file_name_.clear(); - segments_.clear(); - memset(&header_, 0, sizeof(header_)); - memset(&footer_, 0, sizeof(footer_)); - segment_buffer_.release(); - } - - //! Append a segment into storage - int append_segment(const std::string & /*id*/, size_t /*size*/) { - return 0; - } - - //! Test if a segment exists - bool has_segment(const std::string &id) const { - std::lock_guard latch(mapping_mutex_); - return (segments_.find(id) != segments_.end()); - } - - //! 
Get a segment from storage - IndexMapping::Segment *get_segment(const std::string &id) { - std::lock_guard latch(mapping_mutex_); - auto iter = segments_.find(id); - if (iter == segments_.end()) { - return nullptr; - } - IndexMapping::Segment *item = &iter->second; - return item; - } - - private: - bool index_dirty_{false}; - mutable std::mutex mapping_mutex_{}; - - // buffer manager - std::string file_name_; - IndexFormat::MetaHeader header_; - IndexFormat::MetaFooter footer_; - std::map segments_{}; - std::map id_hash_{}; - size_t max_segment_size_{0}; - std::unique_ptr segment_buffer_{nullptr}; - - ailego::VecBufferPool::Pointer buffer_pool_{nullptr}; - ailego::VecBufferPoolHandle::Pointer buffer_pool_handle_{nullptr}; -}; - -INDEX_FACTORY_REGISTER_STORAGE_ALIAS(BufferStorage, Buffer1Storage); - -} // namespace core -} // namespace zvec \ No newline at end of file diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index d4b23c87..13aee16a 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -13,13 +13,16 @@ // limitations under the License. #include -#include +// #include +#include #include #include #include #include #include "utility_params.h" +#include + namespace zvec { namespace core { @@ -36,9 +39,10 @@ class BufferStorage : public IndexStorage { typedef std::shared_ptr Pointer; //! 
Constructor - Segment(BufferStorage *owner, IndexMapping::Segment *segment) + Segment(BufferStorage *owner, IndexMapping::Segment *segment, size_t segment_id) : segment_(segment), owner_(owner), + segment_id_(segment_id), capacity_(static_cast(segment->meta()->data_size + segment->meta()->padding_size)) {} @@ -74,9 +78,7 @@ class BufferStorage : public IndexStorage { } len = meta->data_size - offset; } - ailego::BufferHandle buffer_handle = - owner_->get_buffer_handle(offset, len); - memmove(buf, (const uint8_t *)buffer_handle.pin_vector_data() + offset, + memmove(buf, (const uint8_t *)(owner_->get_buffer(offset, len, segment_id_)) + offset, len); return len; } @@ -90,11 +92,8 @@ class BufferStorage : public IndexStorage { } len = meta->data_size - offset; } - size_t buffer_offset = - segment_->meta()->data_index + owner_->get_context_offset() + offset; - ailego::BufferHandle buffer_handle = - owner_->get_buffer_handle(buffer_offset, len); - *data = buffer_handle.pin_vector_data(); + size_t segment_offset = segment_->meta()->data_index + owner_->get_context_offset(); + *data = owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset; return len; } @@ -106,16 +105,13 @@ class BufferStorage : public IndexStorage { } len = meta->data_size - offset; } - size_t buffer_offset = - segment_->meta()->data_index + owner_->get_context_offset() + offset; - data.reset(owner_->get_buffer_handle_ptr(buffer_offset, len)); + size_t segment_offset = segment_->meta()->data_index + owner_->get_context_offset(); + data.reset(owner_->buffer_pool_handle_.get(), segment_id_, owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset); + // data.reset(owner_->get_buffer(segment_offset, capacity_, segment_id_) + offset); if (data.data()) { return len; } else { - LOG_ERROR( - "Buffer handle is null, now used memory: %zu, new: %zu", - (size_t)ailego::BufferManager::Instance().total_size_in_bytes(), - len); + LOG_ERROR("read error."); return -1; } } @@ -142,6 +138,7 @@ 
class BufferStorage : public IndexStorage { private: IndexMapping::Segment *segment_{}; BufferStorage *owner_{nullptr}; + size_t segment_id_{}; size_t capacity_{}; }; @@ -163,29 +160,39 @@ class BufferStorage : public IndexStorage { //! Open storage int open(const std::string &path, bool /*create*/) override { + LOG_INFO("open buffer storage 1"); file_name_ = path; - return ParseToMapping(); + buffer_pool_ = std::make_shared(path, 20lu * 1024 * 1024 * 1024, 2490368 * 2); + buffer_pool_handle_ = + std::make_shared(buffer_pool_->get_handle()); + int ret = ParseToMapping(); + LOG_ERROR("segment count: %lu, max_segment_size: %lu", segments_.size(), max_segment_size_); + for(auto iter = segments_.begin(); iter != segments_.end(); iter++) { + auto seg = this->get(iter->first, 0); + MemoryBlock block; + int len = seg->read(0, block, 1); + LOG_ERROR("segment %s: %d", iter->first.c_str(), len); + } + if(ret != 0) { + return ret; + } + return 0; } - ailego::BufferHandle get_buffer_handle(int offset, int length) { - ailego::BufferID buffer_id = - ailego::BufferID::VectorID(file_name_, offset, length); - return ailego::BufferManager::Instance().acquire(buffer_id); + char *get_buffer(size_t offset, size_t length, size_t block_id) { + return buffer_pool_handle_->get_block(offset, length, block_id); } - ailego::BufferHandle::Pointer get_buffer_handle_ptr(int offset, int length) { - ailego::BufferID buffer_id = - ailego::BufferID::VectorID(file_name_, offset, length); - return ailego::BufferManager::Instance().acquire_ptr(buffer_id); + int get_meta(size_t offset, size_t length, char *out) { + return buffer_pool_handle_->get_meta(offset, length, out); } - int ParseHeader(int offset) { - ailego::BufferHandle header_handle = - get_buffer_handle(offset, sizeof(header_)); - void *buffer = header_handle.pin_vector_data(); + int ParseHeader(size_t offset) { + char *buffer = new char[sizeof(header_)]; + get_meta(offset, sizeof(header_), buffer); uint8_t *header_ptr = 
reinterpret_cast(buffer); memcpy(&header_, header_ptr, sizeof(header_)); - header_handle.unpin_vector_data(); + delete[] buffer; if (header_.meta_header_size != sizeof(IndexFormat::MetaHeader)) { LOG_ERROR("Header meta size is invalid."); return IndexError_InvalidLength; @@ -198,14 +205,13 @@ class BufferStorage : public IndexStorage { return 0; } - int ParseFooter(int offset) { - ailego::BufferHandle footer_handle = - get_buffer_handle(offset, sizeof(footer_)); - void *buffer = footer_handle.pin_vector_data(); + int ParseFooter(size_t offset) { + char *buffer = new char[sizeof(footer_)]; + get_meta(offset, sizeof(footer_), buffer); uint8_t *footer_ptr = reinterpret_cast(buffer); memcpy(&footer_, footer_ptr, sizeof(footer_)); - footer_handle.unpin_vector_data(); - if (offset < (int)footer_.segments_meta_size) { + delete[] buffer; + if (offset < (size_t)footer_.segments_meta_size) { LOG_ERROR("Footer meta size is invalid."); return IndexError_InvalidLength; } @@ -217,17 +223,16 @@ class BufferStorage : public IndexStorage { return 0; } - int ParseSegment(int offset) { - ailego::BufferHandle segment_start_handle = - get_buffer_handle(offset, footer_.segments_meta_size); - void *segment_buffer = segment_start_handle.pin_vector_data(); - if (ailego::Crc32c::Hash(segment_buffer, footer_.segments_meta_size, 0u) != + int ParseSegment(size_t offset) { + segment_buffer_ = std::make_unique(footer_.segments_meta_size); + get_meta(offset, footer_.segments_meta_size, segment_buffer_.get()); + if (ailego::Crc32c::Hash(segment_buffer_.get(), footer_.segments_meta_size, 0u) != footer_.segments_meta_crc) { LOG_ERROR("Index segments meta checksum is invalid."); return IndexError_InvalidChecksum; } IndexFormat::SegmentMeta *segment_start = - reinterpret_cast(segment_buffer); + reinterpret_cast(segment_buffer_.get()); uint32_t segment_ids_offset = footer_.segments_meta_size; for (IndexFormat::SegmentMeta *iter = segment_start, *end = segment_start + footer_.segment_count; @@ -245,10 
+250,15 @@ class BufferStorage : public IndexStorage { if (iter->segment_id_offset < segment_ids_offset) { segment_ids_offset = iter->segment_id_offset; } + id_hash_.emplace( + std::string(reinterpret_cast(segment_start) + + iter->segment_id_offset), + segments_.size()); segments_.emplace( std::string(reinterpret_cast(segment_start) + iter->segment_id_offset), iter); + max_segment_size_ = std::max(max_segment_size_, iter->data_size + iter->padding_size); if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count > footer_.segments_meta_size) { return IndexError_InvalidLength; @@ -259,7 +269,6 @@ class BufferStorage : public IndexStorage { int ParseToMapping() { ParseHeader(0); - // Unpack footer if (header_.meta_footer_size != sizeof(IndexFormat::MetaFooter)) { return IndexError_InvalidLength; @@ -275,7 +284,7 @@ class BufferStorage : public IndexStorage { footer_.segments_meta_size) { return IndexError_InvalidLength; } - const int segment_start_offset = footer_offset - footer_.segments_meta_size; + const size_t segment_start_offset = footer_offset - footer_.segments_meta_size; ParseSegment(segment_start_offset); return 0; } @@ -312,7 +321,8 @@ class BufferStorage : public IndexStorage { if (!segment) { return BufferStorage::Segment::Pointer(); } - return std::make_shared(this, segment); + return std::make_shared(this, segment, + id_hash_[id]); } //! Test if it a segment exists @@ -355,22 +365,14 @@ class BufferStorage : public IndexStorage { //! 
Initialize index file int init_index(const std::string &path) { - int error_code = mapping_.create(path, segment_meta_capacity_); - if (error_code != 0) { - return error_code; - } - // Add index version - error_code = this->init_version_segment(); + int error_code = this->init_version_segment(); if (error_code != 0) { return error_code; } // Refresh mapping this->refresh_index(0); - - // Close mapping - mapping_.close(); return 0; } @@ -394,6 +396,7 @@ class BufferStorage : public IndexStorage { segments_.clear(); memset(&header_, 0, sizeof(header_)); memset(&footer_, 0, sizeof(footer_)); + segment_buffer_.release(); } //! Append a segment into storage @@ -419,14 +422,7 @@ class BufferStorage : public IndexStorage { } private: - // mmap - uint32_t segment_meta_capacity_{1024 * 1024}; - // bool copy_on_write_{false}; - // bool force_flush_{false}; - // bool memory_locked_{false}; - // bool memory_warmup_{false}; bool index_dirty_{false}; - mutable IndexMapping mapping_{}; mutable std::mutex mapping_mutex_{}; // buffer manager @@ -434,9 +430,15 @@ class BufferStorage : public IndexStorage { IndexFormat::MetaHeader header_; IndexFormat::MetaFooter footer_; std::map segments_{}; + std::map id_hash_{}; + size_t max_segment_size_{0}; + std::unique_ptr segment_buffer_{nullptr}; + + ailego::VecBufferPool::Pointer buffer_pool_{nullptr}; + ailego::VecBufferPoolHandle::Pointer buffer_pool_handle_{nullptr}; }; -// INDEX_FACTORY_REGISTER_STORAGE(BufferStorage); +INDEX_FACTORY_REGISTER_STORAGE(BufferStorage); } // namespace core -} // namespace zvec +} // namespace zvec \ No newline at end of file diff --git a/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cpp b/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc similarity index 100% rename from tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cpp rename to tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc From 11a0e475d154a57e54f23ae7791352db08e48d34 Mon Sep 17 00:00:00 2001 From: "yinzefeng.yzf" Date: Mon, 9 
Feb 2026 19:10:42 +0800 Subject: [PATCH 05/28] clang format --- src/ailego/buffer/buffer_pool.cc | 384 +- src/core/utility/buffer_storage.cc | 38 +- src/include/zvec/ailego/buffer/buffer_pool.h | 40 +- .../zvec/ailego/buffer/concurrentqueue.h | 7693 +++++++++-------- 4 files changed, 4418 insertions(+), 3737 deletions(-) diff --git a/src/ailego/buffer/buffer_pool.cc b/src/ailego/buffer/buffer_pool.cc index 3ed461c1..81ed92bf 100644 --- a/src/ailego/buffer/buffer_pool.cc +++ b/src/ailego/buffer/buffer_pool.cc @@ -5,227 +5,233 @@ namespace zvec { namespace ailego { int LRUCache::init(size_t block_size) { - block_size_ = block_size; - for(size_t i = 0; i < CATCH_QUEUE_NUM; i++) { - queues_.push_back(ConcurrentQueue(block_size)); - } - return 0; + block_size_ = block_size; + for (size_t i = 0; i < CATCH_QUEUE_NUM; i++) { + queues_.push_back(ConcurrentQueue(block_size)); + } + return 0; } bool LRUCache::evict_single_block(BlockType &item) { - bool found = false; - for(size_t i = 0; i < CATCH_QUEUE_NUM; i++) { - found = queues_[i].try_dequeue(item); - if(found) { - break; - } - } - return found; -} - -bool LRUCache::add_single_block(const LPMap *lp_map, const BlockType &block, int block_type) { - bool ok = queues_[block_type].try_enqueue(block); - evict_queue_insertions_.fetch_add(1, std::memory_order_relaxed); - if(evict_queue_insertions_ % block_size_ == 0) { - this->clear_dead_node(lp_map); - } - return ok; + bool found = false; + for (size_t i = 0; i < CATCH_QUEUE_NUM; i++) { + found = queues_[i].try_dequeue(item); + if (found) { + break; + } + } + return found; +} + +bool LRUCache::add_single_block(const LPMap *lp_map, const BlockType &block, + int block_type) { + bool ok = queues_[block_type].try_enqueue(block); + evict_queue_insertions_.fetch_add(1, std::memory_order_relaxed); + if (evict_queue_insertions_ % block_size_ == 0) { + this->clear_dead_node(lp_map); + } + return ok; } void LRUCache::clear_dead_node(const LPMap *lp_map) { - for(int i = 0; i < 
CATCH_QUEUE_NUM; i++) { - int clear_size = block_size_ * 2; - if (queues_[i].size_approx() < clear_size * 4) { - continue; - } - int clear_count = 0; - ConcurrentQueue tmp(block_size_); - BlockType item; - while(queues_[i].try_dequeue(item) && (clear_count++ < clear_size)) { - if(!lp_map->isDeadBlock(item)) { - tmp.try_enqueue(item); - } - } - while(tmp.try_dequeue(item)) { - if(!lp_map->isDeadBlock(item)) { - queues_[i].try_enqueue(item); - } - } - } + for (int i = 0; i < CATCH_QUEUE_NUM; i++) { + int clear_size = block_size_ * 2; + if (queues_[i].size_approx() < clear_size * 4) { + continue; + } + int clear_count = 0; + ConcurrentQueue tmp(block_size_); + BlockType item; + while (queues_[i].try_dequeue(item) && (clear_count++ < clear_size)) { + if (!lp_map->isDeadBlock(item)) { + tmp.try_enqueue(item); + } + } + while (tmp.try_dequeue(item)) { + if (!lp_map->isDeadBlock(item)) { + queues_[i].try_enqueue(item); + } + } + } } void LPMap::init(size_t entry_num) { - if (entries_) { - delete[] entries_; - } - entry_num_ = entry_num; - entries_ = new Entry[entry_num_]; - for (size_t i = 0; i < entry_num_; i++) { - entries_[i].ref_count.store(std::numeric_limits::min()); - entries_[i].load_count.store(0); - entries_[i].buffer = nullptr; - } - cache_.init(entry_num); -} - -char* LPMap::acquire_block(block_id_t block_id) { - assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; - if (entry.ref_count.load(std::memory_order_relaxed) == 0) { - entry.load_count.fetch_add(1, std::memory_order_relaxed); - } - entry.ref_count.fetch_add(1, std::memory_order_relaxed); - if (entry.ref_count.load(std::memory_order_relaxed) < 0) { - return nullptr; - } - return entry.buffer; + if (entries_) { + delete[] entries_; + } + entry_num_ = entry_num; + entries_ = new Entry[entry_num_]; + for (size_t i = 0; i < entry_num_; i++) { + entries_[i].ref_count.store(std::numeric_limits::min()); + entries_[i].load_count.store(0); + entries_[i].buffer = nullptr; + } + 
cache_.init(entry_num); +} + +char *LPMap::acquire_block(block_id_t block_id) { + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + if (entry.ref_count.load(std::memory_order_relaxed) == 0) { + entry.load_count.fetch_add(1, std::memory_order_relaxed); + } + entry.ref_count.fetch_add(1, std::memory_order_relaxed); + if (entry.ref_count.load(std::memory_order_relaxed) < 0) { + return nullptr; + } + return entry.buffer; } void LPMap::release_block(block_id_t block_id) { - assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; - - if (entry.ref_count.fetch_sub(1, std::memory_order_release) == 1) { - std::atomic_thread_fence(std::memory_order_acquire); - LRUCache::BlockType block; - block.first = block_id; - block.second = entry.load_count.load(); - cache_.add_single_block(this, block, 0); - } -} - -char* LPMap::evict_block(block_id_t block_id) { - assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; - int expected = 0; - if (entry.ref_count.compare_exchange_strong( - expected, std::numeric_limits::min())) { - char *buffer = entry.buffer; - entry.buffer = nullptr; - return buffer; - } else { - return nullptr; - } -} - -char* LPMap::set_block_acquired(block_id_t block_id, char *buffer) { - assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; - if (entry.ref_count.load(std::memory_order_relaxed) >= 0) { - entry.ref_count.fetch_add(1, std::memory_order_relaxed); - return entry.buffer; - } - entry.buffer = buffer; - entry.ref_count.store(1, std::memory_order_relaxed); - entry.load_count.fetch_add(1, std::memory_order_relaxed); - return buffer; + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + + if (entry.ref_count.fetch_sub(1, std::memory_order_release) == 1) { + std::atomic_thread_fence(std::memory_order_acquire); + LRUCache::BlockType block; + block.first = block_id; + block.second = entry.load_count.load(); + cache_.add_single_block(this, block, 0); + } +} + +char 
*LPMap::evict_block(block_id_t block_id) { + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + int expected = 0; + if (entry.ref_count.compare_exchange_strong( + expected, std::numeric_limits::min())) { + char *buffer = entry.buffer; + entry.buffer = nullptr; + return buffer; + } else { + return nullptr; + } +} + +char *LPMap::set_block_acquired(block_id_t block_id, char *buffer) { + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + if (entry.ref_count.load(std::memory_order_relaxed) >= 0) { + entry.ref_count.fetch_add(1, std::memory_order_relaxed); + return entry.buffer; + } + entry.buffer = buffer; + entry.ref_count.store(1, std::memory_order_relaxed); + entry.load_count.fetch_add(1, std::memory_order_relaxed); + return buffer; } void LPMap::recycle(moodycamel::ConcurrentQueue &free_buffers) { - LRUCache::BlockType block; - do { - bool ok = cache_.evict_single_block(block); - if(!ok) { - return; - } - } while(isDeadBlock(block)); - char *buffer = evict_block(block.first); - if (buffer) { - free_buffers.try_enqueue(buffer); - } -} - -VecBufferPool::VecBufferPool(const std::string &filename, size_t pool_capacity, size_t block_size) - : pool_capacity_(pool_capacity) { - fd_ = open(filename.c_str(), O_RDONLY); - if (fd_ < 0) { - throw std::runtime_error("Failed to open file: " + filename); - } - struct stat st; - if (fstat(fd_, &st) < 0) { - throw std::runtime_error("Failed to stat file: " + filename); - } - file_size_ = st.st_size; - - size_t buffer_num = pool_capacity_ / block_size; - size_t block_num = file_size_ / block_size + 500; - lp_map_.init(block_num); - for (size_t i = 0; i < buffer_num; i++) { - char *buffer = (char *)aligned_alloc(64, block_size); - if (buffer != nullptr) { - bool ok = free_buffers_.try_enqueue(buffer); - } - } - LOG_DEBUG("Buffer pool num: %zu, entry num: %zu", buffer_num, lp_map_.entry_num()); + LRUCache::BlockType block; + do { + bool ok = cache_.evict_single_block(block); + if (!ok) { + return; 
+ } + } while (isDeadBlock(block)); + char *buffer = evict_block(block.first); + if (buffer) { + free_buffers.try_enqueue(buffer); + } +} + +VecBufferPool::VecBufferPool(const std::string &filename, size_t pool_capacity, + size_t block_size) + : pool_capacity_(pool_capacity) { + fd_ = open(filename.c_str(), O_RDONLY); + if (fd_ < 0) { + throw std::runtime_error("Failed to open file: " + filename); + } + struct stat st; + if (fstat(fd_, &st) < 0) { + throw std::runtime_error("Failed to stat file: " + filename); + } + file_size_ = st.st_size; + + size_t buffer_num = pool_capacity_ / block_size; + size_t block_num = file_size_ / block_size + 500; + lp_map_.init(block_num); + for (size_t i = 0; i < buffer_num; i++) { + char *buffer = (char *)aligned_alloc(64, block_size); + if (buffer != nullptr) { + bool ok = free_buffers_.try_enqueue(buffer); + } + } + LOG_DEBUG("Buffer pool num: %zu, entry num: %zu", buffer_num, + lp_map_.entry_num()); } VecBufferPoolHandle VecBufferPool::get_handle() { - return VecBufferPoolHandle(*this); -} - -char* VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset, size_t size, int retry) { - char *buffer = lp_map_.acquire_block(block_id); - if (buffer) { - return buffer; - } - { - bool found = free_buffers_.try_dequeue(buffer); - if (!found) { - for (int i = 0; i < retry; i++) { - lp_map_.recycle(free_buffers_); - found = free_buffers_.try_dequeue(buffer); - if (found) { - break; - } - } - } - if (!found) { - LOG_ERROR("Buffer pool failed to get free buffer"); - return nullptr; - } - } - - ssize_t read_bytes = pread(fd_, buffer, size, offset); - if (read_bytes != static_cast(size)) { - LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset); - return nullptr; - } - char *placed_buffer = nullptr; - { - std::lock_guard lock(mutex_); - placed_buffer = lp_map_.set_block_acquired(block_id, buffer); - } - if (placed_buffer != buffer) { - // another thread has set the block - free_buffers_.try_enqueue(buffer); - } - return 
placed_buffer; + return VecBufferPoolHandle(*this); +} + +char *VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset, + size_t size, int retry) { + char *buffer = lp_map_.acquire_block(block_id); + if (buffer) { + return buffer; + } + { + bool found = free_buffers_.try_dequeue(buffer); + if (!found) { + for (int i = 0; i < retry; i++) { + lp_map_.recycle(free_buffers_); + found = free_buffers_.try_dequeue(buffer); + if (found) { + break; + } + } + } + if (!found) { + LOG_ERROR("Buffer pool failed to get free buffer"); + return nullptr; + } + } + + ssize_t read_bytes = pread(fd_, buffer, size, offset); + if (read_bytes != static_cast(size)) { + LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset); + return nullptr; + } + char *placed_buffer = nullptr; + { + std::lock_guard lock(mutex_); + placed_buffer = lp_map_.set_block_acquired(block_id, buffer); + } + if (placed_buffer != buffer) { + // another thread has set the block + free_buffers_.try_enqueue(buffer); + } + return placed_buffer; } int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) { - ssize_t read_bytes = pread(fd_, buffer, length, offset); - if (read_bytes != static_cast(length)) { - LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset);LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset); - return -1; - } - return 0; + ssize_t read_bytes = pread(fd_, buffer, length, offset); + if (read_bytes != static_cast(length)) { + LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset); + LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset); + return -1; + } + return 0; } -char* VecBufferPoolHandle::get_block(size_t offset, size_t size, size_t block_id) { - char *buffer = pool.acquire_buffer(block_id, offset, size, 5); - return buffer; +char *VecBufferPoolHandle::get_block(size_t offset, size_t size, + size_t block_id) { + char *buffer = pool.acquire_buffer(block_id, offset, size, 5); + return buffer; } int 
VecBufferPoolHandle::get_meta(size_t offset, size_t length, char *buffer) { - return pool.get_meta(offset, length, buffer); + return pool.get_meta(offset, length, buffer); } void VecBufferPoolHandle::release_one(block_id_t block_id) { - pool.lp_map_.release_block(block_id); + pool.lp_map_.release_block(block_id); } void VecBufferPoolHandle::acquire_one(block_id_t block_id) { - pool.lp_map_.acquire_block(block_id); + pool.lp_map_.acquire_block(block_id); } } // namespace ailego diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 3765fd15..dcdb13d3 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -15,14 +15,13 @@ #include // #include #include +#include #include #include #include #include #include "utility_params.h" -#include - namespace zvec { namespace core { @@ -81,7 +80,9 @@ class BufferStorage : public IndexStorage { } len = meta->data_size - offset; } - memmove(buf, (const uint8_t *)(owner_->get_buffer(offset, len, segment_id_)) + offset, + memmove(buf, + (const uint8_t *)(owner_->get_buffer(offset, len, segment_id_)) + + offset, len); return len; } @@ -98,7 +99,8 @@ class BufferStorage : public IndexStorage { size_t buffer_offset = segment_header_start_offset_ + segment_header_->content_offset + segment_->meta()->data_index + offset; - *data = owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset; + *data = + owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset; return len; } @@ -113,8 +115,11 @@ class BufferStorage : public IndexStorage { size_t buffer_offset = segment_header_start_offset_ + segment_header_->content_offset + segment_->meta()->data_index + offset; - data.reset(owner_->buffer_pool_handle_.get(), segment_id_, owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset); - // data.reset(owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset); + data.reset( + owner_->buffer_pool_handle_.get(), segment_id_, + 
owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset); + // data.reset(owner_->get_buffer(buffer_offset, capacity_, segment_id_) + + // offset); if (data.data()) { return len; } else { @@ -174,18 +179,20 @@ class BufferStorage : public IndexStorage { int open(const std::string &path, bool /*create*/) override { LOG_INFO("open buffer storage 1"); file_name_ = path; - buffer_pool_ = std::make_shared(path, 20lu * 1024 * 1024 * 1024, 2490368 * 2); - buffer_pool_handle_ = - std::make_shared(buffer_pool_->get_handle()); + buffer_pool_ = std::make_shared( + path, 20lu * 1024 * 1024 * 1024, 2490368 * 2); + buffer_pool_handle_ = std::make_shared( + buffer_pool_->get_handle()); int ret = ParseToMapping(); - LOG_ERROR("segment count: %lu, max_segment_size: %lu", segments_.size(), max_segment_size_); - for(auto iter = segments_.begin(); iter != segments_.end(); iter++) { + LOG_ERROR("segment count: %lu, max_segment_size: %lu", segments_.size(), + max_segment_size_); + for (auto iter = segments_.begin(); iter != segments_.end(); iter++) { auto seg = this->get(iter->first, 0); MemoryBlock block; int len = seg->read(0, block, 1); LOG_ERROR("segment %s: %d", iter->first.c_str(), len); } - if(ret != 0) { + if (ret != 0) { return ret; } return 0; @@ -238,8 +245,8 @@ class BufferStorage : public IndexStorage { int ParseSegment(size_t offset) { segment_buffer_ = std::make_unique(footer_.segments_meta_size); get_meta(offset, footer_.segments_meta_size, segment_buffer_.get()); - if (ailego::Crc32c::Hash(segment_buffer_.get(), footer_.segments_meta_size, 0u) != - footer_.segments_meta_crc) { + if (ailego::Crc32c::Hash(segment_buffer_.get(), footer_.segments_meta_size, + 0u) != footer_.segments_meta_crc) { LOG_ERROR("Index segments meta checksum is invalid."); return IndexError_InvalidChecksum; } @@ -271,7 +278,8 @@ class BufferStorage : public IndexStorage { iter->segment_id_offset), IndexMapping::SegmentInfo{IndexMapping::Segment{iter}, current_header_start_offset_, 
&header_}); - max_segment_size_ = std::max(max_segment_size_, iter->data_size + iter->padding_size); + max_segment_size_ = + std::max(max_segment_size_, iter->data_size + iter->padding_size); if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count > footer_.segments_meta_size) { return IndexError_InvalidLength; diff --git a/src/include/zvec/ailego/buffer/buffer_pool.h b/src/include/zvec/ailego/buffer/buffer_pool.h index 34c69d51..f1a0149c 100644 --- a/src/include/zvec/ailego/buffer/buffer_pool.h +++ b/src/include/zvec/ailego/buffer/buffer_pool.h @@ -11,12 +11,12 @@ #include #include #include +#include #include #include #include #include #include -#include #include "concurrentqueue.h" namespace zvec { @@ -28,23 +28,24 @@ using version_t = size_t; class LPMap; class LRUCache { - public: - typedef std::pair BlockType; - typedef moodycamel::ConcurrentQueue ConcurrentQueue; + public: + typedef std::pair BlockType; + typedef moodycamel::ConcurrentQueue ConcurrentQueue; - int init(size_t block_size); + int init(size_t block_size); - bool evict_single_block(BlockType &item); + bool evict_single_block(BlockType &item); - bool add_single_block(const LPMap *lp_map, const BlockType &block, int block_type); + bool add_single_block(const LPMap *lp_map, const BlockType &block, + int block_type); - void clear_dead_node(const LPMap *lp_map); + void clear_dead_node(const LPMap *lp_map); - private: - constexpr static size_t CATCH_QUEUE_NUM = 3; - int block_size_; - std::vector queues_; - alignas(64) std::atomic evict_queue_insertions_{0}; + private: + constexpr static size_t CATCH_QUEUE_NUM = 3; + int block_size_; + std::vector queues_; + alignas(64) std::atomic evict_queue_insertions_{0}; }; class LPMap { @@ -95,15 +96,17 @@ class VecBufferPoolHandle; class VecBufferPool { public: typedef std::shared_ptr Pointer; - - VecBufferPool(const std::string &filename, size_t pool_capacity, size_t block_size); + + VecBufferPool(const std::string &filename, size_t pool_capacity, + size_t 
block_size); ~VecBufferPool() { close(fd_); } VecBufferPoolHandle get_handle(); - char *acquire_buffer(block_id_t block_id, size_t offset, size_t size, int retry = 0); + char *acquire_buffer(block_id_t block_id, size_t offset, size_t size, + int retry = 0); int get_meta(size_t offset, size_t length, char *buffer); @@ -127,11 +130,10 @@ class VecBufferPool { struct VecBufferPoolHandle { VecBufferPoolHandle(VecBufferPool &pool) : pool(pool), hit_num_(0) {}; VecBufferPoolHandle(VecBufferPoolHandle &&other) - : pool(other.pool), - hit_num_(other.hit_num_) { + : pool(other.pool), hit_num_(other.hit_num_) { other.hit_num_ = 0; } - + ~VecBufferPoolHandle() = default; typedef std::shared_ptr Pointer; diff --git a/src/include/zvec/ailego/buffer/concurrentqueue.h b/src/include/zvec/ailego/buffer/concurrentqueue.h index db4835b1..90edaf97 100644 --- a/src/include/zvec/ailego/buffer/concurrentqueue.h +++ b/src/include/zvec/ailego/buffer/concurrentqueue.h @@ -1,5 +1,5 @@ -// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. -// An overview, including benchmark results, is provided here: +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free +// queue. An overview, including benchmark results, is provided here: // http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ // The full design is also described in excruciating detail at: // http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue @@ -8,24 +8,26 @@ // Copyright (c) 2013-2020, Cameron Desrochers. // All rights reserved. 
// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: // -// - Redistributions of source code must retain the above copyright notice, this list of -// conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this list of -// conditions and the following disclaimer in the documentation and/or other materials -// provided with the distribution. +// - Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. // -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL -// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT -// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR -// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. // Also dual-licensed under the Boost Software License (see LICENSE.md) @@ -33,8 +35,8 @@ #if defined(__GNUC__) && !defined(__INTEL_COMPILER) // Disable -Wconversion warnings (spuriously triggered when Traits::size_t and -// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings -// upon assigning any computed values) +// Traits::index_t are set to < 32 bits, causing integer promotion, causing +// warnings upon assigning any computed values) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" @@ -44,10 +46,11 @@ #endif #if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) -// VS2019 with /W4 warns about constant conditional expressions but unless /std=c++17 or higher -// does not support `if constexpr`, so we have no choice but to simply disable the warning +// VS2019 with /W4 warns about constant conditional expressions but unless +// /std=c++17 or higher does not support `if constexpr`, so we have no choice +// but to simply disable the warning #pragma warning(push) -#pragma warning(disable: 4127) // conditional expression is constant +#pragma warning(disable : 4127) // conditional expression is constant #endif #if 
defined(__APPLE__) @@ -57,92 +60,128 @@ #ifdef MCDBGQ_USE_RELACY #include "relacy/relacy_std.hpp" #include "relacy_shims.h" -// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations. -// We'll override the default trait malloc ourselves without a macro. +// We only use malloc/free anyway, and the delete macro messes up `= delete` +// method declarations. We'll override the default trait malloc ourselves +// without a macro. #undef new #undef delete #undef malloc #undef free #else -#include // Requires C++11. Sorry VS2010. +#include // Requires C++11. Sorry VS2010. #include #endif -#include // for max_align_t +#include +#include +#include // for CHAR_BIT +#include // for max_align_t #include #include +#include +#include // used for thread exit synchronization +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading #include -#include #include -#include -#include // for CHAR_BIT -#include -#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading -#include // used for thread exit synchronization - -// Platform-specific definitions of a numeric thread ID type and an invalid value -namespace moodycamel { namespace details { - template struct thread_id_converter { - typedef thread_id_t thread_id_numeric_size_t; - typedef thread_id_t thread_id_hash_t; - static thread_id_hash_t prehash(thread_id_t const& x) { return x; } - }; -} } + +// Platform-specific definitions of a numeric thread ID type and an invalid +// value +namespace moodycamel { +namespace details { +template +struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const &x) { + return x; + } +}; +} // namespace details +} // namespace moodycamel #if defined(MCDBGQ_USE_RELACY) -namespace moodycamel { namespace details { - typedef std::uint32_t thread_id_t; - static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; - 
static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; - static inline thread_id_t thread_id() { return rl::thread_index(); } -} } +namespace moodycamel { +namespace details { +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; +static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; +static inline thread_id_t thread_id() { + return rl::thread_index(); +} +} // namespace details +} // namespace moodycamel #elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) -// No sense pulling in windows.h in a header, we'll manually declare the function -// we use and rely on backwards-compatibility for this not to break -extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); -namespace moodycamel { namespace details { - static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); - typedef std::uint32_t thread_id_t; - static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx - static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. 
- static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } -} } -#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || defined(MOODYCAMEL_NO_THREAD_LOCAL) -namespace moodycamel { namespace details { - static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); - - typedef std::thread::id thread_id_t; - static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID - - // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's - // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't - // be. - static inline thread_id_t thread_id() { return std::this_thread::get_id(); } - - template struct thread_id_size { }; - template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; - template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; - - template<> struct thread_id_converter { - typedef thread_id_size::numeric_t thread_id_numeric_size_t; +// No sense pulling in windows.h in a header, we'll manually declare the +// function we use and rely on backwards-compatibility for this not to break +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId( + void); +namespace moodycamel { +namespace details { +static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), + "Expected size of unsigned long to be 32 bits on Windows"); +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = + 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx +static const thread_id_t invalid_thread_id2 = + 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used + // in practice. Note that all Win32 thread IDs are presently + // multiples of 4. 
+static inline thread_id_t thread_id() { + return static_cast(::GetCurrentThreadId()); +} +} // namespace details +} // namespace moodycamel +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \ + (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || \ + defined(MOODYCAMEL_NO_THREAD_LOCAL) +namespace moodycamel { +namespace details { +static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, + "std::thread::id is expected to be either 4 or 8 bytes"); + +typedef std::thread::id thread_id_t; +static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + +// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have +// one; it's only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined +// anyway, which it won't be. +static inline thread_id_t thread_id() { + return std::this_thread::get_id(); +} + +template +struct thread_id_size {}; +template <> +struct thread_id_size<4> { + typedef std::uint32_t numeric_t; +}; +template <> +struct thread_id_size<8> { + typedef std::uint64_t numeric_t; +}; + +template <> +struct thread_id_converter { + typedef thread_id_size::numeric_t + thread_id_numeric_size_t; #ifndef __APPLE__ - typedef std::size_t thread_id_hash_t; + typedef std::size_t thread_id_hash_t; #else - typedef thread_id_numeric_size_t thread_id_hash_t; + typedef thread_id_numeric_size_t thread_id_hash_t; #endif - static thread_id_hash_t prehash(thread_id_t const& x) - { + static thread_id_hash_t prehash(thread_id_t const &x) { #ifndef __APPLE__ - return std::hash()(x); + return std::hash()(x); #else - return *reinterpret_cast(&x); + return *reinterpret_cast(&x); #endif - } - }; -} } + } +}; +} +} #else // Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 -// In order to get a numeric thread ID in a platform-independent way, we use a thread-local -// static variable's address as a thread identifier :-) +// In order to get a numeric thread ID in a 
platform-independent way, we use a +// thread-local static variable's address as a thread identifier :-) #if defined(__GNUC__) || defined(__INTEL_COMPILER) #define MOODYCAMEL_THREADLOCAL __thread #elif defined(_MSC_VER) @@ -151,17 +190,25 @@ namespace moodycamel { namespace details { // Assume C++11 compliant compiler #define MOODYCAMEL_THREADLOCAL thread_local #endif -namespace moodycamel { namespace details { - typedef std::uintptr_t thread_id_t; - static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr - static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. - inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } -} } +namespace moodycamel { +namespace details { +typedef std::uintptr_t thread_id_t; +static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr +static const thread_id_t invalid_thread_id2 = + 1; // Member accesses off a null pointer are also generally invalid. Plus + // it's not aligned. 
+inline thread_id_t thread_id() { + static MOODYCAMEL_THREADLOCAL int x; + return reinterpret_cast(&x); +} +} +} #endif // Constexpr if #ifndef MOODYCAMEL_CONSTEXPR_IF -#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || __cplusplus > 201402L +#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || \ + __cplusplus > 201402L #define MOODYCAMEL_CONSTEXPR_IF if constexpr #define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]] #else @@ -172,18 +219,20 @@ namespace moodycamel { namespace details { // Exceptions #ifndef MOODYCAMEL_EXCEPTIONS_ENABLED -#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || \ + (defined(__GNUC__) && defined(__EXCEPTIONS)) || \ + (!defined(_MSC_VER) && !defined(__GNUC__)) #define MOODYCAMEL_EXCEPTIONS_ENABLED #endif #endif #ifdef MOODYCAMEL_EXCEPTIONS_ENABLED #define MOODYCAMEL_TRY try -#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__) #define MOODYCAMEL_RETHROW throw -#define MOODYCAMEL_THROW(expr) throw (expr) +#define MOODYCAMEL_THROW(expr) throw(expr) #else -#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true) -#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF (false) +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true) +#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF(false) #define MOODYCAMEL_RETHROW #define MOODYCAMEL_THROW(expr) #endif @@ -194,15 +243,40 @@ namespace moodycamel { namespace details { #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 -// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( -// We have to assume *all* non-trivial constructors may throw on VS2012! 
+// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when +// it shouldn't :-( We have to assume *all* non-trivial constructors may throw +// on VS2012! #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && \ + std::is_move_constructible::value \ + ? std::is_trivially_move_constructible::value \ + : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && \ + std::is_move_assignable::value \ + ? std::is_trivially_move_assignable::value || \ + std::is_nothrow_move_assignable::value \ + : std::is_trivially_copy_assignable::value || \ + std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? 
std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && \ + std::is_move_constructible::value \ + ? std::is_trivially_move_constructible::value || \ + std::is_nothrow_move_constructible::value \ + : std::is_trivially_copy_constructible::value || \ + std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && \ + std::is_move_assignable::value \ + ? std::is_trivially_move_assignable::value || \ + std::is_nothrow_move_assignable::value \ + : std::is_trivially_copy_assignable::value || \ + std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #else #define MOODYCAMEL_NOEXCEPT noexcept #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) @@ -214,18 +288,31 @@ namespace moodycamel { namespace details { #ifdef MCDBGQ_USE_RELACY #define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #else -// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 -// g++ <=4.7 doesn't support thread_local either. 
-// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work -#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__) -// Assume `thread_local` is fully supported in all other C++11 compilers/platforms -#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; years ago several users report having problems with it on +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a +// crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 g++ <=4.7 doesn't +// support thread_local either. Finally, iOS/ARM doesn't have support for it +// either, and g++/ARM allows it to compile but it's unconfirmed to actually +// work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \ + (!defined(__MINGW32__) && !defined(__MINGW64__) || \ + !defined(__WINPTHREADS_VERSION)) && \ + (!defined(__GNUC__) || __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \ + (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && \ + !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__) +// Assume `thread_local` is fully supported in all other C++11 +// compilers/platforms +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; + // years ago several users + // report having problems with + // it on #endif #endif #endif -// VS2012 doesn't support deleted functions. -// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. 
A link +// error will be generated if the function is called. #ifndef MOODYCAMEL_DELETE_FUNCTION #if defined(_MSC_VER) && _MSC_VER < 1800 #define MOODYCAMEL_DELETE_FUNCTION @@ -234,54 +321,101 @@ namespace moodycamel { namespace details { #endif #endif -namespace moodycamel { namespace details { +namespace moodycamel { +namespace details { #ifndef MOODYCAMEL_ALIGNAS -// VS2013 doesn't support alignas or alignof, and align() requires a constant literal +// VS2013 doesn't support alignas or alignof, and align() requires a constant +// literal #if defined(_MSC_VER) && _MSC_VER <= 1800 #define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) #define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) -#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned::value, T>::type - template struct Vs2013Aligned { }; // default, unsupported alignment - template struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; }; - template struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; }; - template struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; }; - template struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; }; - template struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; }; - template struct Vs2013Aligned<32, T> { typedef __declspec(align(32)) T type; }; - template struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; }; - template struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; }; - template struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; }; +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ + typename details::Vs2013Aligned::value, T>::type +template +struct Vs2013Aligned {}; // default, unsupported alignment +template +struct Vs2013Aligned<1, T> { + typedef __declspec(align(1)) T type; +}; +template +struct Vs2013Aligned<2, T> { + typedef __declspec(align(2)) T type; +}; +template +struct Vs2013Aligned<4, T> { + typedef 
__declspec(align(4)) T type; +}; +template +struct Vs2013Aligned<8, T> { + typedef __declspec(align(8)) T type; +}; +template +struct Vs2013Aligned<16, T> { + typedef __declspec(align(16)) T type; +}; +template +struct Vs2013Aligned<32, T> { + typedef __declspec(align(32)) T type; +}; +template +struct Vs2013Aligned<64, T> { + typedef __declspec(align(64)) T type; +}; +template +struct Vs2013Aligned<128, T> { + typedef __declspec(align(128)) T type; +}; +template +struct Vs2013Aligned<256, T> { + typedef __declspec(align(256)) T type; +}; #else - template struct identity { typedef T type; }; +template +struct identity { + typedef T type; +}; #define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) #define MOODYCAMEL_ALIGNOF(obj) alignof(obj) -#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity::type +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ + alignas(alignof(obj)) typename details::identity::type #endif #endif -} } +} // namespace details +} // namespace moodycamel -// TSAN can false report races in lock-free code. To enable TSAN to be used from projects that use this one, -// we can apply per-function compile-time suppression. -// See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer +// TSAN can false report races in lock-free code. To enable TSAN to be used +// from projects that use this one, we can apply per-function compile-time +// suppression. 
See +// https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer #define MOODYCAMEL_NO_TSAN #if defined(__has_feature) - #if __has_feature(thread_sanitizer) - #undef MOODYCAMEL_NO_TSAN - #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) - #endif // TSAN -#endif // TSAN +#if __has_feature(thread_sanitizer) +#undef MOODYCAMEL_NO_TSAN +#define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) +#endif // TSAN +#endif // TSAN // Compiler-specific likely/unlikely hints -namespace moodycamel { namespace details { +namespace moodycamel { +namespace details { #if defined(__GNUC__) - static inline bool (likely)(bool x) { return __builtin_expect((x), true); } - static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } +static inline bool(likely)(bool x) { + return __builtin_expect((x), true); +} +static inline bool(unlikely)(bool x) { + return __builtin_expect((x), false); +} #else - static inline bool (likely)(bool x) { return x; } - static inline bool (unlikely)(bool x) { return x; } +static inline bool(likely)(bool x) { + return x; +} +static inline bool(unlikely)(bool x) { + return x; +} #endif -} } +} // namespace details +} // namespace moodycamel #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG #include "internal/concurrentqueue_internal_debug.h" @@ -289,28 +423,34 @@ namespace moodycamel { namespace details { namespace moodycamel { namespace details { - template - struct const_numeric_max { - static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); - static const T value = std::numeric_limits::is_signed - ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) - : static_cast(-1); - }; +template +struct const_numeric_max { + static_assert(std::is_integral::value, + "const_numeric_max can only be used with integers"); + static const T value = + std::numeric_limits::is_signed + ? 
(static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - + static_cast(1) + : static_cast(-1); +}; #if defined(__GLIBCXX__) - typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +typedef ::max_align_t + std_max_align_t; // libstdc++ forgot to add it to std:: for a while #else - typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can + // *only* be accessed via std:: #endif - // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting - // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. - typedef union { - std_max_align_t x; - long long y; - void* z; - } max_align_t; -} +// Some platforms have incorrectly set max_align_t to a type with <8 bytes +// alignment even while supporting 8-byte aligned scalar values (*cough* 32-bit +// iOS). Work around this with our own union. See issue #64. +typedef union { + std_max_align_t x; + long long y; + void *z; +} max_align_t; +} // namespace details // Default traits for the ConcurrentQueue. To change some of the // traits without re-implementing all of them, inherit from this @@ -318,95 +458,117 @@ namespace details { // since the traits are used as a template type parameter, the // shadowed declarations will be used where defined, and the defaults // otherwise. -struct ConcurrentQueueDefaultTraits -{ - // General-purpose size type. std::size_t is strongly recommended. - typedef std::size_t size_t; - - // The type used for the enqueue and dequeue indices. Must be at least as - // large as size_t. 
Should be significantly larger than the number of elements - // you expect to hold at once, especially if you have a high turnover rate; - // for example, on 32-bit x86, if you expect to have over a hundred million - // elements or pump several million elements through your queue in a very - // short space of time, using a 32-bit type *may* trigger a race condition. - // A 64-bit int type is recommended in that case, and in practice will - // prevent a race condition no matter the usage of the queue. Note that - // whether the queue is lock-free with a 64-int type depends on the whether - // std::atomic is lock-free, which is platform-specific. - typedef std::size_t index_t; - - // Internally, all elements are enqueued and dequeued from multi-element - // blocks; this is the smallest controllable unit. If you expect few elements - // but many producers, a smaller block size should be favoured. For few producers - // and/or many elements, a larger block size is preferred. A sane default - // is provided. Must be a power of 2. - static const size_t BLOCK_SIZE = 32; - - // For explicit producers (i.e. when using a producer token), the block is - // checked for being empty by iterating through a list of flags, one per element. - // For large block sizes, this is too inefficient, and switching to an atomic - // counter-based approach is faster. The switch is made for block sizes strictly - // larger than this threshold. - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; - - // How many full blocks can be expected for a single explicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; - - // How many full blocks can be expected for a single implicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. 
- static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; - - // The initial size of the hash table mapping thread IDs to implicit producers. - // Note that the hash is resized every time it becomes half full. - // Must be a power of two, and either 0 or at least 1. If 0, implicit production - // (using the enqueue methods without an explicit producer token) is disabled. - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; - - // Controls the number of items that an explicit consumer (i.e. one with a token) - // must consume before it causes all consumers to rotate and move on to the next - // internal queue. - static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; - - // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. - // Enqueue operations that would cause this limit to be surpassed will fail. Note - // that this limit is enforced at the block level (for performance reasons), i.e. - // it's rounded up to the nearest block size. - static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; - - // The number of times to spin before sleeping when waiting on a semaphore. - // Recommended values are on the order of 1000-10000 unless the number of - // consumer threads exceeds the number of idle cores (in which case try 0-100). - // Only affects instances of the BlockingConcurrentQueue. - static const int MAX_SEMA_SPINS = 10000; - - // Whether to recycle dynamically-allocated blocks into an internal free list or - // not. If false, only pre-allocated blocks (controlled by the constructor - // arguments) will be recycled, and all others will be `free`d back to the heap. - // Note that blocks consumed by explicit producers are only freed on destruction - // of the queue (not following destruction of the token) regardless of this trait. - static const bool RECYCLE_ALLOCATED_BLOCKS = false; - - +struct ConcurrentQueueDefaultTraits { + // General-purpose size type. 
std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few + // producers and/or many elements, a larger block size is preferred. A sane + // default is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per + // element. For large block sizes, this is too inefficient, and switching to + // an atomic counter-based approach is faster. The switch is made for block + // sizes strictly larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This + // should reflect that number's maximum for optimal performance. Must be a + // power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? 
This + // should reflect that number's maximum for optimal performance. Must be a + // power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit + // producers. Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit + // production (using the enqueue methods without an explicit producer token) + // is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a + // token) must consume before it causes all consumers to rotate and move on to + // the next internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = + 256; + + // The maximum number of elements (inclusive) that can be enqueued to a + // sub-queue. Enqueue operations that would cause this limit to be surpassed + // will fail. Note that this limit is enforced at the block level (for + // performance reasons), i.e. it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = + details::const_numeric_max::value; + + // The number of times to spin before sleeping when waiting on a semaphore. + // Recommended values are on the order of 1000-10000 unless the number of + // consumer threads exceeds the number of idle cores (in which case try + // 0-100). Only affects instances of the BlockingConcurrentQueue. + static const int MAX_SEMA_SPINS = 10000; + + // Whether to recycle dynamically-allocated blocks into an internal free list + // or not. If false, only pre-allocated blocks (controlled by the constructor + // arguments) will be recycled, and all others will be `free`d back to the + // heap. Note that blocks consumed by explicit producers are only freed on + // destruction of the queue (not following destruction of the token) + // regardless of this trait. 
+ static const bool RECYCLE_ALLOCATED_BLOCKS = false; + + #ifndef MCDBGQ_USE_RELACY - // Memory allocation can be customized if needed. - // malloc should return nullptr on failure, and handle alignment like std::malloc. + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like + // std::malloc. #if defined(malloc) || defined(free) - // Gah, this is 2015, stop defining macros that break standard code already! - // Work around malloc/free being special macros: - static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } - static inline void WORKAROUND_free(void* ptr) { return free(ptr); } - static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } - static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } + // Gah, this is 2015, stop defining macros that break standard code already! + // Work around malloc/free being special macros: + static inline void *WORKAROUND_malloc(size_t size) { + return malloc(size); + } + static inline void WORKAROUND_free(void *ptr) { + return free(ptr); + } + static inline void *(malloc)(size_t size) { + return WORKAROUND_malloc(size); + } + static inline void(free)(void *ptr) { + return WORKAROUND_free(ptr); + } #else - static inline void* malloc(size_t size) { return std::malloc(size); } - static inline void free(void* ptr) { return std::free(ptr); } + static inline void *malloc(size_t size) { + return std::malloc(size); + } + static inline void free(void *ptr) { + return std::free(ptr); + } #endif #else - // Debug versions when running under the Relacy race detector (ignore - // these in user code) - static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } - static inline void free(void* ptr) { return rl::rl_free(ptr, $); } + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void *malloc(size_t size) { + return rl::rl_malloc(size, $); + } + 
static inline void free(void *ptr) { + return rl::rl_free(ptr, $); + } #endif }; @@ -421,3322 +583,3825 @@ struct ConcurrentQueueDefaultTraits struct ProducerToken; struct ConsumerToken; -template class ConcurrentQueue; -template class BlockingConcurrentQueue; +template +class ConcurrentQueue; +template +class BlockingConcurrentQueue; class ConcurrentQueueTests; -namespace details -{ - struct ConcurrentQueueProducerTypelessBase - { - ConcurrentQueueProducerTypelessBase* next; - std::atomic inactive; - ProducerToken* token; - - ConcurrentQueueProducerTypelessBase() - : next(nullptr), inactive(false), token(nullptr) - { - } - }; - - template struct _hash_32_or_64 { - static inline std::uint32_t hash(std::uint32_t h) - { - // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp - // Since the thread ID is already unique, all we really want to do is propagate that - // uniqueness evenly across all the bits, so that we can use a subset of the bits while - // reducing collisions significantly - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - return h ^ (h >> 16); - } - }; - template<> struct _hash_32_or_64<1> { - static inline std::uint64_t hash(std::uint64_t h) - { - h ^= h >> 33; - h *= 0xff51afd7ed558ccd; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53; - return h ^ (h >> 33); - } - }; - template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; - - static inline size_t hash_thread_id(thread_id_t id) - { - static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); - return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( - thread_id_converter::prehash(id))); - } - - template - static inline bool circular_less_than(T a, T b) - { - static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); - return static_cast(a - b) > static_cast(static_cast(1) << 
(static_cast(sizeof(T) * CHAR_BIT - 1))); - // Note: extra parens around rhs of operator<< is MSVC bug: https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 - // silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no effect when done here. - } - - template - static inline char* align_for(char* ptr) - { - const std::size_t alignment = std::alignment_of::value; - return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; - } - - template - static inline T ceil_to_pow_2(T x) - { - static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); - - // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - for (std::size_t i = 1; i < sizeof(T); i <<= 1) { - x |= x >> (i << 3); - } - ++x; - return x; - } - - template - static inline void swap_relaxed(std::atomic& left, std::atomic& right) - { - T temp = left.load(std::memory_order_relaxed); - left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); - right.store(temp, std::memory_order_relaxed); - } - - template - static inline T const& nomove(T const& x) - { - return x; - } - - template - struct nomove_if - { - template - static inline T const& eval(T const& x) - { - return x; - } - }; - - template<> - struct nomove_if - { - template - static inline auto eval(U&& x) - -> decltype(std::forward(x)) - { - return std::forward(x); - } - }; - - template - static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) - { - return *it; - } - -#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) - template struct is_trivially_destructible : std::is_trivially_destructible { }; +namespace details { +struct ConcurrentQueueProducerTypelessBase { + ConcurrentQueueProducerTypelessBase *next; 
+ std::atomic inactive; + ProducerToken *token; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) {} +}; + +template +struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) { + // MurmurHash3 finalizer -- see + // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is + // propagate that uniqueness evenly across all the bits, so that we can use + // a subset of the bits while reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } +}; +template <> +struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } +}; +template +struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> {}; + +static inline size_t hash_thread_id(thread_id_t id) { + static_assert( + sizeof(thread_id_t) <= 8, + "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast( + hash_32_or_64::thread_id_hash_t)>:: + hash(thread_id_converter::prehash(id))); +} + +template +static inline bool circular_less_than(T a, T b) { + static_assert( + std::is_integral::value && !std::numeric_limits::is_signed, + "circular_less_than is intended to be used only with unsigned integer " + "types"); + return static_cast(a - b) > + static_cast(static_cast(1) + << (static_cast(sizeof(T) * CHAR_BIT - 1))); + // Note: extra parens around rhs of operator<< is MSVC bug: + // https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 + // silencing the bug requires #pragma warning(disable: 4554) around the + // calling code and has no effect when done here. 
+} + +template +static inline char *align_for(char *ptr) { + const std::size_t alignment = std::alignment_of::value; + return ptr + + (alignment - (reinterpret_cast(ptr) % alignment)) % + alignment; +} + +template +static inline T ceil_to_pow_2(T x) { + static_assert( + std::is_integral::value && !std::numeric_limits::is_signed, + "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; +} + +template +static inline void swap_relaxed(std::atomic &left, std::atomic &right) { + T temp = left.load(std::memory_order_relaxed); + left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); + right.store(temp, std::memory_order_relaxed); +} + +template +static inline T const &nomove(T const &x) { + return x; +} + +template +struct nomove_if { + template + static inline T const &eval(T const &x) { + return x; + } +}; + +template <> +struct nomove_if { + template + static inline auto eval(U &&x) -> decltype(std::forward(x)) { + return std::forward(x); + } +}; + +template +static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT -> decltype(*it) { + return *it; +} + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) +template +struct is_trivially_destructible : std::is_trivially_destructible {}; #else - template struct is_trivially_destructible : std::has_trivial_destructor { }; +template +struct is_trivially_destructible : std::has_trivial_destructor {}; #endif - + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #ifdef MCDBGQ_USE_RELACY - typedef RelacyThreadExitListener ThreadExitListener; - typedef RelacyThreadExitNotifier ThreadExitNotifier; +typedef RelacyThreadExitListener ThreadExitListener; +typedef RelacyThreadExitNotifier 
ThreadExitNotifier; #else - class ThreadExitNotifier; - - struct ThreadExitListener - { - typedef void (*callback_t)(void*); - callback_t callback; - void* userData; - - ThreadExitListener* next; // reserved for use by the ThreadExitNotifier - ThreadExitNotifier* chain; // reserved for use by the ThreadExitNotifier - }; - - class ThreadExitNotifier - { - public: - static void subscribe(ThreadExitListener* listener) - { - auto& tlsInst = instance(); - std::lock_guard guard(mutex()); - listener->next = tlsInst.tail; - listener->chain = &tlsInst; - tlsInst.tail = listener; - } - - static void unsubscribe(ThreadExitListener* listener) - { - std::lock_guard guard(mutex()); - if (!listener->chain) { - return; // race with ~ThreadExitNotifier - } - auto& tlsInst = *listener->chain; - listener->chain = nullptr; - ThreadExitListener** prev = &tlsInst.tail; - for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { - if (ptr == listener) { - *prev = ptr->next; - break; - } - prev = &ptr->next; - } - } - - private: - ThreadExitNotifier() : tail(nullptr) { } - ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - - ~ThreadExitNotifier() - { - // This thread is about to exit, let everyone know! - assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); - std::lock_guard guard(mutex()); - for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { - ptr->chain = nullptr; - ptr->callback(ptr->userData); - } - } - - // Thread-local - static inline ThreadExitNotifier& instance() - { - static thread_local ThreadExitNotifier notifier; - return notifier; - } - - static inline std::mutex& mutex() - { - // Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called - static std::mutex mutex; - return mutex; - } - - private: - ThreadExitListener* tail; - }; -#endif -#endif - - template struct static_is_lock_free_num { enum { value = 0 }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; - template struct static_is_lock_free : static_is_lock_free_num::type> { }; - template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; - template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; -} +class ThreadExitNotifier; + +struct ThreadExitListener { + typedef void (*callback_t)(void *); + callback_t callback; + void *userData; + + ThreadExitListener *next; // reserved for use by the ThreadExitNotifier + ThreadExitNotifier *chain; // reserved for use by the ThreadExitNotifier +}; +class ThreadExitNotifier { + public: + static void subscribe(ThreadExitListener *listener) { + auto &tlsInst = instance(); + std::lock_guard guard(mutex()); + listener->next = tlsInst.tail; + listener->chain = &tlsInst; + tlsInst.tail = listener; + } -struct ProducerToken -{ - 
template - explicit ProducerToken(ConcurrentQueue& queue); - - template - explicit ProducerToken(BlockingConcurrentQueue& queue); - - ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT - : producer(other.producer) - { - other.producer = nullptr; - if (producer != nullptr) { - producer->token = this; - } - } - - inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(producer, other.producer); - if (producer != nullptr) { - producer->token = this; - } - if (other.producer != nullptr) { - other.producer->token = &other; - } - } - - // A token is always valid unless: - // 1) Memory allocation failed during construction - // 2) It was moved via the move constructor - // (Note: assignment does a swap, leaving both potentially valid) - // 3) The associated queue was destroyed - // Note that if valid() returns true, that only indicates - // that the token is valid for use with a specific queue, - // but not which one; that's up to the user to track. 
- inline bool valid() const { return producer != nullptr; } - - ~ProducerToken() - { - if (producer != nullptr) { - producer->token = nullptr; - producer->inactive.store(true, std::memory_order_release); - } - } - - // Disable copying and assignment - ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; - -private: - template friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - -protected: - details::ConcurrentQueueProducerTypelessBase* producer; + static void unsubscribe(ThreadExitListener *listener) { + std::lock_guard guard(mutex()); + if (!listener->chain) { + return; // race with ~ThreadExitNotifier + } + auto &tlsInst = *listener->chain; + listener->chain = nullptr; + ThreadExitListener **prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } + + private: + ThreadExitNotifier() : tail(nullptr) {} + ThreadExitNotifier(ThreadExitNotifier const &) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier &operator=(ThreadExitNotifier const &) + MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() { + // This thread is about to exit, let everyone know! + assert(this == &instance() && + "If this assert fails, you likely have a buggy compiler! 
Change the " + "preprocessor conditions such that " + "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + std::lock_guard guard(mutex()); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->chain = nullptr; + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier &instance() { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + static inline std::mutex &mutex() { + // Must be static because the ThreadExitNotifier could be destroyed while + // unsubscribe is called + static std::mutex mutex; + return mutex; + } + + private: + ThreadExitListener *tail; +}; +#endif +#endif + +template +struct static_is_lock_free_num { + enum { value = 0 }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_CHAR_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_SHORT_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_INT_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_LONG_LOCK_FREE }; +}; +template <> +struct static_is_lock_free_num { + enum { value = ATOMIC_LLONG_LOCK_FREE }; +}; +template +struct static_is_lock_free + : static_is_lock_free_num::type> {}; +template <> +struct static_is_lock_free { + enum { value = ATOMIC_BOOL_LOCK_FREE }; +}; +template +struct static_is_lock_free { + enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +} // namespace details + + +struct ProducerToken { + template + explicit ProducerToken(ConcurrentQueue &queue); + + template + explicit ProducerToken(BlockingConcurrentQueue &queue); + + ProducerToken(ProducerToken &&other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken &operator=(ProducerToken &&other) MOODYCAMEL_NOEXCEPT { + swap(other); + return *this; + } + + void swap(ProducerToken &other) 
MOODYCAMEL_NOEXCEPT { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. + inline bool valid() const { + return producer != nullptr; + } + + ~ProducerToken() { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken &operator=(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; + + private: + template + friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + + protected: + details::ConcurrentQueueProducerTypelessBase *producer; +}; + + +struct ConsumerToken { + template + explicit ConsumerToken(ConcurrentQueue &q); + + template + explicit ConsumerToken(BlockingConcurrentQueue &q); + + ConsumerToken(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), + lastKnownGlobalOffset(other.lastKnownGlobalOffset), + itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), + currentProducer(other.currentProducer), + desiredProducer(other.desiredProducer) {} + inline ConsumerToken &operator=(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT { + swap(other); + return *this; + } -struct ConsumerToken -{ - template - explicit ConsumerToken(ConcurrentQueue& q); - - template - explicit ConsumerToken(BlockingConcurrentQueue& q); - - ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT - : 
initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) - { - } - - inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(initialOffset, other.initialOffset); - std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); - std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); - std::swap(currentProducer, other.currentProducer); - std::swap(desiredProducer, other.desiredProducer); - } - - // Disable copying and assignment - ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; - -private: - template friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - -private: // but shared with ConcurrentQueue - std::uint32_t initialOffset; - std::uint32_t lastKnownGlobalOffset; - std::uint32_t itemsConsumedFromCurrent; - details::ConcurrentQueueProducerTypelessBase* currentProducer; - details::ConcurrentQueueProducerTypelessBase* desiredProducer; + void swap(ConsumerToken &other) MOODYCAMEL_NOEXCEPT { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken &operator=(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; + + private: + template + friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + + private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t 
lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase *currentProducer; + details::ConcurrentQueueProducerTypelessBase *desiredProducer; }; // Need to forward-declare this swap because it's in a namespace. -// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces -template -inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; - - -template -class ConcurrentQueue -{ -public: - typedef ::moodycamel::ProducerToken producer_token_t; - typedef ::moodycamel::ConsumerToken consumer_token_t; - - typedef typename Traits::index_t index_t; - typedef typename Traits::size_t size_t; - - static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); - static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +// See +// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP &a, + typename ConcurrentQueue::ImplicitProducerKVP &b) + MOODYCAMEL_NOEXCEPT; + + +template +class ConcurrentQueue { + public: + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + 
static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = + static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = + static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = + static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = + static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = + static_cast( + Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); #ifdef _MSC_VER #pragma warning(push) -#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) -#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#pragma warning(disable : 4307) // + integral constant overflow (that's what + // the ternary expression is for!) +#pragma warning(disable : 4309) // static_cast: Truncation of constant value #endif - static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); + static const size_t MAX_SUBQUEUE_SIZE = + (details::const_numeric_max::value - + static_cast(Traits::MAX_SUBQUEUE_SIZE) < + BLOCK_SIZE) + ? 
details::const_numeric_max::value + : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + + (BLOCK_SIZE - 1)) / + BLOCK_SIZE * BLOCK_SIZE); #ifdef _MSC_VER #pragma warning(pop) #endif - static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); - static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); - static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); - static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); - static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); - static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); - static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); - static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); - static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); - -public: - // Creates a queue with at least `capacity` element slots; note that the - // actual number of elements that can be inserted without additional memory - // allocation depends on the number of producers and the block size (e.g. 
if - // the block size is equal to `capacity`, only a single block will be allocated - // up-front, which means only a single producer will be able to enqueue elements - // without an extra allocation -- blocks aren't shared between producers). - // This method is not thread safe -- it is up to the user to ensure that the - // queue is fully constructed before it starts being used by other threads (this - // includes making the memory effects of construction visible, possibly with a - // memory barrier). - explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) - : producerListTail(nullptr), - producerCount(0), - initialBlockPoolIndex(0), - nextExplicitConsumerId(0), - globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - // Track all the producers using a fully-resolved typed list for - // each kind; this makes it possible to debug them starting from - // the root queue object (otherwise wacky casts are needed that - // don't compile in the debugger's expression evaluator). - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Computes the correct amount of pre-allocated blocks for you based - // on the minimum number of elements you want available at any given - // time, and the maximum concurrent number of each type of producer. 
- ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) - : producerListTail(nullptr), - producerCount(0), - initialBlockPoolIndex(0), - nextExplicitConsumerId(0), - globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); - populate_initial_block_list(blocks); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Note: The queue should not be accessed concurrently while it's - // being deleted. It's up to the user to synchronize this. - // This method is not thread safe. - ~ConcurrentQueue() - { - // Destroy producers - auto ptr = producerListTail.load(std::memory_order_relaxed); - while (ptr != nullptr) { - auto next = ptr->next_prod(); - if (ptr->token != nullptr) { - ptr->token->producer = nullptr; - } - destroy(ptr); - ptr = next; - } - - // Destroy implicit producer hash tables - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { - auto hash = implicitProducerHash.load(std::memory_order_relaxed); - while (hash != nullptr) { - auto prev = hash->prev; - if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically - for (size_t i = 0; i != hash->capacity; ++i) { - hash->entries[i].~ImplicitProducerKVP(); - } - hash->~ImplicitProducerHash(); - (Traits::free)(hash); - } - hash = prev; - } - } - - // Destroy global free list - auto block = freeList.head_unsafe(); - while (block != nullptr) { - auto next = block->freeListNext.load(std::memory_order_relaxed); - if (block->dynamicallyAllocated) { - destroy(block); - } - block = next; - } - - // Destroy initial free list - 
destroy_array(initialBlockPool, initialBlockPoolSize); - } - - // Disable copying and copy assignment - ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - - // Moving is supported, but note that it is *not* a thread-safe operation. - // Nobody can use the queue while it's being moved, and the memory effects - // of that move must be propagated to other threads before they can use it. - // Note: When a queue is moved, its tokens are still valid but can only be - // used with the destination queue (i.e. semantically they are moved along - // with the queue itself). - ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT - : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), - producerCount(other.producerCount.load(std::memory_order_relaxed)), - initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), - initialBlockPool(other.initialBlockPool), - initialBlockPoolSize(other.initialBlockPoolSize), - freeList(std::move(other.freeList)), - nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), - globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) - { - // Move the other one into this, and leave the other one as an empty queue - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - swap_implicit_producer_hashes(other); - - other.producerListTail.store(nullptr, std::memory_order_relaxed); - other.producerCount.store(0, std::memory_order_relaxed); - other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); - other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.explicitProducers.store(nullptr, 
std::memory_order_relaxed); - implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - - other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); - other.initialBlockPoolSize = 0; - other.initialBlockPool = nullptr; - - reown_producers(); - } - - inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT - { - return swap_internal(other); - } - - // Swaps this queue's state with the other's. Not thread-safe. - // Swapping two queues does not invalidate their tokens, however - // the tokens that were created for one queue must be used with - // only the swapped queue (i.e. the tokens are tied to the - // queue's movable state, not the object itself). - inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT - { - swap_internal(other); - } - -private: - ConcurrentQueue& swap_internal(ConcurrentQueue& other) - { - if (this == &other) { - return *this; - } - - details::swap_relaxed(producerListTail, other.producerListTail); - details::swap_relaxed(producerCount, other.producerCount); - details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); - std::swap(initialBlockPool, other.initialBlockPool); - std::swap(initialBlockPoolSize, other.initialBlockPoolSize); - freeList.swap(other.freeList); - details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); - details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); - - swap_implicit_producer_hashes(other); - - reown_producers(); - other.reown_producers(); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - details::swap_relaxed(explicitProducers, other.explicitProducers); - details::swap_relaxed(implicitProducers, other.implicitProducers); -#endif - - return *this; - } - -public: - // Enqueues a single item (by copying it). - // Allocates memory if required. 
Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T const& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Allocates memory if required. Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T&& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(producer_token_t const& token, T const& item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Allocates memory if required. Only fails if memory allocation fails (or - // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved instead of copied. 
- // Thread-safe. - template - bool enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails - // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - // Enqueues a single item (by copying it). - // Does not allocate memory. Fails if not enough room to enqueue (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0). - // Thread-safe. - inline bool try_enqueue(T const& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Thread-safe. - inline bool try_enqueue(T&& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T const& item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Does not allocate memory. 
Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool try_enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - bool try_dequeue(U& item) - { - // Instead of simply trying each producer in turn (which could cause needless contention on the first - // producer), we score them heuristically. 
- size_t nonEmptyCount = 0; - ProducerBase* best = nullptr; - size_t bestSize = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { - auto size = ptr->size_approx(); - if (size > 0) { - if (size > bestSize) { - bestSize = size; - best = ptr; - } - ++nonEmptyCount; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (nonEmptyCount > 0) { - if ((details::likely)(best->dequeue(item))) { - return true; - } - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr != best && ptr->dequeue(item)) { - return true; - } - } - } - return false; - } - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // This differs from the try_dequeue(item) method in that this one does - // not attempt to reduce contention by interleaving the order that producer - // streams are dequeued from. So, using this method can reduce overall throughput - // under contention, but will give more predictable results in single-threaded - // consumer scenarios. This is mostly only useful for internal unit tests. - // Never allocates. Thread-safe. - template - bool try_dequeue_non_interleaved(U& item) - { - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->dequeue(item)) { - return true; - } - } - return false; - } - - // Attempts to dequeue from the queue using an explicit consumer token. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
- template - bool try_dequeue(consumer_token_t& token, U& item) - { - // The idea is roughly as follows: - // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less - // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place - // If there's no items where you're supposed to be, keep moving until you find a producer with some items - // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it - - if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return false; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (static_cast(token.currentProducer)->dequeue(item)) { - if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return true; - } - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - if (ptr->dequeue(item)) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = 1; - return true; - } - ptr = ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return false; - } - - // Attempts to dequeue several elements from the queue. - // Returns the number of items actually dequeued. 
- // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - size_t try_dequeue_bulk(It itemFirst, size_t max) - { - size_t count = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - count += ptr->dequeue_bulk(itemFirst, max - count); - if (count == max) { - break; - } - } - return count; - } - - // Attempts to dequeue several elements from the queue using an explicit consumer token. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) - { - if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return 0; - } - } - - size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); - if (count == max) { - if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return max; - } - token.itemsConsumedFromCurrent += static_cast(count); - max -= count; - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - auto dequeued = ptr->dequeue_bulk(itemFirst, max); - count += dequeued; - if (dequeued != 0) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = static_cast(dequeued); - } - if (dequeued == max) { - break; - } - max -= dequeued; - ptr = 
ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return count; - } - - - - // Attempts to dequeue from a specific producer's inner queue. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns false if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) - { - return static_cast(producer.producer)->dequeue(item); - } - - // Attempts to dequeue several elements from a specific producer's inner queue. - // Returns the number of items actually dequeued. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns 0 if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) - { - return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); - } - - - // Returns an estimate of the total number of elements currently in the queue. This - // estimate is only accurate if the queue has completely stabilized before it is called - // (i.e. all enqueue and dequeue operations have completed and their memory effects are - // visible on the calling thread, and no further operations start while this method is - // being called). - // Thread-safe. 
- size_t size_approx() const - { - size_t size = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - size += ptr->size_approx(); - } - return size; - } - - - // Returns true if the underlying atomic variables used by - // the queue are lock-free (they should be on most platforms). - // Thread-safe. - static constexpr bool is_lock_free() - { - return - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; - } - - -private: - friend struct ProducerToken; - friend struct ConsumerToken; - struct ExplicitProducer; - friend struct ExplicitProducer; - struct ImplicitProducer; - friend struct ImplicitProducer; - friend class ConcurrentQueueTests; - - enum AllocationMode { CanAlloc, CannotAlloc }; - - - /////////////////////////////// - // Queue methods - /////////////////////////////// - - template - inline bool inner_enqueue(producer_token_t const& token, U&& element) - { - return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); - } - - template - inline bool inner_enqueue(U&& element) - { - auto producer = get_or_add_implicit_producer(); - return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); - } - - template - inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); - } - - template - inline bool inner_enqueue_bulk(It itemFirst, size_t count) - { - auto producer = get_or_add_implicit_producer(); - return producer == nullptr ? 
false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); - } - - inline bool update_current_producer_after_rotation(consumer_token_t& token) - { - // Ah, there's been a rotation, figure out where we should be! - auto tail = producerListTail.load(std::memory_order_acquire); - if (token.desiredProducer == nullptr && tail == nullptr) { - return false; - } - auto prodCount = producerCount.load(std::memory_order_relaxed); - auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); - if ((details::unlikely)(token.desiredProducer == nullptr)) { - // Aha, first time we're dequeueing anything. - // Figure out our local position - // Note: offset is from start, not end, but we're traversing from end -- subtract from count first - std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); - token.desiredProducer = tail; - for (std::uint32_t i = 0; i != offset; ++i) { - token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); - if (token.desiredProducer == nullptr) { - token.desiredProducer = tail; - } - } - } - - std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; - if (delta >= prodCount) { - delta = delta % prodCount; - } - for (std::uint32_t i = 0; i != delta; ++i) { - token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); - if (token.desiredProducer == nullptr) { - token.desiredProducer = tail; - } - } - - token.lastKnownGlobalOffset = globalOffset; - token.currentProducer = token.desiredProducer; - token.itemsConsumedFromCurrent = 0; - return true; - } - - - /////////////////////////// - // Free list - /////////////////////////// - - template - struct FreeListNode - { - FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } - - std::atomic freeListRefs; - std::atomic freeListNext; - }; - - // A simple CAS-based lock-free free list. 
Not the fastest thing in the world under heavy contention, but - // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly - // speedy under low contention. - template // N must inherit FreeListNode or have the same fields (and initialization of them) - struct FreeList - { - FreeList() : freeListHead(nullptr) { } - FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } - void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } - - FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - - inline void add(N* node) - { -#ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to - // set it using a fetch_add - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { - // Oh look! We were the last ones referencing this node, and we know - // we want to add it to the free list, so let's do it! 
- add_knowing_refcount_is_zero(node); - } - } - - inline N* try_get() - { -#ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - auto head = freeListHead.load(std::memory_order_acquire); - while (head != nullptr) { - auto prevHead = head; - auto refs = head->freeListRefs.load(std::memory_order_relaxed); - if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire)) { - head = freeListHead.load(std::memory_order_acquire); - continue; - } - - // Good, reference count has been incremented (it wasn't at zero), which means we can read the - // next and not worry about it changing between now and the time we do the CAS - auto next = head->freeListNext.load(std::memory_order_relaxed); - if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { - // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no - // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). - assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); - - // Decrease refcount twice, once for our ref, and once for the list's ref - head->freeListRefs.fetch_sub(2, std::memory_order_release); - return head; - } - - // OK, the head must have changed on us, but we still need to decrease the refcount we increased. - // Note that we don't need to release any memory effects, but we do need to ensure that the reference - // count decrement happens-after the CAS on the head. - refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); - if (refs == SHOULD_BE_ON_FREELIST + 1) { - add_knowing_refcount_is_zero(prevHead); - } - } - - return nullptr; - } - - // Useful for traversing the list when there's no contention (e.g. 
to destroy remaining nodes) - N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } - - private: - inline void add_knowing_refcount_is_zero(N* node) - { - // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run - // only one copy of this method per node at a time, i.e. the single thread case), then we know - // we can safely change the next pointer of the node; however, once the refcount is back above - // zero, then other threads could increase it (happens under heavy contention, when the refcount - // goes to zero in between a load and a refcount increment of a node in try_get, then back up to - // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS - // to add the node to the actual list fails, decrease the refcount and leave the add operation to - // the next thread who puts the refcount back at zero (which could be us, hence the loop). - auto head = freeListHead.load(std::memory_order_relaxed); - while (true) { - node->freeListNext.store(head, std::memory_order_relaxed); - node->freeListRefs.store(1, std::memory_order_release); - if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { - // Hmm, the add failed, but we can only try again when the refcount goes back to zero - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_acq_rel) == 1) { - continue; - } - } - return; - } - } - - private: - // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) - std::atomic freeListHead; - - static const std::uint32_t REFS_MASK = 0x7FFFFFFF; - static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; - -#ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugMutex mutex; -#endif - }; - - - /////////////////////////// - // Block - /////////////////////////// - - enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; 
- - struct Block - { - Block() - : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), dynamicallyAllocated(true) - { -#ifdef MCDBGQ_TRACKMEM - owner = nullptr; -#endif - } - - template - inline bool is_empty() const - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Check flags - for (size_t i = 0; i < BLOCK_SIZE; ++i) { - if (!emptyFlags[i].load(std::memory_order_relaxed)) { - return false; - } - } - - // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - else { - // Check counter - if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); - return false; - } - } - - // Returns true if the block is now empty (does not apply in explicit context) - template - inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set flag - assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); - emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); - return false; - } - else { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); - assert(prevVal < BLOCK_SIZE); - return prevVal == BLOCK_SIZE - 1; - } - } - - // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). - // Returns true if the block is now empty (does not apply in explicit context). 
- template - inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set flags - std::atomic_thread_fence(std::memory_order_release); - i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; - for (size_t j = 0; j != count; ++j) { - assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); - emptyFlags[i + j].store(true, std::memory_order_relaxed); - } - return false; - } - else { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_acq_rel); - assert(prevVal + count <= BLOCK_SIZE); - return prevVal + count == BLOCK_SIZE; - } - } - - template - inline void set_all_empty() - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set all flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(true, std::memory_order_relaxed); - } - } - else { - // Reset counter - elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); - } - } - - template - inline void reset_empty() - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Reset flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(false, std::memory_order_relaxed); - } - } - else { - // Reset counter - elementsCompletelyDequeued.store(0, std::memory_order_relaxed); - } - } - - inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } - inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } - - private: - static_assert(std::alignment_of::value <= sizeof(T), "The queue does not support types with an alignment greater than their 
size at this time"); - MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; - public: - Block* next; - std::atomic elementsCompletelyDequeued; - std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? BLOCK_SIZE : 1]; - public: - std::atomic freeListRefs; - std::atomic freeListNext; - bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' - -#ifdef MCDBGQ_TRACKMEM - void* owner; -#endif - }; - static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + static_assert(!std::numeric_limits::is_signed && + std::is_integral::value, + "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && + std::is_integral::value, + "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), + "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), + "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && + !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & + (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), + "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a " + "power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && + !(EXPLICIT_INITIAL_INDEX_SIZE & + (EXPLICIT_INITIAL_INDEX_SIZE - 1)), + "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and " + "greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && + !(IMPLICIT_INITIAL_INDEX_SIZE & + (IMPLICIT_INITIAL_INDEX_SIZE - 1)), + "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and " + "greater than 1)"); + static_assert( + (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || + !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & + (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), + 
"Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || + INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, + "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least " + "1 (or 0 to disable implicit enqueueing)"); + public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be + // allocated up-front, which means only a single producer will be able to + // enqueue elements without an extra allocation -- blocks aren't shared + // between producers). This method is not thread safe -- it is up to the user + // to ensure that the queue is fully constructed before it starts being used + // by other threads (this includes making the memory effects of construction + // visible, possibly with a memory barrier). + explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 
0 : 1)); -#ifdef MCDBGQ_TRACKMEM -public: - struct MemStats; -private: -#endif - - /////////////////////////// - // Producer base - /////////////////////////// - - struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase - { - ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : - tailIndex(0), - headIndex(0), - dequeueOptimisticCount(0), - dequeueOvercommit(0), - tailBlock(nullptr), - isExplicit(isExplicit_), - parent(parent_) - { - } - - virtual ~ProducerBase() { } - - template - inline bool dequeue(U& element) - { - if (isExplicit) { - return static_cast(this)->dequeue(element); - } - else { - return static_cast(this)->dequeue(element); - } - } - - template - inline size_t dequeue_bulk(It& itemFirst, size_t max) - { - if (isExplicit) { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } - else { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } - } - - inline ProducerBase* next_prod() const { return static_cast(next); } - - inline size_t size_approx() const - { - auto tail = tailIndex.load(std::memory_order_relaxed); - auto head = headIndex.load(std::memory_order_relaxed); - return details::circular_less_than(head, tail) ? 
static_cast(tail - head) : 0; - } - - inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } - protected: - std::atomic tailIndex; // Where to enqueue to next - std::atomic headIndex; // Where to dequeue from next - - std::atomic dequeueOptimisticCount; - std::atomic dequeueOvercommit; - - Block* tailBlock; - - public: - bool isExplicit; - ConcurrentQueue* parent; - - protected: -#ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - /////////////////////////// - // Explicit queue - /////////////////////////// - - struct ExplicitProducer : public ProducerBase - { - explicit ExplicitProducer(ConcurrentQueue* parent_) : - ProducerBase(parent_, true), - blockIndex(nullptr), - pr_blockIndexSlotsUsed(0), - pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), - pr_blockIndexFront(0), - pr_blockIndexEntries(nullptr), - pr_blockIndexRaw(nullptr) - { - size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; - if (poolBasedIndexSize > pr_blockIndexSize) { - pr_blockIndexSize = poolBasedIndexSize; - } - - new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE - } - - ~ExplicitProducer() - { - // Destruct any elements not yet dequeued. - // Since we're in the destructor, we can assume all elements - // are either completely dequeued or completely not (no halfways). 
- if (this->tailBlock != nullptr) { // Note this means there must be a block index too - // First find the block that's partially dequeued, if any - Block* halfDequeuedBlock = nullptr; - if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { - // The head's not on a block boundary, meaning a block somewhere is partially dequeued - // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) - size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); - while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { - i = (i + 1) & (pr_blockIndexSize - 1); - } - assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); - halfDequeuedBlock = pr_blockIndexEntries[i].block; - } - - // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) - auto block = this->tailBlock; - do { - block = block->next; - if (block->ConcurrentQueue::Block::template is_empty()) { - continue; - } - - size_t i = 0; // Offset into block - if (block == halfDequeuedBlock) { - i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - } - - // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index - auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? 
BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { - (*block)[i++]->~T(); - } - } while (block != this->tailBlock); - } - - // Destroy all blocks that we own - if (this->tailBlock != nullptr) { - auto block = this->tailBlock; - do { - auto nextBlock = block->next; - this->parent->add_block_to_free_list(block); - block = nextBlock; - } while (block != this->tailBlock); - } - - // Destroy the block indices - auto header = static_cast(pr_blockIndexRaw); - while (header != nullptr) { - auto prev = static_cast(header->prev); - header->~BlockIndexHeader(); - (Traits::free)(header); - header = prev; - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto startBlock = this->tailBlock; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - // We can re-use the block ahead of us, it's empty! - this->tailBlock = this->tailBlock->next; - this->tailBlock->ConcurrentQueue::Block::template reset_empty(); - - // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the - // last block from it first -- except instead of removing then adding, we can just overwrite). - // Note that there must be a valid block index here, since even if allocation failed in the ctor, - // it would have been re-attempted when adding the first block to the queue; since there is such - // a block, a block index must have been successfully allocated. 
- } - else { - // Whatever head value we see here is >= the last value we saw here (relatively), - // and <= its current value. Since we have the most recent tail, the head must be - // <= to it. - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) - || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - // We can't enqueue in another block because there's not enough leeway -- the - // tail could surpass the head by the time the block fills up! (Or we'll exceed - // the size limit, if the second part of the condition was true.) - return false; - } - // We're going to need a new block; check that the block index has room - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { - // Hmm, the circular block index is already full -- we'll need - // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if - // the initial allocation failed in the constructor. - - MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { - return false; - } - else if (!new_block_index(pr_blockIndexSlotsUsed)) { - return false; - } - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - return false; - } -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } - else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - ++pr_blockIndexSlotsUsed; - } - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - // The constructor may throw. 
We want the element not to appear in the queue in - // that case (without corrupting the queue): - MOODYCAMEL_TRY { - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - } - MOODYCAMEL_CATCH (...) { - // Revert change to the current block, but leave the new block available - // for next time - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock; - MOODYCAMEL_RETHROW; - } - } - else { - (void)startBlock; - (void)originalBlockIndexSlotsUsed; - } - - // Add block to block index - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - // Might be something to dequeue, let's give it a try - - // Note that this if is purely for performance purposes in the common case when the queue is - // empty and the values are eventually consistent -- we may enter here spuriously. 
- - // Note that whatever the values of overcommit and tail are, they are not going to change (unless we - // change them) and must be the same value at this point (inside the if) as when the if condition was - // evaluated. - - // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. - // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in - // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). - // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all - // read-modify-write operations are guaranteed to work on the latest value in the modification order), but - // unfortunately that can't be shown to be correct using only the C++11 standard. - // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case - std::atomic_thread_fence(std::memory_order_acquire); - - // Increment optimistic counter, then check if it went over the boundary - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - - // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever - // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now - // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon - // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. - // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) - // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. 
- - // Note that we reload tail here in case it changed; it will be the same value as before or greater, since - // this load is sequenced after (happens after) the earlier load above. This is supported by read-read - // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - // Guaranteed to be at least one element to dequeue! - - // Get the index. Note that since there's guaranteed to be at least one element, this - // will never exceed tail. We need to do an acquire-release fence here since it's possible - // that whatever condition got us to this point was for an earlier enqueued element (that - // we already see the memory effects for), but that by the time we increment somebody else - // has incremented it, and we need to see the memory effects for *that* element, which is - // in such a case is necessarily visible on the thread that incremented it in the first - // place with the more current condition (they must have acquired a tail that is at least - // as recent). - auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - - // Determine which block the element is in - - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - // We need to be careful here about subtracting and dividing because of index wrap-around. 
- // When an index wraps, we need to preserve the sign of the offset when dividing it by the - // block size (in order to get a correct signed block count offset in all cases): - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); - auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; - - // Dequeue - auto& el = *((*block)[index]); - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { - // Make sure the element is still fully dequeued and destroyed even if the assignment - // throws - struct Guard { - Block* block; - index_t index; - - ~Guard() - { - (*block)[index]->~T(); - block->ConcurrentQueue::Block::template set_empty(index); - } - } guard = { block, index }; - - element = std::move(el); // NOLINT - } - else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - block->ConcurrentQueue::Block::template set_empty(index); - } - - return true; - } - else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write - } - } - - return false; - } - - template - bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). 
- index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - auto originalBlockIndexFront = pr_blockIndexFront; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - - Block* firstAllocatedBlock = nullptr; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { - // Allocate as many blocks as possible from ahead - while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - this->tailBlock = this->tailBlock->next; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Now allocate as many blocks as necessary from the block pool - while (blockBaseDiff > 0) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { - MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; - return false; - } - - // pr_blockIndexFront is updated inside new_block_index, so we need to - // update our fallback value too (since we keep the new index even if we - // later fail) - originalBlockIndexFront = originalBlockIndexSlotsUsed; - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template set_all_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } - else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; - - ++pr_blockIndexSlotsUsed; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Excellent, all allocations succeeded. 
Reset each block's emptiness before we fill them up, and - // publish the new block index front - auto block = firstAllocatedBlock; - while (true) { - block->ConcurrentQueue::Block::template reset_empty(); - if (block == this->tailBlock) { - break; - } - block = block->next; - } - - MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - auto endBlock = this->tailBlock; - this->tailBlock = startBlock; - assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else { - MOODYCAMEL_TRY { - while (currentTailIndex != stopIndex) { - // Must use copy constructor even if move constructor is available - // because we may have to revert if there's an exception. 
- // Sorry about the horrible templated next line, but it was the only way - // to disable moving *at compile time*, which is important because a type - // may only define a (noexcept) move constructor, and so calls to the - // cctor will not compile, even if they are in an if branch that will never - // be executed - new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH (...) { - // Oh dear, an exception's been thrown -- destroy the elements that - // were enqueued so far and revert the entire bulk operation (we'll keep - // any allocated blocks in our linked list for later, though). - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - if (firstAllocatedBlock != nullptr) - 
blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. 
- auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Determine which block the first element is in - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); - auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); - - // Iterate the blocks and dequeue - auto index = firstIndex; - do { - auto firstIndexInBlock = index; - index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; - auto block = localBlockIndex->entries[indexIndex].block; - if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } - else { - MOODYCAMEL_TRY { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH (...) 
{ - // It's too late to revert the dequeue, but we can make sure that all - // the dequeued objects are properly destroyed and the block index - // (and empty count) are properly updated before we propagate the exception - do { - block = localBlockIndex->entries[indexIndex].block; - while (index != endIndex) { - (*block)[index++]->~T(); - } - block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - - firstIndexInBlock = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } - else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - struct BlockIndexEntry - { - index_t base; - Block* block; - }; - - struct BlockIndexHeader - { - size_t size; - std::atomic front; // Current slot (not next, like pr_blockIndexFront) - BlockIndexEntry* entries; - void* prev; - }; - - - bool new_block_index(size_t numberOfFilledSlotsToExpose) - { - auto prevBlockSizeMask = pr_blockIndexSize - 1; - - // Create the new block - pr_blockIndexSize <<= 1; - auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); - if (newRawPtr == nullptr) { - pr_blockIndexSize >>= 1; // Reset to allow graceful retry - return 
false; - } - - auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); - - // Copy in all the old indices, if any - size_t j = 0; - if (pr_blockIndexSlotsUsed != 0) { - auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; - do { - newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; - i = (i + 1) & prevBlockSizeMask; - } while (i != pr_blockIndexFront); - } - - // Update everything - auto header = new (newRawPtr) BlockIndexHeader; - header->size = pr_blockIndexSize; - header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); - header->entries = newBlockIndexEntries; - header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later - - pr_blockIndexFront = j; - pr_blockIndexEntries = newBlockIndexEntries; - pr_blockIndexRaw = newRawPtr; - blockIndex.store(header, std::memory_order_release); - - return true; - } - - private: - std::atomic blockIndex; - - // To be used by producer only -- consumer must use the ones in referenced by blockIndex - size_t pr_blockIndexSlotsUsed; - size_t pr_blockIndexSize; - size_t pr_blockIndexFront; // Next slot (not current) - BlockIndexEntry* pr_blockIndexEntries; - void* pr_blockIndexRaw; - #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ExplicitProducer* nextExplicitProducer; - private: -#endif - -#ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - ////////////////////////////////// - // Implicit queue - ////////////////////////////////// - - struct ImplicitProducer : public ProducerBase - { - ImplicitProducer(ConcurrentQueue* parent_) : - ProducerBase(parent_, false), - nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), - blockIndex(nullptr) - { - new_block_index(); - } - - ~ImplicitProducer() - { - // Note that since we're in the destructor we can assume that all enqueue/dequeue operations - // completed already; this means that all undequeued elements are placed contiguously across - 
// contiguous blocks, and that only the first and last remaining blocks can be only partially - // empty (all other remaining blocks must be completely full). - -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - // Unregister ourselves for thread termination notification - if (!this->inactive.load(std::memory_order_relaxed)) { - details::ThreadExitNotifier::unsubscribe(&threadExitListener); - } -#endif - - // Destroy all remaining elements! - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto index = this->headIndex.load(std::memory_order_relaxed); - Block* block = nullptr; - assert(index == tail || details::circular_less_than(index, tail)); - bool forceFreeLastBlock = index != tail; // If we enter the loop, then the last (tail) block will not be freed - while (index != tail) { - if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { - if (block != nullptr) { - // Free the old block - this->parent->add_block_to_free_list(block); - } - - block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); - } - - ((*block)[index])->~T(); - ++index; - } - // Even if the queue is empty, there's still one block that's not on the free list - // (unless the head index reached the end of it, in which case the tail will be poised - // to create a new block). 
- if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { - this->parent->add_block_to_free_list(this->tailBlock); - } - - // Destroy block index - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - if (localBlockIndex != nullptr) { - for (size_t i = 0; i != localBlockIndex->capacity; ++i) { - localBlockIndex->index[i]->~BlockIndexEntry(); - } - do { - auto prev = localBlockIndex->prev; - localBlockIndex->~BlockIndexHeader(); - (Traits::free)(localBlockIndex); - localBlockIndex = prev; - } while (localBlockIndex != nullptr); - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - return false; - } -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry; - if (!insert_block_index_entry(idxEntry, currentTailIndex)) { - return false; - } - - // Get ahold of a new block - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - return false; - } -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) 
T(std::forward(element)))) { - // May throw, try to insert now before we publish the fact that we have this new block - MOODYCAMEL_TRY { - new ((*newBlock)[currentTailIndex]) T(std::forward(element)); - } - MOODYCAMEL_CATCH (...) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(newBlock); - MOODYCAMEL_RETHROW; - } - } - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - this->tailBlock = newBlock; - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - // See ExplicitProducer::dequeue for rationale and explanation - index_t tail = this->tailIndex.load(std::memory_order_relaxed); - index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - std::atomic_thread_fence(std::memory_order_acquire); - - index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - // Determine which block the element is in - auto entry = get_block_index_entry_for_index(index); - - // Dequeue - auto block = entry->value.load(std::memory_order_relaxed); - auto& el = *((*block)[index]); - - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { -#ifdef 
MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - // Note: Acquiring the mutex with every dequeue instead of only when a block - // is released is very sub-optimal, but it is, after all, purely debug code. - debug::DebugLock lock(producer->mutex); -#endif - struct Guard { - Block* block; - index_t index; - BlockIndexEntry* entry; - ConcurrentQueue* parent; - - ~Guard() - { - (*block)[index]->~T(); - if (block->ConcurrentQueue::Block::template set_empty(index)) { - entry->value.store(nullptr, std::memory_order_relaxed); - parent->add_block_to_free_list(block); - } - } - } guard = { block, index, entry, this->parent }; - - element = std::move(el); // NOLINT - } - else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - - if (block->ConcurrentQueue::Block::template set_empty(index)) { - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Add the block back into the global free pool (and remove from block index) - entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - } - - return true; - } - else { - this->dequeueOvercommit.fetch_add(1, std::memory_order_release); - } - } - - return false; - } - -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable: 4706) // assignment within conditional expression -#endif - template - bool enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). - - // Note that the tailBlock we start off with may not be owned by us any more; - // this happens if it was filled up exactly to the top (setting tailIndex to - // the first index of the next block which is not yet allocated), then dequeued - // completely (putting it on the free list) before we enqueue again. 
- - index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - Block* firstAllocatedBlock = nullptr; - auto endBlock = this->tailBlock; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - do { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell - Block* newBlock; - bool indexInserted = false; - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - - if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { - // Index allocation or block allocation failed; revert any other allocations - // and index insertions done so far for this operation - if (indexInserted) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - } - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - idxEntry = get_block_index_entry_for_index(currentTailIndex); - 
idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - - return false; - } - -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - newBlock->next = nullptr; - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - // Store the chain of blocks so that we can undo if later allocations fail, - // and so that we can find the blocks when we do the actual enqueueing - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { - assert(this->tailBlock != nullptr); - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - endBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; - } while (blockBaseDiff > 0); - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - this->tailBlock = startBlock; - assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else { - MOODYCAMEL_TRY { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) 
T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH (...) { - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - auto idxEntry = get_block_index_entry_for_index(currentTailIndex); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } -#ifdef _MSC_VER -#pragma warning(pop) + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). 
+ explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); #endif - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Iterate the blocks and dequeue - auto index = firstIndex; - BlockIndexHeader* localBlockIndex; - auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); - do { - auto blockStartIndex = index; - index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; - - auto entry = localBlockIndex->index[indexIndex]; - auto block = entry->value.load(std::memory_order_relaxed); - if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } - else { - MOODYCAMEL_TRY { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH (...) { - do { - entry = localBlockIndex->index[indexIndex]; - block = entry->value.load(std::memory_order_relaxed); - while (index != endIndex) { - (*block)[index++]->~T(); - } - - if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - entry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(block); - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - - blockStartIndex = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Note that the set_many_empty above did a release, meaning that anybody who acquires the block - // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
- entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } - else { - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - // The block size must be > 1, so any number with the low bit set is an invalid block base index - static const index_t INVALID_BLOCK_BASE = 1; - - struct BlockIndexEntry - { - std::atomic key; - std::atomic value; - }; - - struct BlockIndexHeader - { - size_t capacity; - std::atomic tail; - BlockIndexEntry* entries; - BlockIndexEntry** index; - BlockIndexHeader* prev; - }; - - template - inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) - { - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK - if (localBlockIndex == nullptr) { - return false; // this can happen if new_block_index failed in the constructor - } - size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || - idxEntry->value.load(std::memory_order_relaxed) == nullptr) { - - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - - // No room in the old block index, try to allocate another one! 
- MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { - return false; - } - else if (!new_block_index()) { - return false; - } - else { - localBlockIndex = blockIndex.load(std::memory_order_relaxed); - newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - } - - inline void rewind_block_index_tail() - { - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); - } - - inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const - { - BlockIndexHeader* localBlockIndex; - auto idx = get_block_index_index_for_index(index, localBlockIndex); - return localBlockIndex->index[idx]; - } - - inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - index &= ~static_cast(BLOCK_SIZE - 1); - localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto tail = localBlockIndex->tail.load(std::memory_order_acquire); - auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); - assert(tailBase != INVALID_BLOCK_BASE); - // Note: Must use division instead of shift because the index may wrap around, causing a negative - // offset, whose negativity we want to preserve - auto offset = static_cast(static_cast::type>(index - tailBase) / static_cast::type>(BLOCK_SIZE)); - size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); - assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && 
localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); - return idx; - } - - bool new_block_index() - { - auto prev = blockIndex.load(std::memory_order_relaxed); - size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; - auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; - auto raw = static_cast((Traits::malloc)( - sizeof(BlockIndexHeader) + - std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * entryCount + - std::alignment_of::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); - if (raw == nullptr) { - return false; - } - - auto header = new (raw) BlockIndexHeader; - auto entries = reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); - auto index = reinterpret_cast(details::align_for(reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); - if (prev != nullptr) { - auto prevTail = prev->tail.load(std::memory_order_relaxed); - auto prevPos = prevTail; - size_t i = 0; - do { - prevPos = (prevPos + 1) & (prev->capacity - 1); - index[i++] = prev->index[prevPos]; - } while (prevPos != prevTail); - assert(i == prevCapacity); - } - for (size_t i = 0; i != entryCount; ++i) { - new (entries + i) BlockIndexEntry; - entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); - index[prevCapacity + i] = entries + i; - } - header->prev = prev; - header->entries = entries; - header->index = index; - header->capacity = nextBlockIndexCapacity; - header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); - - blockIndex.store(header, std::memory_order_release); - - nextBlockIndexCapacity <<= 1; - - return true; - } - - private: - size_t nextBlockIndexCapacity; - std::atomic blockIndex; + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. 
+ ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, + size_t maxImplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * + (maxExplicitProducers + 1) + + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - public: - details::ThreadExitListener threadExitListener; - private: -#endif - #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ImplicitProducer* nextImplicitProducer; - private: + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); #endif + } -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - mutable debug::DebugMutex mutex; -#endif -#ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - ////////////////////////////////// - // Block pool manipulation - ////////////////////////////////// - - void populate_initial_block_list(size_t blockCount) - { - initialBlockPoolSize = blockCount; - if (initialBlockPoolSize == 0) { - initialBlockPool = nullptr; - return; - } - - initialBlockPool = create_array(blockCount); - if (initialBlockPool == nullptr) { - initialBlockPoolSize = 0; - } - for (size_t i = 0; i < initialBlockPoolSize; ++i) { - initialBlockPool[i].dynamicallyAllocated = false; - } - } - - inline Block* try_get_block_from_initial_pool() - { - if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { - return nullptr; - } - - auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); - - return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; - } - - inline void add_block_to_free_list(Block* block) - { -#ifdef MCDBGQ_TRACKMEM - block->owner = nullptr; -#endif - if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { - destroy(block); - } - else { - freeList.add(block); - } - } - - inline void add_blocks_to_free_list(Block* block) - { - while (block != nullptr) { - auto next = block->next; - add_block_to_free_list(block); - block = next; - } - } - - inline Block* try_get_block_from_free_list() - { - return freeList.try_get(); - } - - // Gets a free block from one of the memory pools, or allocates a new one (if applicable) - template - Block* requisition_block() - { - auto block = try_get_block_from_initial_pool(); - if (block != nullptr) { - return block; - } - - block = try_get_block_from_free_list(); - if (block != nullptr) { - return block; - } - - MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) { - return create(); - } - else { - return nullptr; - } - } - + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. 
+ ~ConcurrentQueue() { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was + // not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const &) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue &operator=(ConcurrentQueue const &) + MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
+ ConcurrentQueue(ConcurrentQueue &&other) MOODYCAMEL_NOEXCEPT + : producerListTail( + other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex( + other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId( + other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load( + std::memory_order_relaxed)) { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); -#ifdef MCDBGQ_TRACKMEM - public: - struct MemStats { - size_t allocatedBlocks; - size_t usedBlocks; - size_t freeBlocks; - size_t ownedBlocksExplicit; - size_t ownedBlocksImplicit; - size_t implicitProducers; - size_t explicitProducers; - size_t elementsEnqueued; - size_t blockClassBytes; - size_t queueClassBytes; - size_t implicitBlockIndexBytes; - size_t explicitBlockIndexBytes; - - friend class ConcurrentQueue; - - private: - static MemStats getFor(ConcurrentQueue* q) - { - MemStats stats = { 0 }; - - stats.elementsEnqueued = q->size_approx(); - - auto block = q->freeList.head_unsafe(); - while (block != nullptr) { - ++stats.allocatedBlocks; - ++stats.freeBlocks; - block = block->freeListNext.load(std::memory_order_relaxed); - } - - for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - 
bool implicit = dynamic_cast(ptr) != nullptr; - stats.implicitProducers += implicit ? 1 : 0; - stats.explicitProducers += implicit ? 0 : 1; - - if (implicit) { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ImplicitProducer); - auto head = prod->headIndex.load(std::memory_order_relaxed); - auto tail = prod->tailIndex.load(std::memory_order_relaxed); - auto hash = prod->blockIndex.load(std::memory_order_relaxed); - if (hash != nullptr) { - for (size_t i = 0; i != hash->capacity; ++i) { - if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { - ++stats.allocatedBlocks; - ++stats.ownedBlocksImplicit; - } - } - stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); - for (; hash != nullptr; hash = hash->prev) { - stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); - } - } - for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { - //auto block = prod->get_block_index_entry_for_index(head); - ++stats.usedBlocks; - } - } - else { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ExplicitProducer); - auto tailBlock = prod->tailBlock; - bool wasNonEmpty = false; - if (tailBlock != nullptr) { - auto block = tailBlock; - do { - ++stats.allocatedBlocks; - if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { - ++stats.usedBlocks; - wasNonEmpty = wasNonEmpty || block != tailBlock; - } - ++stats.ownedBlocksExplicit; - block = block->next; - } while (block != tailBlock); - } - auto index = prod->blockIndex.load(std::memory_order_relaxed); - while (index != nullptr) { - stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); - index = 
static_cast(index->prev); - } - } - } - - auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); - stats.allocatedBlocks += freeOnInitialPool; - stats.freeBlocks += freeOnInitialPool; - - stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; - stats.queueClassBytes += sizeof(ConcurrentQueue); - - return stats; - } - }; - - // For debugging only. Not thread-safe. - MemStats getMemStats() - { - return MemStats::getFor(this); - } - private: - friend struct MemStats; -#endif - - - ////////////////////////////////// - // Producer list manipulation - ////////////////////////////////// - - ProducerBase* recycle_or_create_producer(bool isExplicit) - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - // Try to re-use one first - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { - bool expected = true; - if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { - // We caught one! It's been marked as activated, the caller can have it - return ptr; - } - } - } - - return add_producer(isExplicit ? 
static_cast(create(this)) : create(this)); - } - - ProducerBase* add_producer(ProducerBase* producer) - { - // Handle failed memory allocation - if (producer == nullptr) { - return nullptr; - } - - producerCount.fetch_add(1, std::memory_order_relaxed); - - // Add it to the lock-free list - auto prevTail = producerListTail.load(std::memory_order_relaxed); - do { - producer->next = prevTail; - } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); - #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - if (producer->isExplicit) { - auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextExplicitProducer = prevTailExplicit; - } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); - } - else { - auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextImplicitProducer = prevTailImplicit; - } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); - } -#endif - - return producer; - } - - void reown_producers() - { - // After another instance is moved-into/swapped-with this one, all the - // producers we stole still think their parents are the other queue. - // So fix them up! 
- for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { - ptr->parent = this; - } - } - - - ////////////////////////////////// - // Implicit producer hash - ////////////////////////////////// - - struct ImplicitProducerKVP - { - std::atomic key; - ImplicitProducer* value; // No need for atomicity since it's only read by the thread that sets it in the first place - - ImplicitProducerKVP() : value(nullptr) { } - - ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); - value = other.value; - } - - inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT - { - if (this != &other) { - details::swap_relaxed(key, other.key); - std::swap(value, other.value); - } - } - }; - - template - friend void moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP&, typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; - - struct ImplicitProducerHash - { - size_t capacity; - ImplicitProducerKVP* entries; - ImplicitProducerHash* prev; - }; - - inline void populate_initial_implicit_producer_hash() - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { - return; - } - else { - implicitProducerHashCount.store(0, std::memory_order_relaxed); - auto hash = &initialImplicitProducerHash; - hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; - hash->entries = &initialImplicitProducerHashEntries[0]; - for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { - initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - hash->prev = nullptr; - implicitProducerHash.store(hash, std::memory_order_relaxed); - } - } - - void swap_implicit_producer_hashes(ConcurrentQueue& other) - { - MOODYCAMEL_CONSTEXPR_IF 
(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { - return; - } - else { - // Swap (assumes our implicit producer hash is initialized) - initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); - initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; - other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; - - details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); - - details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); - if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { - implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); - } - else { - ImplicitProducerHash* hash; - for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { - continue; - } - hash->prev = &initialImplicitProducerHash; - } - if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { - other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); - } - else { - ImplicitProducerHash* hash; - for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { - continue; - } - hash->prev = &other.initialImplicitProducerHash; - } - } - } - - // Only fails (returns nullptr) if memory allocation fails - ImplicitProducer* get_or_add_implicit_producer() - { - // Note that since the data is essentially thread-local (key is thread ID), - // there's a reduced need for fences (memory ordering is already consistent - // for any individual thread), except for the current table itself. - - // Start by looking for the thread ID in the current and all previous hash tables. 
- // If it's not found, it must not be in there yet, since this same thread would - // have added it previously to one of the tables that we traversed. - - // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table - -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - - auto mainHash = implicitProducerHash.load(std::memory_order_acquire); - assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) - for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { - // Look for the id in this hash - auto index = hashedId; - while (true) { // Not an infinite loop because at least one slot is free in the hash table - index &= hash->capacity - 1u; - - auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); - if (probedKey == id) { - // Found it! If we had to search several hashes deep, though, we should lazily add it - // to the current main hash table to avoid the extended search next time. - // Note there's guaranteed to be room in the current hash table since every subsequent - // table implicitly reserves space for all previous tables (there's only one - // implicitProducerHashCount). 
- auto value = hash->entries[index].value; - if (hash != mainHash) { - index = hashedId; - while (true) { - index &= mainHash->capacity - 1u; - auto empty = details::invalid_thread_id; -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed) || - mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { -#else - if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { -#endif - mainHash->entries[index].value = value; - break; - } - ++index; - } - } - - return value; - } - if (probedKey == details::invalid_thread_id) { - break; // Not in this hash table - } - ++index; - } - } - - // Insert! - auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); - while (true) { - // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) - if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { - // We've acquired the resize lock, try to allocate a bigger hash table. - // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when - // we reload implicitProducerHash it must be the most recent version (it only gets changed within this - // locked block). 
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - if (newCount >= (mainHash->capacity >> 1)) { - size_t newCapacity = mainHash->capacity << 1; - while (newCount >= (newCapacity >> 1)) { - newCapacity <<= 1; - } - auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); - if (raw == nullptr) { - // Allocation failed - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - return nullptr; - } - - auto newHash = new (raw) ImplicitProducerHash; - newHash->capacity = static_cast(newCapacity); - newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); - for (size_t i = 0; i != newCapacity; ++i) { - new (newHash->entries + i) ImplicitProducerKVP; - newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - newHash->prev = mainHash; - implicitProducerHash.store(newHash, std::memory_order_release); - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - mainHash = newHash; - } - else { - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - } - } - - // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table - // to finish being allocated by another thread (and if we just finished allocating above, the condition will - // always be true) - if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { - auto producer = static_cast(recycle_or_create_producer(false)); - if (producer == nullptr) { - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - return nullptr; - } - -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; - producer->threadExitListener.userData = producer; - 
details::ThreadExitNotifier::subscribe(&producer->threadExitListener); -#endif - - auto index = hashedId; - while (true) { - index &= mainHash->capacity - 1u; - auto empty = details::invalid_thread_id; -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if (mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); // already counted as a used slot - mainHash->entries[index].value = producer; - break; - } -#endif - if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { - mainHash->entries[index].value = producer; - break; - } - ++index; - } - return producer; - } - - // Hmm, the old hash is quite full and somebody else is busy allocating a new one. - // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, - // we try to allocate ourselves). 
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - } - } - -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - void implicit_producer_thread_exited(ImplicitProducer* producer) - { - // Remove from hash -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - auto hash = implicitProducerHash.load(std::memory_order_acquire); - assert(hash != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - details::thread_id_t probedKey; - - // We need to traverse all the hashes just in case other threads aren't on the current one yet and are - // trying to add an entry thinking there's a free slot (because they reused a producer) - for (; hash != nullptr; hash = hash->prev) { - auto index = hashedId; - do { - index &= hash->capacity - 1u; - probedKey = id; - if (hash->entries[index].key.compare_exchange_strong(probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) { - break; - } - ++index; - } while (probedKey != details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place - } - - // Mark the queue as being recyclable - producer->inactive.store(true, std::memory_order_release); - } - - static void implicit_producer_thread_exited_callback(void* userData) - { - auto producer = static_cast(userData); - auto queue = producer->parent; - queue->implicit_producer_thread_exited(producer); - } -#endif - - ////////////////////////////////// - // Utility functions - ////////////////////////////////// - - template - static inline void* aligned_malloc(size_t size) - { - MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) - return (Traits::malloc)(size); - else { - size_t alignment = std::alignment_of::value; - void* raw = (Traits::malloc)(size + 
alignment - 1 + sizeof(void*)); - if (!raw) - return nullptr; - char* ptr = details::align_for(reinterpret_cast(raw) + sizeof(void*)); - *(reinterpret_cast(ptr) - 1) = raw; - return ptr; - } - } - - template - static inline void aligned_free(void* ptr) - { - MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) - return (Traits::free)(ptr); - else - (Traits::free)(ptr ? *(reinterpret_cast(ptr) - 1) : nullptr); - } - - template - static inline U* create_array(size_t count) - { - assert(count > 0); - U* p = static_cast(aligned_malloc(sizeof(U) * count)); - if (p == nullptr) - return nullptr; - - for (size_t i = 0; i != count; ++i) - new (p + i) U(); - return p; - } - - template - static inline void destroy_array(U* p, size_t count) - { - if (p != nullptr) { - assert(count > 0); - for (size_t i = count; i != 0; ) - (p + --i)->~U(); - } - aligned_free(p); - } - - template - static inline U* create() - { - void* p = aligned_malloc(sizeof(U)); - return p != nullptr ? new (p) U : nullptr; - } - - template - static inline U* create(A1&& a1) - { - void* p = aligned_malloc(sizeof(U)); - return p != nullptr ? 
new (p) U(std::forward(a1)) : nullptr; - } - - template - static inline void destroy(U* p) - { - if (p != nullptr) - p->~U(); - aligned_free(p); - } - -private: - std::atomic producerListTail; - std::atomic producerCount; - - std::atomic initialBlockPoolIndex; - Block* initialBlockPool; - size_t initialBlockPoolSize; - -#ifndef MCDBGQ_USEDEBUGFREELIST - FreeList freeList; -#else - debug::DebugFreeList freeList; -#endif - - std::atomic implicitProducerHash; - std::atomic implicitProducerHashCount; // Number of slots logically used - ImplicitProducerHash initialImplicitProducerHash; - std::array initialImplicitProducerHashEntries; - std::atomic_flag implicitProducerHashResizeInProgress; - - std::atomic nextExplicitConsumerId; - std::atomic globalExplicitConsumerOffset; - -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugMutex implicitProdMutex; + explicitProducers.store( + other.explicitProducers.load(std::memory_order_relaxed), + std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store( + other.implicitProducers.load(std::memory_order_relaxed), + std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); #endif - + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue &operator=(ConcurrentQueue &&other) + MOODYCAMEL_NOEXCEPT { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). 
+ inline void swap(ConcurrentQueue &other) MOODYCAMEL_NOEXCEPT { + swap_internal(other); + } + + private: + ConcurrentQueue &swap_internal(ConcurrentQueue &other) { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, + other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - std::atomic explicitProducers; - std::atomic implicitProducers; + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); #endif -}; + return *this; + } -template -ProducerToken::ProducerToken(ConcurrentQueue& queue) - : producer(queue.recycle_or_create_producer(true)) -{ - if (producer != nullptr) { - producer->token = this; - } -} + public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. 
+ inline bool enqueue(T const &item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(item); + } -template -ProducerToken::ProducerToken(BlockingConcurrentQueue& queue) - : producer(reinterpret_cast*>(&queue)->recycle_or_create_producer(true)) -{ - if (producer != nullptr) { - producer->token = this; - } -} + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T &&item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(std::move(item)); + } -template -ConsumerToken::ConsumerToken(ConcurrentQueue& queue) - : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) -{ - initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); - lastKnownGlobalOffset = static_cast(-1); -} + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const &token, T const &item) { + return inner_enqueue(token, item); + } -template -ConsumerToken::ConsumerToken(BlockingConcurrentQueue& queue) - : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) -{ - initialOffset = reinterpret_cast*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); - lastKnownGlobalOffset = static_cast(-1); -} + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. Allocates memory if required. 
Only fails if memory + // allocation fails (or Traits::MAX_SUBQUEUE_SIZE has been defined and would + // be surpassed). Thread-safe. + inline bool enqueue(producer_token_t const &token, T &&item) { + return inner_enqueue(token, std::move(item)); + } -template -inline void swap(ConcurrentQueue& a, ConcurrentQueue& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); -} + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). Note: + // Use std::make_move_iterator if the elements should be moved instead of + // copied. Thread-safe. + template + bool enqueue_bulk(It itemFirst, size_t count) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue_bulk(itemFirst, count); + } -inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); -} + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const &token, It itemFirst, size_t count) { + return inner_enqueue_bulk(token, itemFirst, count); + } -inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); -} + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. 
+ inline bool try_enqueue(T const &item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(item); + } -template -inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); -} + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T &&item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(std::move(item)); + } -} + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const &token, T const &item) { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. Does not allocate memory. Fails if not enough room to + // enqueue. Thread-safe. + inline bool try_enqueue(producer_token_t const &token, T &&item) { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. 
+ // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const &token, It itemFirst, + size_t count) { + return inner_enqueue_bulk(token, itemFirst, count); + } + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U &item) { + // Instead of simply trying each producer in turn (which could cause + // needless contention on the first producer), we score them heuristically. + size_t nonEmptyCount = 0; + ProducerBase *best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the + // time we try to dequeue from it, we need to make sure every queue's been + // tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // This differs from the try_dequeue(item) method in that this one does + // not attempt to reduce contention by interleaving the order that producer + // streams are dequeued from. 
So, using this method can reduce overall + // throughput under contention, but will give more predictable results in + // single-threaded consumer scenarios. This is mostly only useful for internal + // unit tests. Never allocates. Thread-safe. + template + bool try_dequeue_non_interleaved(U &item) { + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->dequeue(item)) { + return true; + } + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(consumer_token_t &token, U &item) { + // The idea is roughly as follows: + // Every 256 items from one producer, make everyone rotate (increase the + // global offset) -> this means the highest efficiency consumer dictates the + // rotation speed of everyone else, more or less If you see that the global + // offset has changed, you must reset your consumption counter and move to + // your designated place If there's no items where you're supposed to be, + // keep moving until you find a producer with some items If the global + // offset has not changed but you've run out of items to consume, move over + // from your current position until you find an producer with something in + // it + + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != + globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return false; + } + } + + // If there was at least one non-empty queue but it appears empty at the + // time we try to dequeue from it, we need to make sure every queue's been + // tried + if (static_cast(token.currentProducer)->dequeue(item)) { + if (++token.itemsConsumedFromCurrent == + 
EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return true; + } + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + if (ptr->dequeue(item)) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 1; + return true; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(It itemFirst, size_t max) { + size_t count = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + count += ptr->dequeue_bulk(itemFirst, max - count); + if (count == max) { + break; + } + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit + // consumer token. Returns the number of items actually dequeued. Returns 0 if + // all producer streams appeared empty at the time they were checked (so, the + // queue is likely but not guaranteed to be empty). Never allocates. + // Thread-safe. 
+ template + size_t try_dequeue_bulk(consumer_token_t &token, It itemFirst, size_t max) { + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != + globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer) + ->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const &producer, + U &item) { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner + // queue. Returns the number of items actually dequeued. 
If you happen to know + // which producer you want to dequeue from, this is significantly faster than + // using the general-case try_dequeue methods. Returns 0 if the producer's + // queue appeared empty at the time it was checked (so, the queue is likely + // but not guaranteed to be empty). Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const &producer, + It itemFirst, size_t max) { + return static_cast(producer.producer) + ->dequeue_bulk(itemFirst, max); + } + + + // Returns an estimate of the total number of elements currently in the queue. + // This estimate is only accurate if the queue has completely stabilized + // before it is called (i.e. all enqueue and dequeue operations have completed + // and their memory effects are visible on the calling thread, and no further + // operations start while this method is being called). Thread-safe. + size_t size_approx() const { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
+ static constexpr bool is_lock_free() { + return details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; + } + + + private: + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const &token, U &&element) { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue( + std::forward(element)); + } + + template + inline bool inner_enqueue(U &&element) { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr + ? false + : producer->ConcurrentQueue::ImplicitProducer::template enqueue< + canAlloc>(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const &token, It itemFirst, + size_t count) { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk( + itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr + ? false + : producer->ConcurrentQueue::ImplicitProducer:: + template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t &token) { + // Ah, there's been a rotation, figure out where we should be! 
+ auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = + globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- + // subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = + static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = + static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) {} + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. Not the fastest thing in the world + // under heavy contention, but simple and correct (assuming nodes are never + // freed until after the free list is destroyed), and fairly speedy under low + // contention. 
+ template // N must inherit FreeListNode or have the same fields + // (and initialization of them) + struct FreeList { + FreeList() : freeListHead(nullptr) {} + FreeList(FreeList &&other) + : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { + other.freeListHead.store(nullptr, std::memory_order_relaxed); + } + void swap(FreeList &other) { + details::swap_relaxed(freeListHead, other.freeListHead); + } + + FreeList(FreeList const &) MOODYCAMEL_DELETE_FUNCTION; + FreeList &operator=(FreeList const &) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N *node) { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's + // safe to set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, + std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! + add_knowing_refcount_is_zero(node); + } + } + + inline N *try_get() { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || + !head->freeListRefs.compare_exchange_strong( + refs, refs + 1, std::memory_order_acquire)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which + // means we can read the next and not worry about it changing between + // now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, + std::memory_order_acquire, + std::memory_order_relaxed)) { + // Yay, got the node. 
This means it was on the list, which means + // shouldBeOnFreeList must be false no matter the refcount (because + // nobody else knows it's been taken off yet, it can't have been put + // back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & + SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's + // ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease + // the refcount we increased. Note that we don't need to release any + // memory effects, but we do need to ensure that the reference count + // decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to + // destroy remaining nodes) + N *head_unsafe() const { + return freeListHead.load(std::memory_order_relaxed); + } + + private: + inline void add_knowing_refcount_is_zero(N *node) { + // Since the refcount is zero, and nobody can increase it once it's zero + // (except us, and we run only one copy of this method per node at a time, + // i.e. the single thread case), then we know we can safely change the + // next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy + // contention, when the refcount goes to zero in between a load and a + // refcount increment of a node in try_get, then back up to something + // non-zero, then the refcount increment is done by the other thread) -- + // so, if the CAS to add the node to the actual list fails, decrease the + // refcount and leave the add operation to the next thread who puts the + // refcount back at zero (which could be us, hence the loop). 
+ auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, + std::memory_order_release, + std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount + // goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, + std::memory_order_acq_rel) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are + // inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugMutex mutex; +#endif + }; + + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; + + struct Block { + Block() + : next(nullptr), + elementsCompletelyDequeued(0), + freeListRefs(0), + freeListNext(nullptr), + dynamicallyAllocated(true) { +#ifdef MCDBGQ_TRACKMEM + owner = nullptr; +#endif + } + + template + inline bool is_empty() const { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened + // before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == + BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + 
assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= + BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit + // context) + template + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - + static_cast( + i & static_cast(BLOCK_SIZE - 1))] + .load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - + static_cast(i & + static_cast(BLOCK_SIZE - 1))] + .store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = + elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping + // and count > 0). Returns true if the block is now empty (does not apply in + // explicit context). 
+ template + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, + size_t count) { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - + static_cast(i & static_cast(BLOCK_SIZE - 1)) - + count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add( + count, std::memory_order_acq_rel); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template + inline void reset_empty() { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T *operator[](index_t idx) MOODYCAMEL_NOEXCEPT { + return static_cast(static_cast(elements)) + + static_cast(idx & static_cast(BLOCK_SIZE - 1)); + } + inline T const *operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { + return static_cast(static_cast(elements)) + + static_cast(idx & static_cast(BLOCK_SIZE - 1)); + } + + private: + static_assert(std::alignment_of::value <= sizeof(T), + "The queue does not support types with an 
alignment greater " + "than their size at this time"); + MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; + + public: + Block *next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags + [BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? BLOCK_SIZE : 1]; + + public: + std::atomic freeListRefs; + std::atomic freeListNext; + bool dynamicallyAllocated; // Perhaps a better name for this would be + // 'isNotPartOfInitialBlockPool' + +#ifdef MCDBGQ_TRACKMEM + void *owner; +#endif + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, + "Internal error: Blocks must be at least as aligned as the " + "type they are wrapping"); + + +#ifdef MCDBGQ_TRACKMEM + public: + struct MemStats; + + private: +#endif + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase { + ProducerBase(ConcurrentQueue *parent_, bool isExplicit_) + : tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + isExplicit(isExplicit_), + parent(parent_) {} + + virtual ~ProducerBase() {} + + template + inline bool dequeue(U &element) { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It &itemFirst, size_t max) { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, + max); + } else { + return static_cast(this)->dequeue_bulk(itemFirst, + max); + } + } + + inline ProducerBase *next_prod() const { + return static_cast(next); + } + + inline size_t size_approx() const { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) + ? 
static_cast(tail - head) + : 0; + } + + inline index_t getTail() const { + return tailIndex.load(std::memory_order_relaxed); + } + + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block *tailBlock; + + public: + bool isExplicit; + ConcurrentQueue *parent; + + protected: +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase { + explicit ExplicitProducer(ConcurrentQueue *parent_) + : ProducerBase(parent_, true), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) { + size_t poolBasedIndexSize = + details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of + // current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
+ if (this->tailBlock != + nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block *halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is + // partially dequeued (or the head block is the tail block and was + // fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & + (pr_blockIndexSize - 1); + while (details::circular_less_than( + pr_blockIndexEntries[i].base + BLOCK_SIZE, + this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than( + pr_blockIndexEntries[i].base, + this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the + // head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty< + explicit_context>()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast( + this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, + // we need to stop when we reach the tail index + auto lastValidIndex = + (this->tailIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) == 0 + ? 
BLOCK_SIZE + : static_cast( + this->tailIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && + (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + this->parent->add_block_to_free_list(block); + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U &&element) { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && + this->tailBlock->next->ConcurrentQueue::Block::template is_empty< + explicit_context>()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + + // We'll put the block on the block index (guaranteed to be room since + // we're conceptually removing the last block from it first -- except + // instead of removing then adding, we can just overwrite). Note that + // there must be a valid block index here, since even if allocation + // failed in the ctor, it would have been re-attempted when adding the + // first block to the queue; since there is such a block, a block + // index must have been successfully allocated. 
+ } else { + // Whatever head value we see here is >= the last value we saw here + // (relatively), and <= its current value. Since we have the most + // recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough + // leeway -- the tail could surpass the head by the time the block + // fills up! (Or we'll exceed the size limit, if the second part of + // the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has + // room + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be + // nullptr if the initial allocation failed in the constructor. 
+ + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index(pr_blockIndexSlotsUsed)) { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = + this->parent + ->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new (static_cast(nullptr)) T(std::forward(element)))) { + // The constructor may throw. We want the element not to appear in the + // queue in that case (without corrupting the queue): + MOODYCAMEL_TRY { + new ((*this->tailBlock)[currentTailIndex]) + T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) { + // Revert change to the current block, but leave the new block + // available for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? 
this->tailBlock : startBlock; + MOODYCAMEL_RETHROW; + } + } + else { + (void)startBlock; + (void)originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed) + ->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U &element) { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit, + tail)) { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the common + // case when the queue is empty and the values are eventually consistent + // -- we may enter here spuriously. + + // Note that whatever the values of overcommit and tail are, they are + // not going to change (unless we change them) and must be the same + // value at this point (inside the if) as when the if condition was + // evaluated. + + // We insert an acquire fence here to synchronize-with the release upon + // incrementing dequeueOvercommit below. 
This ensures that whatever the + // value we got loaded into overcommit, the load of dequeueOptimisticCount + // in the fetch_add below will result in a value at least as recent as + // that (and therefore at least as large). Note that I believe a + // compiler (signal) fence here would be sufficient due to the nature of + // fetch_add (all read-modify-write operations are guaranteed to work on + // the latest value in the modification order), but unfortunately that + // can't be shown to be correct using only the C++11 standard. See + // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + 1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= dequeueOptimisticCount + // (because dequeueOvercommit is only ever incremented after + // dequeueOptimisticCount -- this is enforced in the `else` block + // below), and since we now have a version of dequeueOptimisticCount + // that is at least as recent as overcommit (due to the release upon + // incrementing dequeueOvercommit and the acquire above that + // synchronizes with it), overcommit <= myDequeueCount. However, we + // can't assert this since both dequeueOptimisticCount and + // dequeueOvercommit may (independently) overflow; in such a case, + // though, the logic still holds since the difference between the two is + // maintained. + + // Note that we reload tail here in case it changed; it will be the same + // value as before or greater, since this load is sequenced after + // (happens after) the earlier load above. 
This is supported by + // read-read coherency (as defined in the standard), explained here: + // http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than( + myDequeueCount - overcommit, tail))) { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be at least + // one element, this will never exceed tail. We need to do an + // acquire-release fence here since it's possible that whatever + // condition got us to this point was for an earlier enqueued element + // (that we already see the memory effects for), but that by the time + // we increment somebody else has incremented it, and we need to see + // the memory effects for *that* element, which is in such a case is + // necessarily visible on the thread that incremented it in the first + // place with the more current condition (they must have acquired a + // tail that is at least as recent). + auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + + // Determine which block the element is in + + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing because + // of index wrap-around. 
When an index wraps, we need to preserve the + // sign of the offset when dividing it by the block size (in order to + // get a correct signed block count offset in all cases): + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>( + blockBaseIndex - headBase) / + static_cast::type>( + BLOCK_SIZE)); + auto block = localBlockIndex + ->entries[(localBlockIndexHead + offset) & + (localBlockIndex->size - 1)] + .block; + + // Dequeue + auto &el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, element = std::move(el))) { + // Make sure the element is still fully dequeued and destroyed even + // if the assignment throws + struct Guard { + Block *block; + index_t index; + + ~Guard() { + (*block)[index]->~T(); + block->ConcurrentQueue::Block::template set_empty< + explicit_context>(index); + } + } guard = {block, index}; + + element = std::move(el); // NOLINT + } else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty( + index); + } + + return true; + } else { + // Wasn't anything to dequeue after all; make the effective dequeue + // count eventually consistent + this->dequeueOvercommit.fetch_add( + 1, std::memory_order_release); // Release so that the fetch_add + // on dequeueOptimisticCount is + // guaranteed to happen before + // this write + } + } + + return false; + } + + template + bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) { + // First, we need to make sure we have enough room to enqueue all of the + // elements; this means pre-allocating blocks and putting them in the + // block index (but only if all the allocations succeeded). 
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block *firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && + this->tailBlock->next != firstAllocatedBlock && + this->tailBlock->next->ConcurrentQueue::Block::template is_empty< + explicit_context>()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? 
this->tailBlock + : firstAllocatedBlock; + + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = + !details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? 
firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need + // to update our fallback value too (since we keep the new index + // even if we later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = + this->parent + ->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty< + explicit_context>(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? this->tailBlock + : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. 
Reset each block's emptiness + // before we fill them up, and publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), + std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. 
+ // Sorry about the horrible templated next line, but it was the + // only way to disable moving *at compile time*, which is + // important because a type may only define a (noexcept) move + // constructor, and so calls to the cctor will not compile, even + // if they are in an if branch that will never be executed + new ((*this->tailBlock)[currentTailIndex]) T( + details::nomove_if(nullptr)) T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll + // keep any allocated blocks in our linked list for later, though). + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == + 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, + stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + if (firstAllocatedBlock != nullptr) + 
blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), + std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It &itemFirst, size_t max) { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at + // least actualCount elements, this will never exceed tail. 
+ auto firstIndex = + this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = + firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>( + firstBlockBaseIndex - headBase) / + static_cast::type>( + BLOCK_SIZE)); + auto indexIndex = + (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, + details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) 
{ + // It's too late to revert the dequeue, but we can make sure + // that all the dequeued objects are properly destroyed and the + // block index (and empty count) are properly updated before we + // propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } else { + // Wasn't anything to dequeue after all; make the effective dequeue + // count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, + std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry { + index_t base; + Block *block; + }; + + struct BlockIndexHeader { + size_t size; + std::atomic + front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry *entries; + void *prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast( + (Traits::malloc)(sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == 
nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast( + details::align_for(newRawPtr + + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = + (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, + std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one + // so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer only -- consumer must use the ones in referenced + // by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry *pr_blockIndexEntries; + void *pr_blockIndexRaw; + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ExplicitProducer *nextExplicitProducer; + + private: +#endif + +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase { + ImplicitProducer(ConcurrentQueue *parent_) + : ProducerBase(parent_, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) { + new_block_index(); + } + + ~ImplicitProducer() { + // Note that since we're in the destructor we can assume that all + // enqueue/dequeue 
operations completed already; this means that all + // undequeued elements are placed contiguously across contiguous blocks, + // and that only the first and last remaining blocks can be only partially + // empty (all other remaining blocks must be completely full). + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block *block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = + index != tail; // If we enter the loop, then the last (tail) block + // will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || + block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load( + std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the + // free list (unless the head index reached the end of it, in which case + // the tail will be poised to create a new block). 
+ if (this->tailBlock != nullptr && + (forceFreeLastBlock || + (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U &&element) { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry *idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = + this->parent + ->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock + ->ConcurrentQueue::Block::template reset_empty(); + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new 
(static_cast(nullptr)) T(std::forward(element)))) { + // May throw, try to insert now before we publish the fact that we + // have this new block + MOODYCAMEL_TRY { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U &element) { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit, + tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add( + 1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than( + myDequeueCount - overcommit, tail))) { + index_t index = + this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto &el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, element = std::move(el))) { 
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + // Note: Acquiring the mutex with every dequeue instead of only when + // a block is released is very sub-optimal, but it is, after all, + // purely debug code. + debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block *block; + index_t index; + BlockIndexEntry *entry; + ConcurrentQueue *parent; + + ~Guard() { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty< + implicit_context>(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = {block, index, entry, this->parent}; + + element = std::move(el); // NOLINT + } else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty< + implicit_context>(index)) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from + // block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + } + + return true; + } else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4706) // assignment within conditional expression +#endif + template + bool enqueue_bulk(It itemFirst, size_t count) { + // First, we need to make sure we have enough room to enqueue all of the + // elements; this means pre-allocating blocks and putting them in the + // block index (but only if all the allocations succeeded). 
+ + // Note that the tailBlock we start off with may not be owned by us any + // more; this happens if it was filled up exactly to the top (setting + // tailIndex to the first index of the next block which is not yet + // allocated), then dequeued completely (putting it on the free list) + // before we enqueue again. + + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block *firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry *idxEntry = + nullptr; // initialization here unnecessary but compiler can't + // always tell + Block *newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = + !details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + + if (full || + !(indexInserted = insert_block_index_entry( + idxEntry, currentTailIndex)) || + (newBlock = + this->parent->ConcurrentQueue::template requisition_block< + allocMode>()) == nullptr) { + // Index allocation or block allocation failed; revert any other + // allocations and index insertions done so far for this operation + if (indexInserted) { + 
rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + implicit_context>(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations + // fail, and so that we can find the blocks when we do the actual + // enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = + firstAllocatedBlock == nullptr ? 
newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) T( + details::nomove_if(nullptr)) T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) 
{ + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == + 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, + stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + template + size_t dequeue_bulk(It &itemFirst, size_t max) { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? 
desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at + // least actualCount elements, this will never exceed tail. + auto firstIndex = + this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader *localBlockIndex; + auto indexIndex = + get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, + details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) 
{ + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty< + implicit_context>( + blockStartIndex, + static_cast(endIndex - blockStartIndex))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = + (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty< + implicit_context>( + blockStartIndex, + static_cast(endIndex - blockStartIndex))) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning + // that anybody who acquires the block we're about to free can + // use it safely since our writes (and reads!) will have + // happened-before then. 
+ entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } else { + this->dequeueOvercommit.fetch_add(desiredCount, + std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an + // invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader { + size_t capacity; + std::atomic tail; + BlockIndexEntry *entries; + BlockIndexEntry **index; + BlockIndexHeader *prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry *&idxEntry, + index_t blockStartIndex) { + auto localBlockIndex = + blockIndex.load(std::memory_order_relaxed); // We're the only writer + // thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the + // constructor + } + size_t newTail = + (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) { + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! 
+ MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index()) { + return false; + } + else { + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == + INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + } + + inline void rewind_block_index_tail() { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store( + (localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & + (localBlockIndex->capacity - 1), + std::memory_order_relaxed); + } + + inline BlockIndexEntry *get_block_index_entry_for_index( + index_t index) const { + BlockIndexHeader *localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index( + index_t index, BlockIndexHeader *&localBlockIndex) const { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = + localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap + // around, causing a negative offset, whose negativity we want to preserve + auto offset = static_cast( + static_cast::type>(index - + tailBase) / + static_cast::type>(BLOCK_SIZE)); + size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + 
assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == + index && + localBlockIndex->index[idx]->value.load( + std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast( + (Traits::malloc)(sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry *) * nextBlockIndexCapacity)); + if (raw == nullptr) { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast( + details::align_for(raw + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast( + details::align_for( + reinterpret_cast(entries) + + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), + std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + public: + details::ThreadExitListener threadExitListener; + + private: +#endif + +#ifdef 
MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ImplicitProducer *nextImplicitProducer; + + private: +#endif + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + mutable debug::DebugMutex mutex; +#endif +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block *try_get_block_from_initial_pool() { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= + initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block *block) { +#ifdef MCDBGQ_TRACKMEM + block->owner = nullptr; +#endif + if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { + destroy(block); + } else { + freeList.add(block); + } + } + + inline void add_blocks_to_free_list(Block *block) { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block *try_get_block_from_free_list() { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if + // applicable) + template + Block *requisition_block() { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) { + return create(); + } + else { + return nullptr; + } + } + + +#ifdef MCDBGQ_TRACKMEM + public: + struct MemStats { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue *q) { + MemStats stats = {0}; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 
0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != + ImplicitProducer::INVALID_BLOCK_BASE && + hash->index[i]->value.load(std::memory_order_relaxed) != + nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += + hash->capacity * + sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += + sizeof(typename ImplicitProducer::BlockIndexHeader) + + hash->capacity * + sizeof(typename ImplicitProducer::BlockIndexEntry *); + } + } + for (; details::circular_less_than(head, tail); + head += BLOCK_SIZE) { + // auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } else { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty< + explicit_context>() || + wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += + sizeof(typename ExplicitProducer::BlockIndexHeader) + + index->size * + sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast( + index->prev); + } + } + } + + auto freeOnInitialPool = + 
q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= + q->initialBlockPoolSize + ? 0 + : q->initialBlockPoolSize - + q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() { + return MemStats::getFor(this); + } + + private: + friend struct MemStats; +#endif + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase *recycle_or_create_producer(bool isExplicit) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && + ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, + std::memory_order_acquire, + std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have + // it + return ptr; + } + } + } + + return add_producer( + isExplicit ? 
static_cast(create(this)) + : create(this)); + } + + ProducerBase *add_producer(ProducerBase *producer) { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak( + prevTail, producer, std::memory_order_release, + std::memory_order_relaxed)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = + prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak( + prevTailExplicit, static_cast(producer), + std::memory_order_release, std::memory_order_relaxed)); + } else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = + prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak( + prevTailImplicit, static_cast(producer), + std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! 
+ for (auto ptr = producerListTail.load(std::memory_order_relaxed); + ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP { + std::atomic key; + ImplicitProducer *value; // No need for atomicity since it's only read by + // the thread that sets it in the first place + + ImplicitProducerKVP() : value(nullptr) {} + + ImplicitProducerKVP(ImplicitProducerKVP &&other) MOODYCAMEL_NOEXCEPT { + key.store(other.key.load(std::memory_order_relaxed), + std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP &operator=(ImplicitProducerKVP &&other) + MOODYCAMEL_NOEXCEPT { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP &other) MOODYCAMEL_NOEXCEPT { + if (this != &other) { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap( + typename ConcurrentQueue::ImplicitProducerKVP &, + typename ConcurrentQueue::ImplicitProducerKVP &) + MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash { + size_t capacity; + ImplicitProducerKVP *entries; + ImplicitProducerHash *prev; + }; + + inline void populate_initial_implicit_producer_hash() { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store( + details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + } + + void swap_implicit_producer_hashes(ConcurrentQueue &other) { + 
MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + // Swap (assumes our implicit producer hash is initialized) + initialImplicitProducerHashEntries.swap( + other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = + &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = + &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, + other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == + &other.initialImplicitProducerHash) { + implicitProducerHash.store(&initialImplicitProducerHash, + std::memory_order_relaxed); + } else { + ImplicitProducerHash *hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &other.initialImplicitProducerHash; + hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == + &initialImplicitProducerHash) { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, + std::memory_order_relaxed); + } else { + ImplicitProducerHash *hash; + for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer *get_or_add_implicit_producer() { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash + // tables. 
If it's not found, it must not be in there yet, since this same + // thread would have added it previously to one of the tables that we + // traversed. + + // Code and algorithm adapted from + // http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + assert( + mainHash != + nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free + // in the hash table + index &= hash->capacity - 1u; + + auto probedKey = + hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we + // should lazily add it to the current main hash table to avoid the + // extended search next time. Note there's guaranteed to be room in + // the current hash table since every subsequent table implicitly + // reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, + std::memory_order_relaxed) || + mainHash->entries[index].key.compare_exchange_strong( + reusable, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { +#else + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = + 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && + !implicitProducerHashResizeInProgress.test_and_set( + std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end + // of this block, and hence when we reload implicitProducerHash it must + // be the most recent version (it only gets changed within this locked + // block). 
+ mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + size_t newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast( + (Traits::malloc)(sizeof(ImplicitProducerHash) + + std::alignment_of::value - + 1 + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear( + std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = static_cast(newCapacity); + newHash->entries = reinterpret_cast( + details::align_for( + raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, + std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we + // don't have to wait for the next table to finish being allocated by + // another thread (and if we just finished allocating above, the condition + // will always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + auto producer = + static_cast(recycle_or_create_producer(false)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + producer->threadExitListener.callback = + &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + 
details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong( + reusable, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { + implicitProducerHashCount.fetch_sub( + 1, + std::memory_order_relaxed); // already counted as a used slot + mainHash->entries[index].value = producer; + break; + } +#endif + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a + // new one. We need to wait for the allocating thread to finish (if it + // succeeds, we add, if not, we try to allocate ourselves). 
+ mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + void implicit_producer_thread_exited(ImplicitProducer *producer) { + // Remove from hash +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert(hash != nullptr); // The thread exit listener is only registered if + // we were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't on + // the current one yet and are trying to add an entry thinking there's a + // free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) { + auto index = hashedId; + do { + index &= hash->capacity - 1u; + probedKey = id; + if (hash->entries[index].key.compare_exchange_strong( + probedKey, details::invalid_thread_id2, + std::memory_order_seq_cst, std::memory_order_relaxed)) { + break; + } + ++index; + } while ( + probedKey != + details::invalid_thread_id); // Can happen if the hash has changed + // but we weren't put back in it yet, or + // if we weren't added to this hash in + // the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void *userData) { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline void *aligned_malloc(size_t size) { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= + std::alignment_of::value) + return (Traits::malloc)(size); + else { + size_t alignment = std::alignment_of::value; + void 
*raw = (Traits::malloc)(size + alignment - 1 + sizeof(void *)); + if (!raw) return nullptr; + char *ptr = details::align_for(reinterpret_cast(raw) + + sizeof(void *)); + *(reinterpret_cast(ptr) - 1) = raw; + return ptr; + } + } + + template + static inline void aligned_free(void *ptr) { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= + std::alignment_of::value) + return (Traits::free)(ptr); + else(Traits::free)(ptr ? *(reinterpret_cast(ptr) - 1) : nullptr); + } + + template + static inline U *create_array(size_t count) { + assert(count > 0); + U *p = static_cast(aligned_malloc(sizeof(U) * count)); + if (p == nullptr) return nullptr; + + for (size_t i = 0; i != count; ++i) new (p + i) U(); + return p; + } + + template + static inline void destroy_array(U *p, size_t count) { + if (p != nullptr) { + assert(count > 0); + for (size_t i = count; i != 0;) (p + --i)->~U(); + } + aligned_free(p); + } + + template + static inline U *create() { + void *p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U *create(A1 &&a1) { + void *p = aligned_malloc(sizeof(U)); + return p != nullptr ? 
new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U *p) { + if (p != nullptr) p->~U(); + aligned_free(p); + } + + private: + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block *initialBlockPool; + size_t initialBlockPoolSize; + +#ifndef MCDBGQ_USEDEBUGFREELIST + FreeList freeList; +#else + debug::DebugFreeList freeList; +#endif + + std::atomic implicitProducerHash; + std::atomic + implicitProducerHashCount; // Number of slots logically used + ImplicitProducerHash initialImplicitProducerHash; + std::array + initialImplicitProducerHashEntries; + std::atomic_flag implicitProducerHashResizeInProgress; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugMutex implicitProdMutex; +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + std::atomic explicitProducers; + std::atomic implicitProducers; +#endif +}; + + +template +ProducerToken::ProducerToken(ConcurrentQueue &queue) + : producer(queue.recycle_or_create_producer(true)) { + if (producer != nullptr) { + producer->token = this; + } +} + +template +ProducerToken::ProducerToken(BlockingConcurrentQueue &queue) + : producer(reinterpret_cast *>(&queue) + ->recycle_or_create_producer(true)) { + if (producer != nullptr) { + producer->token = this; + } +} + +template +ConsumerToken::ConsumerToken(ConcurrentQueue &queue) + : itemsConsumedFromCurrent(0), + currentProducer(nullptr), + desiredProducer(nullptr) { + initialOffset = + queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + +template +ConsumerToken::ConsumerToken(BlockingConcurrentQueue &queue) + : itemsConsumedFromCurrent(0), + currentProducer(nullptr), + desiredProducer(nullptr) { + initialOffset = + reinterpret_cast *>(&queue) + ->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = 
static_cast(-1); +} + +template +inline void swap(ConcurrentQueue &a, + ConcurrentQueue &b) MOODYCAMEL_NOEXCEPT { + a.swap(b); +} + +inline void swap(ProducerToken &a, ProducerToken &b) MOODYCAMEL_NOEXCEPT { + a.swap(b); +} + +inline void swap(ConsumerToken &a, ConsumerToken &b) MOODYCAMEL_NOEXCEPT { + a.swap(b); +} + +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP &a, + typename ConcurrentQueue::ImplicitProducerKVP &b) + MOODYCAMEL_NOEXCEPT { + a.swap(b); +} + +} // namespace moodycamel #if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) #pragma warning(pop) From 55e6f1b5bc3c33e1eefe0a0ef7f2662925f1257b Mon Sep 17 00:00:00 2001 From: "yinzefeng.yzf" Date: Tue, 10 Feb 2026 11:29:48 +0800 Subject: [PATCH 06/28] clang format --- src/core/algorithm/hnsw/hnsw_entity.h | 3 +-- .../zvec/ailego/buffer/concurrentqueue.h | 2 +- src/include/zvec/core/framework/index_storage.h | 17 +++++++++++------ 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/core/algorithm/hnsw/hnsw_entity.h b/src/core/algorithm/hnsw/hnsw_entity.h index d2c06c41..70ea3dcc 100644 --- a/src/core/algorithm/hnsw/hnsw_entity.h +++ b/src/core/algorithm/hnsw/hnsw_entity.h @@ -147,8 +147,7 @@ struct Neighbors { Neighbors(uint32_t cnt_in, const node_id_t *data_in) : cnt{cnt_in}, data{data_in} {} - Neighbors(IndexStorage::MemoryBlock &mem_block) - : neighbor_block{mem_block} { + Neighbors(IndexStorage::MemoryBlock &mem_block) : neighbor_block{mem_block} { auto hd = reinterpret_cast(neighbor_block.data()); cnt = hd->neighbor_cnt; data = hd->neighbors; diff --git a/src/include/zvec/ailego/buffer/concurrentqueue.h b/src/include/zvec/ailego/buffer/concurrentqueue.h index 90edaf97..3b587642 100644 --- a/src/include/zvec/ailego/buffer/concurrentqueue.h +++ b/src/include/zvec/ailego/buffer/concurrentqueue.h @@ -1706,7 +1706,7 @@ class ConcurrentQueue { // contention. 
template // N must inherit FreeListNode or have the same fields // (and initialization of them) - struct FreeList { + struct FreeList { FreeList() : freeListHead(nullptr) {} FreeList(FreeList &&other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { diff --git a/src/include/zvec/core/framework/index_storage.h b/src/include/zvec/core/framework/index_storage.h index 346b8da4..920580fe 100644 --- a/src/include/zvec/core/framework/index_storage.h +++ b/src/include/zvec/core/framework/index_storage.h @@ -37,7 +37,8 @@ class IndexStorage : public IndexModule { }; MemoryBlock() {} - MemoryBlock(ailego::VecBufferPoolHandle* buffer_pool_handle, int block_id, void *data) + MemoryBlock(ailego::VecBufferPoolHandle *buffer_pool_handle, int block_id, + void *data) : type_(MemoryBlockType::MBT_BUFFERPOOL) { buffer_pool_handle_ = buffer_pool_handle; buffer_block_id_ = block_id; @@ -65,7 +66,8 @@ class IndexStorage : public IndexModule { this->reset(std::move(rhs.data_)); break; case MemoryBlockType::MBT_BUFFERPOOL: - this->reset(std::move(rhs.buffer_pool_handle_), std::move(rhs.buffer_block_id_), std::move(rhs.data_)); + this->reset(std::move(rhs.buffer_pool_handle_), + std::move(rhs.buffer_block_id_), std::move(rhs.data_)); break; default: break; @@ -79,7 +81,8 @@ class IndexStorage : public IndexModule { this->reset(rhs.data_); break; case MemoryBlockType::MBT_BUFFERPOOL: - this->reset(rhs.buffer_pool_handle_, rhs.buffer_block_id_, rhs.data_); + this->reset(rhs.buffer_pool_handle_, rhs.buffer_block_id_, + rhs.data_); buffer_pool_handle_->acquire_one(buffer_block_id_); break; default: @@ -96,7 +99,8 @@ class IndexStorage : public IndexModule { this->reset(std::move(rhs.data_)); break; case MemoryBlockType::MBT_BUFFERPOOL: - this->reset(std::move(rhs.buffer_pool_handle_), std::move(rhs.buffer_block_id_), std::move(rhs.data_)); + this->reset(std::move(rhs.buffer_pool_handle_), + std::move(rhs.buffer_block_id_), std::move(rhs.data_)); break; default: break; @@ 
-124,7 +128,8 @@ class IndexStorage : public IndexModule { return data_; } - void reset(ailego::VecBufferPoolHandle* buffer_pool_handle, int block_id, void *data) { + void reset(ailego::VecBufferPoolHandle *buffer_pool_handle, int block_id, + void *data) { if (type_ == MemoryBlockType::MBT_BUFFERPOOL) { buffer_pool_handle->release_one(buffer_block_id_); } @@ -145,7 +150,7 @@ class IndexStorage : public IndexModule { MemoryBlockType type_{MBT_UNKNOWN}; void *data_{nullptr}; - mutable ailego::VecBufferPoolHandle* buffer_pool_handle_; + mutable ailego::VecBufferPoolHandle *buffer_pool_handle_; int buffer_block_id_{0}; }; From b24d921053dd2597357c524ebb78d65979948a92 Mon Sep 17 00:00:00 2001 From: "yinzefeng.yzf" Date: Tue, 10 Feb 2026 11:45:03 +0800 Subject: [PATCH 07/28] clang format --- .../zvec/ailego/buffer/concurrentqueue.h | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/include/zvec/ailego/buffer/concurrentqueue.h b/src/include/zvec/ailego/buffer/concurrentqueue.h index 3b587642..16f297e8 100644 --- a/src/include/zvec/ailego/buffer/concurrentqueue.h +++ b/src/include/zvec/ailego/buffer/concurrentqueue.h @@ -709,7 +709,7 @@ struct nomove_if { }; template -static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT -> decltype(*it) { +static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT->decltype(*it) { return *it; } @@ -2833,10 +2833,9 @@ class ConcurrentQueue { // Create the new block pr_blockIndexSize <<= 1; - auto newRawPtr = static_cast( - (Traits::malloc)(sizeof(BlockIndexHeader) + - std::alignment_of::value - 1 + - sizeof(BlockIndexEntry) * pr_blockIndexSize)); + auto newRawPtr = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + std::alignment_of::value - + 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); if (newRawPtr == nullptr) { pr_blockIndexSize >>= 1; // Reset to allow graceful retry return false; @@ -3556,12 +3555,11 @@ class ConcurrentQueue { auto prev = 
blockIndex.load(std::memory_order_relaxed); size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; - auto raw = static_cast( - (Traits::malloc)(sizeof(BlockIndexHeader) + - std::alignment_of::value - 1 + - sizeof(BlockIndexEntry) * entryCount + - std::alignment_of::value - 1 + - sizeof(BlockIndexEntry *) * nextBlockIndexCapacity)); + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + std::alignment_of::value - + 1 + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry *) * nextBlockIndexCapacity)); if (raw == nullptr) { return false; } From 8916f90025d24f0bada96ac3c3eac9aa1d9efd7a Mon Sep 17 00:00:00 2001 From: "yinzefeng.yzf" Date: Tue, 10 Feb 2026 15:53:54 +0800 Subject: [PATCH 08/28] clang format --- src/include/zvec/ailego/buffer/concurrentqueue.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/include/zvec/ailego/buffer/concurrentqueue.h b/src/include/zvec/ailego/buffer/concurrentqueue.h index 16f297e8..f7f3d77e 100644 --- a/src/include/zvec/ailego/buffer/concurrentqueue.h +++ b/src/include/zvec/ailego/buffer/concurrentqueue.h @@ -111,8 +111,8 @@ static inline thread_id_t thread_id() { #elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) // No sense pulling in windows.h in a header, we'll manually declare the // function we use and rely on backwards-compatibility for this not to break -extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId( - void); +extern "C" + __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); namespace moodycamel { namespace details { static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), @@ -709,7 +709,7 @@ struct nomove_if { }; template -static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT->decltype(*it) { +static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT -> decltype(*it) { return *it; 
} From e3d014ca629bdc7beda6fed6e32d95c0428470e3 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Tue, 10 Feb 2026 22:41:21 +0800 Subject: [PATCH 09/28] fix bugs --- src/ailego/buffer/buffer_pool.cc | 15 ++++++++------- .../algorithm/flat/flat_streamer_context.h | 10 +++++++++- src/core/algorithm/hnsw/hnsw_context.h | 4 ++++ src/core/interface/index.cc | 7 +++++-- src/core/utility/buffer_storage.cc | 18 ++++++------------ src/include/zvec/ailego/buffer/buffer_pool.h | 5 +++-- .../zvec/core/framework/index_storage.h | 2 +- .../index/column/vector_column_indexer_test.cc | 1 - 8 files changed, 36 insertions(+), 26 deletions(-) diff --git a/src/ailego/buffer/buffer_pool.cc b/src/ailego/buffer/buffer_pool.cc index 81ed92bf..bdbf0a03 100644 --- a/src/ailego/buffer/buffer_pool.cc +++ b/src/ailego/buffer/buffer_pool.cc @@ -66,7 +66,7 @@ void LPMap::init(size_t entry_num) { entries_[i].load_count.store(0); entries_[i].buffer = nullptr; } - cache_.init(entry_num); + cache_.init(entry_num * 4); } char *LPMap::acquire_block(block_id_t block_id) { @@ -136,9 +136,7 @@ void LPMap::recycle(moodycamel::ConcurrentQueue &free_buffers) { } } -VecBufferPool::VecBufferPool(const std::string &filename, size_t pool_capacity, - size_t block_size) - : pool_capacity_(pool_capacity) { +VecBufferPool::VecBufferPool(const std::string &filename) { fd_ = open(filename.c_str(), O_RDONLY); if (fd_ < 0) { throw std::runtime_error("Failed to open file: " + filename); @@ -148,9 +146,12 @@ VecBufferPool::VecBufferPool(const std::string &filename, size_t pool_capacity, throw std::runtime_error("Failed to stat file: " + filename); } file_size_ = st.st_size; +} - size_t buffer_num = pool_capacity_ / block_size; - size_t block_num = file_size_ / block_size + 500; +int VecBufferPool::init(size_t pool_capacity, size_t block_size) { + pool_capacity_ = pool_capacity; + size_t buffer_num = pool_capacity_ / block_size + 10; + size_t block_num = file_size_ / block_size + 10; lp_map_.init(block_num); for (size_t i = 
0; i < buffer_num; i++) { char *buffer = (char *)aligned_alloc(64, block_size); @@ -160,6 +161,7 @@ VecBufferPool::VecBufferPool(const std::string &filename, size_t pool_capacity, } LOG_DEBUG("Buffer pool num: %zu, entry num: %zu", buffer_num, lp_map_.entry_num()); + return 0; } VecBufferPoolHandle VecBufferPool::get_handle() { @@ -209,7 +211,6 @@ char *VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset, int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) { ssize_t read_bytes = pread(fd_, buffer, length, offset); if (read_bytes != static_cast(length)) { - LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset); LOG_ERROR("Buffer pool failed to read file at offset: %zu", offset); return -1; } diff --git a/src/core/algorithm/flat/flat_streamer_context.h b/src/core/algorithm/flat/flat_streamer_context.h index 24cfd9e5..22a1106a 100644 --- a/src/core/algorithm/flat/flat_streamer_context.h +++ b/src/core/algorithm/flat/flat_streamer_context.h @@ -190,10 +190,18 @@ class FlatStreamerContext : public IndexStreamer::Context { group_topk_heaps_.clear(); } - void reset() override {} + void reset() override { + for (auto &it : results_) { + it.clear(); + } + for (auto &it : group_results_) { + it.clear(); + } + } //! Reset the context void reset(const FlatStreamer *owner) { + this->reset(); magic_ = owner->magic(); feature_size_ = owner->meta().element_size(); diff --git a/src/core/algorithm/hnsw/hnsw_context.h b/src/core/algorithm/hnsw/hnsw_context.h index 22bcfaad..e776b81a 100644 --- a/src/core/algorithm/hnsw/hnsw_context.h +++ b/src/core/algorithm/hnsw/hnsw_context.h @@ -335,6 +335,7 @@ class HnswContext : public IndexContext { //! 
Reset context void reset(void) override { + this->clear(); set_filter(nullptr); reset_threshold(); set_fetch_vector(false); @@ -422,6 +423,9 @@ class HnswContext : public IndexContext { for (auto &it : results_) { it.clear(); } + for (auto &it : group_results_) { + it.clear(); + } } uint32_t *mutable_stats_get_neighbors() { diff --git a/src/core/interface/index.cc b/src/core/interface/index.cc index 038f67d4..72005bc9 100644 --- a/src/core/interface/index.cc +++ b/src/core/interface/index.cc @@ -406,8 +406,9 @@ int Index::Search(const VectorData &vector_data, } // dense support refiner, but sparse doesn't + int ret = 0; if (search_param->refiner_param == nullptr) { - return _dense_search(vector_data, search_param, result, context); + ret = _dense_search(vector_data, search_param, result, context); } else { auto &reference_index = search_param->refiner_param->reference_index; if (reference_index == nullptr) { @@ -441,8 +442,10 @@ int Index::Search(const VectorData &vector_data, // TODO: should copy other params? 
flat_search_param->bf_pks = std::make_shared>(keys); - return reference_index->Search(vector_data, flat_search_param, result); + ret = reference_index->Search(vector_data, flat_search_param, result); } + context->reset(); + return ret; } diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index dcdb13d3..1fccbe2e 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -98,7 +98,7 @@ class BufferStorage : public IndexStorage { } size_t buffer_offset = segment_header_start_offset_ + segment_header_->content_offset + - segment_->meta()->data_index + offset; + segment_->meta()->data_index; *data = owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset; return len; @@ -114,7 +114,7 @@ class BufferStorage : public IndexStorage { } size_t buffer_offset = segment_header_start_offset_ + segment_header_->content_offset + - segment_->meta()->data_index + offset; + segment_->meta()->data_index; data.reset( owner_->buffer_pool_handle_.get(), segment_id_, owner_->get_buffer(buffer_offset, capacity_, segment_id_) + offset); @@ -177,21 +177,15 @@ class BufferStorage : public IndexStorage { //! 
Open storage int open(const std::string &path, bool /*create*/) override { - LOG_INFO("open buffer storage 1"); file_name_ = path; - buffer_pool_ = std::make_shared( - path, 20lu * 1024 * 1024 * 1024, 2490368 * 2); + buffer_pool_ = std::make_shared(path); buffer_pool_handle_ = std::make_shared( buffer_pool_->get_handle()); int ret = ParseToMapping(); - LOG_ERROR("segment count: %lu, max_segment_size: %lu", segments_.size(), - max_segment_size_); - for (auto iter = segments_.begin(); iter != segments_.end(); iter++) { - auto seg = this->get(iter->first, 0); - MemoryBlock block; - int len = seg->read(0, block, 1); - LOG_ERROR("segment %s: %d", iter->first.c_str(), len); + if (ret != 0) { + return ret; } + ret = buffer_pool_->init(20lu * 1024 * 1024 * 1024, max_segment_size_); if (ret != 0) { return ret; } diff --git a/src/include/zvec/ailego/buffer/buffer_pool.h b/src/include/zvec/ailego/buffer/buffer_pool.h index f1a0149c..c27065a2 100644 --- a/src/include/zvec/ailego/buffer/buffer_pool.h +++ b/src/include/zvec/ailego/buffer/buffer_pool.h @@ -97,12 +97,13 @@ class VecBufferPool { public: typedef std::shared_ptr Pointer; - VecBufferPool(const std::string &filename, size_t pool_capacity, - size_t block_size); + VecBufferPool(const std::string &filename); ~VecBufferPool() { close(fd_); } + int init(size_t pool_capacity, size_t block_size); + VecBufferPoolHandle get_handle(); char *acquire_buffer(block_id_t block_id, size_t offset, size_t size, diff --git a/src/include/zvec/core/framework/index_storage.h b/src/include/zvec/core/framework/index_storage.h index 920580fe..9173da3e 100644 --- a/src/include/zvec/core/framework/index_storage.h +++ b/src/include/zvec/core/framework/index_storage.h @@ -131,7 +131,7 @@ class IndexStorage : public IndexModule { void reset(ailego::VecBufferPoolHandle *buffer_pool_handle, int block_id, void *data) { if (type_ == MemoryBlockType::MBT_BUFFERPOOL) { - buffer_pool_handle->release_one(buffer_block_id_); + 
buffer_pool_handle_->release_one(buffer_block_id_); } type_ = MemoryBlockType::MBT_BUFFERPOOL; buffer_pool_handle_ = buffer_pool_handle; diff --git a/tests/db/index/column/vector_column_indexer_test.cc b/tests/db/index/column/vector_column_indexer_test.cc index 483efcde..251e5a18 100644 --- a/tests/db/index/column/vector_column_indexer_test.cc +++ b/tests/db/index/column/vector_column_indexer_test.cc @@ -2160,7 +2160,6 @@ TEST(VectorColumnIndexerTest, Failure) { ASSERT_TRUE(indexer->Flush().ok()); ASSERT_TRUE(indexer->Close().ok()); { - ailego::BufferManager::Instance().init(10 * 1024 * 1024, 1); auto indexer = std::make_shared( index_file_path, FieldSchema("test", DataType::VECTOR_FP32, 3, false, From ed6a3f205e9aaec904316cf5df318dbc073207d3 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Wed, 11 Feb 2026 10:58:59 +0800 Subject: [PATCH 10/28] =?UTF-8?q?fix=20complie=E2=80=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/utility/buffer_storage.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 1fccbe2e..4db38cb0 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include -// #include +#include #include #include #include @@ -476,7 +476,7 @@ class BufferStorage : public IndexStorage { IndexFormat::MetaFooter footer_; std::map segments_{}; std::map id_hash_{}; - size_t max_segment_size_{0}; + uint64_t max_segment_size_{0}; std::unique_ptr segment_buffer_{nullptr}; ailego::VecBufferPool::Pointer buffer_pool_{nullptr}; @@ -487,4 +487,4 @@ class BufferStorage : public IndexStorage { INDEX_FACTORY_REGISTER_STORAGE(BufferStorage); } // namespace core -} // namespace zvec \ No newline at end of file +} // namespace zvec From 95b1c16dafcd417e15f04d5bda276cc1d8431774 Mon Sep 17 00:00:00 2001 From: "yinzefeng.yzf" Date: Wed, 11 Feb 2026 11:31:05 +0800 Subject: [PATCH 11/28] clang format --- src/core/utility/buffer_storage.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 4db38cb0..d339553a 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include #include +#include #include #include #include From d6db41d5da5e9fc6fd46aa62728da4d55b52c08e Mon Sep 17 00:00:00 2001 From: "yinzefeng.yzf" Date: Wed, 11 Feb 2026 11:56:00 +0800 Subject: [PATCH 12/28] fix ut --- tests/core/metric/quantized_integer_metric_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/metric/quantized_integer_metric_test.cc b/tests/core/metric/quantized_integer_metric_test.cc index d0deac84..62a5a3b8 100644 --- a/tests/core/metric/quantized_integer_metric_test.cc +++ b/tests/core/metric/quantized_integer_metric_test.cc @@ -516,7 +516,7 @@ void TestDistanceMatrixInt4(const std::string &metric_name) { matrix_compute(&matrix2[0], &query2[0], meta2.dimension(), &result2[0]); for (size_t i = 0; i < batch_size * query_size; ++i) { - EXPECT_NEAR(result1[i], result2[i], 1e-4); + EXPECT_NEAR(result1[i], result2[i], 1e-3); EXPECT_TRUE(IsAlmostEqual(result1[i], result2[i], 1e4)); } } From 8dd8e480236a0585292e7b1b7f75043eb4551cb3 Mon Sep 17 00:00:00 2001 From: lichen2015 Date: Thu, 12 Feb 2026 10:38:05 +0800 Subject: [PATCH 13/28] fix: combined indexer should use key instead of index (#87) Co-authored-by: yinzefeng.yzf --- src/core/algorithm/flat/flat_streamer.cc | 6 +- .../algorithm/flat/flat_streamer_context.h | 2 +- .../combined_vector_column_indexer.cc | 88 +++++++++++++------ .../combined_vector_column_indexer.h | 1 + .../core/algorithm/flat/flat_streamer_test.cc | 4 +- tests/db/index/segment/segment_test.cc | 60 +++++++++++++ 6 files changed, 128 insertions(+), 33 deletions(-) diff --git a/src/core/algorithm/flat/flat_streamer.cc b/src/core/algorithm/flat/flat_streamer.cc index a721cf5b..8969efc1 100644 --- a/src/core/algorithm/flat/flat_streamer.cc +++ b/src/core/algorithm/flat/flat_streamer.cc @@ -376,7 +376,7 @@ int FlatStreamer::search_bf_by_p_keys_impl( if (!filter.is_valid() || !filter(key)) { dist_t dist = 0; IndexStorage::MemoryBlock block; - entity_.get_vector_by_key(key, block); + if 
(entity_.get_vector_by_key(key, block) != 0) continue; entity_.row_major_distance(query, block.data(), 1, &dist); heap->emplace(key, dist); } @@ -418,7 +418,7 @@ int FlatStreamer::group_by_search_impl( if (!bf_context->filter().is_valid() || !bf_context->filter()(key)) { dist_t dist = 0; IndexStorage::MemoryBlock block; - entity_.get_vector_by_key(key, block); + if (entity_.get_vector_by_key(key, block) != 0) continue; entity_.row_major_distance(query, block.data(), 1, &dist); std::string group_id = group_by(key); @@ -466,7 +466,7 @@ int FlatStreamer::group_by_search_p_keys_impl( if (!bf_context->filter().is_valid() || !bf_context->filter()(key)) { dist_t dist = 0; IndexStorage::MemoryBlock block; - entity_.get_vector_by_key(key, block); + if (entity_.get_vector_by_key(key, block) != 0) continue; entity_.row_major_distance(query, block.data(), 1, &dist); std::string group_id = group_by(key); diff --git a/src/core/algorithm/flat/flat_streamer_context.h b/src/core/algorithm/flat/flat_streamer_context.h index 22a1106a..42149cc6 100644 --- a/src/core/algorithm/flat/flat_streamer_context.h +++ b/src/core/algorithm/flat/flat_streamer_context.h @@ -122,7 +122,7 @@ class FlatStreamerContext : public IndexStreamer::Context { owner_->entity().get_vector_by_key(key, block); results_[idx].emplace_back(key, score, key, block); } else { - results_[idx].emplace_back(key, score); + results_[idx].emplace_back(key, score, key); } } } diff --git a/src/db/index/column/vector_column/combined_vector_column_indexer.cc b/src/db/index/column/vector_column/combined_vector_column_indexer.cc index f1385b01..70c71d07 100644 --- a/src/db/index/column/vector_column/combined_vector_column_indexer.cc +++ b/src/db/index/column/vector_column/combined_vector_column_indexer.cc @@ -40,22 +40,53 @@ CombinedVectorColumnIndexer::CombinedVectorColumnIndexer( } } + int block_offset = 0; + for (size_t i = 0; i < indexers_.size(); ++i) { + auto &block_meta = blocks_[i]; + 
block_offsets_.push_back(block_offset); + block_offset += block_meta.doc_count_; + } + min_doc_id_ = segment_meta.min_doc_id(); } - Result CombinedVectorColumnIndexer::Search( const vector_column_params::VectorData &vector_data, const vector_column_params::QueryParams &query_params) { core::IndexDocumentList doc_list; std::vector reverted_vector_list; std::vector reverted_sparse_values_list; - int block_offset = 0; + + // query_params.bf_pks is segment level, here we need to convert it to block + // level + std::vector> block_bf_pks(indexers_.size()); + + if (!query_params.bf_pks.empty()) { + // dispatcher pks to corresponding block_bf_pks + for (auto &pk : query_params.bf_pks[0]) { + for (size_t i = 0; i < block_offsets_.size(); ++i) { + if (pk >= block_offsets_[i] && + pk < block_offsets_[i] + blocks_[i].doc_count_) { + block_bf_pks[i].push_back( + static_cast(pk - block_offsets_[i])); + break; + } + } + } + } auto q_params = query_params.query_params; for (size_t i = 0; i < indexers_.size(); ++i) { - auto &block_meta = blocks_[i]; + if (!query_params.bf_pks.empty() && block_bf_pks[i].empty()) { + LOG_DEBUG( + "query_params has bf_pks, but block_bf_pks[%zu] is empty, just skip " + "this indexer", + i); + continue; + } zvec::Result result{nullptr}; + float scale_factor{}; + bool need_refine{false}; if (q_params && q_params->is_using_refiner()) { if (normal_indexers_.size() != indexers_.size()) { return tl::make_unexpected(Status::InvalidArgument( @@ -63,7 +94,6 @@ Result CombinedVectorColumnIndexer::Search( "] not match indexers size[", indexers_.size(), "]")); } // query_params of HNSW doesn't have scale_factor - float scale_factor{}; if (q_params->type() == IndexType::FLAT) { scale_factor = std::dynamic_pointer_cast(q_params) ->scale_factor(); @@ -71,29 +101,34 @@ Result CombinedVectorColumnIndexer::Search( scale_factor = std::dynamic_pointer_cast(q_params)->scale_factor(); } - vector_column_params::QueryParams modified_query_params{ - query_params.data_type, - 
query_params.dimension, - query_params.topk, - query_params.filter, - query_params.fetch_vector, - query_params.query_params, - query_params.group_by - ? std::make_unique( - query_params.group_by->group_topk, - query_params.group_by->group_count, - query_params.group_by->group_by) - : nullptr, - query_params.bf_pks, - std::shared_ptr( - new vector_column_params::RefinerParam{scale_factor, - normal_indexers_[i]}), - query_params.extra_params}; - result = indexers_[i]->Search(vector_data, modified_query_params); - } else { - result = indexers_[i]->Search(vector_data, query_params); + need_refine = true; } + vector_column_params::QueryParams modified_query_params{ + query_params.data_type, + query_params.dimension, + query_params.topk, + query_params.filter, + query_params.fetch_vector, + query_params.query_params, + query_params.group_by + ? std::make_unique( + query_params.group_by->group_topk, + query_params.group_by->group_count, + query_params.group_by->group_by) + : nullptr, + {}, + need_refine ? 
std::shared_ptr( + new vector_column_params::RefinerParam{ + scale_factor, normal_indexers_[i]}) + : nullptr, + query_params.extra_params}; + + if (!query_params.bf_pks.empty()) { + modified_query_params.bf_pks.emplace_back(block_bf_pks[i]); + } + + result = indexers_[i]->Search(vector_data, modified_query_params); if (!result) { return tl::make_unexpected(result.error()); } @@ -105,10 +140,9 @@ Result CombinedVectorColumnIndexer::Search( const auto &sub_docs = vector_index_results->docs(); for (size_t j = 0; j < sub_docs.size(); ++j) { auto doc = sub_docs[j]; - doc.set_index(block_offset + sub_docs[j].index()); + doc.set_key(block_offsets_[i] + sub_docs[j].key()); doc_list.emplace_back(std::move(doc)); } - block_offset += block_meta.doc_count_; auto &&temp_vector_list = vector_index_results->reverted_vector_list(); reverted_vector_list.insert( diff --git a/src/db/index/column/vector_column/combined_vector_column_indexer.h b/src/db/index/column/vector_column/combined_vector_column_indexer.h index 2e723c19..b0b0589f 100644 --- a/src/db/index/column/vector_column/combined_vector_column_indexer.h +++ b/src/db/index/column/vector_column/combined_vector_column_indexer.h @@ -52,6 +52,7 @@ class CombinedVectorColumnIndexer { std::vector indexers_; std::vector normal_indexers_; std::vector blocks_; + std::vector block_offsets_; MetricType metric_type_{MetricType::UNDEFINED}; bool is_quantized_{false}; uint64_t min_doc_id_{0}; diff --git a/tests/core/algorithm/flat/flat_streamer_test.cc b/tests/core/algorithm/flat/flat_streamer_test.cc index 022c1063..f03012d7 100644 --- a/tests/core/algorithm/flat/flat_streamer_test.cc +++ b/tests/core/algorithm/flat/flat_streamer_test.cc @@ -847,8 +847,8 @@ TEST_F(FlatStreamerTest, TestMaxIndexSize) { writeCnt1 * 128 * 4 + writeCnt1 * 8 + writeCnt1 * 28 / 32; LOG_INFO("increment1: %lu, expect_size: %lu", increment1, expect_size); - ASSERT_GT(expect_size, increment1 * 0.8f); - ASSERT_LT(expect_size, increment1 * 1.2f); + 
ASSERT_GT(expect_size, increment1 * 0.75f); + ASSERT_LT(expect_size, increment1 * 1.25f); streamer->flush(0UL); streamer.reset(); diff --git a/tests/db/index/segment/segment_test.cc b/tests/db/index/segment/segment_test.cc index 5db3f0be..6ca6fffe 100644 --- a/tests/db/index/segment/segment_test.cc +++ b/tests/db/index/segment/segment_test.cc @@ -1170,6 +1170,66 @@ TEST_P(SegmentTest, CombinedVectorColumnIndexerWithQuantVectorIndex) { ASSERT_EQ(count, 10); } +TEST_P(SegmentTest, CombinedVectorColumnIndexerQueryWithPks) { + options.max_buffer_size_ = 10 * 1024; + + auto tmp_schema = test::TestHelper::CreateSchemaWithVectorIndex( + false, "demo", std::make_shared(MetricType::IP)); + + auto segment = test::TestHelper::CreateSegmentWithDoc( + col_path, *tmp_schema, 0, 0, id_map, delete_store, version_manager, + options, 0, 0); + ASSERT_TRUE(segment != nullptr); + + + uint64_t MAX_DOC = 1000; + test::TestHelper::SegmentInsertDoc(segment, *schema, 0, MAX_DOC); + + auto combined_indexer = segment->get_combined_vector_indexer("dense_fp32"); + ASSERT_TRUE(combined_indexer != nullptr); + + Doc verify_doc = test::TestHelper::CreateDoc(999, *schema); + std::vector> bf_pks = { + {10, 20, 30, 40, 50, 60, 70, 80, 90, 999}}; + // query + auto dense_fp32_field = schema->get_field("dense_fp32"); + auto query_vector = verify_doc.get>("dense_fp32").value(); + auto query = vector_column_params::VectorData{ + vector_column_params::DenseVector{.data = query_vector.data()}}; + auto query_params = vector_column_params::QueryParams{ + .data_type = dense_fp32_field->data_type(), + .dimension = dense_fp32_field->dimension(), + .topk = 10, + .filter = nullptr, + .fetch_vector = false, + .query_params = std::make_shared(IndexType::HNSW), + .group_by = nullptr, + .bf_pks = bf_pks, + .refiner_param = nullptr, + .extra_params = {}}; + + auto results = combined_indexer->Search(query, query_params); + ASSERT_TRUE(results.has_value()); + + auto vector_results = + dynamic_cast(results.value().get()); 
+ ASSERT_TRUE(vector_results); + ASSERT_EQ(vector_results->count(), 10); + + int count = 0; + std::vector result_doc_ids; + auto iter = vector_results->create_iterator(); + while (iter->valid()) { + count++; + result_doc_ids.push_back(iter->doc_id()); + iter->next(); + } + ASSERT_EQ(count, 10); + // need reverse result_doc_ids + std::reverse(result_doc_ids.begin(), result_doc_ids.end()); + ASSERT_EQ(result_doc_ids, bf_pks[0]); +} + TEST_P(SegmentTest, ConcurrentInsertOperations) { auto segment = test::TestHelper::CreateSegmentWithDoc( From 753cc0d6bd314ab1ec9e85fbab76fd4d424620ed Mon Sep 17 00:00:00 2001 From: Cuiys Date: Thu, 12 Feb 2026 18:33:41 +0800 Subject: [PATCH 14/28] feat: support ai extension (#88) --- .github/workflows/linux_arm64_docker_ci.yml | 11 +- .github/workflows/linux_x64_docker_ci.yml | 11 +- pyproject.toml | 14 + python/tests/test_embedding.py | 2026 ++++++++++++++++- python/tests/test_reranker.py | 934 +++++++- python/tests/test_util.py | 5 - python/zvec/__init__.py | 37 +- python/zvec/common/constants.py | 12 + python/zvec/extension/__init__.py | 29 +- .../zvec/extension/bm25_embedding_function.py | 375 +++ python/zvec/extension/embedding.py | 188 -- python/zvec/extension/embedding_function.py | 148 ++ .../zvec/extension/multi_vector_reranker.py | 174 ++ .../extension/openai_embedding_function.py | 238 ++ python/zvec/extension/openai_function.py | 149 ++ .../zvec/extension/qwen_embedding_function.py | 537 +++++ python/zvec/extension/qwen_function.py | 186 ++ python/zvec/extension/qwen_rerank_function.py | 162 ++ python/zvec/extension/rerank.py | 343 --- python/zvec/extension/rerank_function.py | 69 + ...sentence_transformer_embedding_function.py | 839 +++++++ .../sentence_transformer_function.py | 150 ++ .../sentence_transformer_rerank_function.py | 384 ++++ python/zvec/tool/util.py | 2 +- 24 files changed, 6358 insertions(+), 665 deletions(-) create mode 100644 python/zvec/extension/bm25_embedding_function.py delete mode 100644 
python/zvec/extension/embedding.py create mode 100644 python/zvec/extension/embedding_function.py create mode 100644 python/zvec/extension/multi_vector_reranker.py create mode 100644 python/zvec/extension/openai_embedding_function.py create mode 100644 python/zvec/extension/openai_function.py create mode 100644 python/zvec/extension/qwen_embedding_function.py create mode 100644 python/zvec/extension/qwen_function.py create mode 100644 python/zvec/extension/qwen_rerank_function.py delete mode 100644 python/zvec/extension/rerank.py create mode 100644 python/zvec/extension/rerank_function.py create mode 100644 python/zvec/extension/sentence_transformer_embedding_function.py create mode 100644 python/zvec/extension/sentence_transformer_function.py create mode 100644 python/zvec/extension/sentence_transformer_rerank_function.py diff --git a/.github/workflows/linux_arm64_docker_ci.yml b/.github/workflows/linux_arm64_docker_ci.yml index 96a0f32d..5e02a95c 100644 --- a/.github/workflows/linux_arm64_docker_ci.yml +++ b/.github/workflows/linux_arm64_docker_ci.yml @@ -69,9 +69,9 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} shell: bash - - name: Install Ruff + - name: Install dependencies run: | - ${{ env.PIP_BIN }} install --upgrade pip ruff + ${{ env.PIP_BIN }} install --upgrade pip ruff==v0.14.4 clang-format==18.1.8 pybind11==3.0 pytest pytest-cov shell: bash - name: Run Ruff Linter @@ -88,7 +88,6 @@ jobs: - name: Run clang-format Check run: | - ${{ env.PIP_BIN }} install clang-format==18.1.8 cd "$CLEAN_WORKSPACE" @@ -120,11 +119,6 @@ jobs: ${{ env.PIP_BIN }} install -v . 
--config-settings='cmake.define.BUILD_TOOLS="ON"' shell: bash - - name: Install test dependencies - run: | - ${{ env.PIP_BIN }} install pytest pytest-cov - shell: bash - - name: Run Python Tests with Coverage run: | cd "$CLEAN_WORKSPACE" @@ -133,7 +127,6 @@ jobs: - name: Run Cpp Tests run: | - ${{ env.PIP_BIN }} install pybind11==3.0 cd "$CLEAN_WORKSPACE/build" make unittest -j$(nproc) shell: bash diff --git a/.github/workflows/linux_x64_docker_ci.yml b/.github/workflows/linux_x64_docker_ci.yml index 2edd0995..2c5bb2fd 100644 --- a/.github/workflows/linux_x64_docker_ci.yml +++ b/.github/workflows/linux_x64_docker_ci.yml @@ -69,9 +69,9 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} shell: bash - - name: Install Ruff + - name: Install dependencies run: | - ${{ env.PIP_BIN }} install --upgrade pip ruff + ${{ env.PIP_BIN }} install --upgrade pip ruff==v0.14.4 clang-format==18.1.8 pybind11==3.0 pytest pytest-cov shell: bash - name: Run Ruff Linter @@ -88,7 +88,6 @@ jobs: - name: Run clang-format Check run: | - ${{ env.PIP_BIN }} install clang-format==18.1.8 cd "$CLEAN_WORKSPACE" @@ -120,11 +119,6 @@ jobs: ${{ env.PIP_BIN }} install -v . 
--config-settings='cmake.define.BUILD_TOOLS="ON"' shell: bash - - name: Install test dependencies - run: | - ${{ env.PIP_BIN }} install pytest pytest-cov - shell: bash - - name: Run Python Tests with Coverage run: | cd "$CLEAN_WORKSPACE" @@ -133,7 +127,6 @@ jobs: - name: Run Cpp Tests run: | - ${{ env.PIP_BIN }} install pybind11==3.0 cd "$CLEAN_WORKSPACE/build" make unittest -j$(nproc) shell: bash diff --git a/pyproject.toml b/pyproject.toml index dee6728d..de147145 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -217,10 +217,21 @@ ignore = [ "E731", # Lambda assignment (used in callbacks) "B019", # `functools.lru_cache` on methods (handled manually) "PLR0912", # Too many branches + "PLC0105", # Ignore contravariant + "RUF002", # Ignore Unicode ] fixable = ["ALL"] unfixable = [] +# Ignore all errors in docstrings +[tool.ruff.lint.pydocstyle] +convention = "google" # or "numpy", "pep257" +ignore-decorators = ["typing.overload"] + +[tool.ruff.lint.flake8-type-checking] +# Don't check code examples in docstrings +quote-annotations = true + [tool.ruff.lint.isort] required-imports = ["from __future__ import annotations"] known-first-party = ["zvec"] @@ -237,6 +248,9 @@ known-first-party = ["zvec"] "python/zvec/model/doc.py" = [ "RUF023", # Unused sort (for __slot__) ] +"python/zvec/extension/**" = [ + "PLC0415", # Import outside top-level (dynamic imports in _get_model) +] [tool.ruff.format] indent-style = "space" diff --git a/python/tests/test_embedding.py b/python/tests/test_embedding.py index 0eb5d6b8..e0a57a17 100644 --- a/python/tests/test_embedding.py +++ b/python/tests/test_embedding.py @@ -15,20 +15,31 @@ import os from http import HTTPStatus -from unittest.mock import MagicMock, patch - +from unittest.mock import MagicMock, patch, Mock +import numpy as np import pytest -from zvec.extension import QwenEmbeddingFunction +from zvec.extension import ( + BM25EmbeddingFunction, + DefaultLocalDenseEmbedding, + DefaultLocalSparseEmbedding, + OpenAIDenseEmbedding, + 
QwenDenseEmbedding, + QwenSparseEmbedding, +) + +# Environment variable to control integration tests +# Set ZVEC_RUN_INTEGRATION_TESTS=1 to run real API/model tests +RUN_INTEGRATION_TESTS = os.environ.get("ZVEC_RUN_INTEGRATION_TESTS", "0") == "1" # ---------------------------- -# QwenEmbeddingFunction Test Case +# QwenDenseEmbedding Test Case # ---------------------------- -class TestQwenEmbeddingFunction: +class TestQwenDenseEmbedding: def test_init_with_api_key(self): # Test initialization with explicit API key - embedding_func = QwenEmbeddingFunction(dimension=128, api_key="test_key") + embedding_func = QwenDenseEmbedding(dimension=128, api_key="test_key") assert embedding_func.dimension == 128 assert embedding_func.model == "text-embedding-v4" assert embedding_func._api_key == "test_key" @@ -36,33 +47,28 @@ def test_init_with_api_key(self): @patch.dict(os.environ, {"DASHSCOPE_API_KEY": "env_key"}) def test_init_with_env_api_key(self): # Test initialization with API key from environment - embedding_func = QwenEmbeddingFunction(dimension=128) + embedding_func = QwenDenseEmbedding(dimension=128) assert embedding_func._api_key == "env_key" - def test_init_without_api_key(self): - # Test initialization without API key raises ValueError - with pytest.raises(ValueError, match="DashScope API key is required"): - QwenEmbeddingFunction(dimension=128) - @patch.dict(os.environ, {"DASHSCOPE_API_KEY": ""}) def test_init_with_empty_env_api_key(self): # Test initialization with empty API key from environment with pytest.raises(ValueError, match="DashScope API key is required"): - QwenEmbeddingFunction(dimension=128) + QwenDenseEmbedding(dimension=128) def test_model_property(self): - embedding_func = QwenEmbeddingFunction(dimension=128, api_key="test_key") + embedding_func = QwenDenseEmbedding(dimension=128, api_key="test_key") assert embedding_func.model == "text-embedding-v4" - embedding_func = QwenEmbeddingFunction( + embedding_func = QwenDenseEmbedding( dimension=128, 
model="custom-model", api_key="test_key" ) assert embedding_func.model == "custom-model" - @patch("zvec.extension.embedding.require_module") + @patch("zvec.extension.qwen_function.require_module") def test_embed_with_empty_text(self, mock_require_module): # Test embed method with empty text raises ValueError - embedding_func = QwenEmbeddingFunction(dimension=128, api_key="test_key") + embedding_func = QwenDenseEmbedding(dimension=128, api_key="test_key") with pytest.raises( ValueError, match="Input text cannot be empty or whitespace only" @@ -72,7 +78,7 @@ def test_embed_with_empty_text(self, mock_require_module): with pytest.raises(TypeError): embedding_func.embed(None) - @patch("zvec.extension.embedding.require_module") + @patch("zvec.extension.qwen_function.require_module") def test_embed_success(self, mock_require_module): # Test successful embedding mock_dashscope = MagicMock() @@ -82,18 +88,20 @@ def test_embed_success(self, mock_require_module): mock_dashscope.TextEmbedding.call.return_value = mock_response mock_require_module.return_value = mock_dashscope - embedding_func = QwenEmbeddingFunction(dimension=128, api_key="test_key") + embedding_func = QwenDenseEmbedding(dimension=3, api_key="test_key") + # Clear cache to avoid interference + embedding_func.embed.cache_clear() result = embedding_func.embed("test text") assert result == [0.1, 0.2, 0.3] mock_dashscope.TextEmbedding.call.assert_called_once_with( model="text-embedding-v4", input="test text", - dimension=128, + dimension=3, output_type="dense", ) - @patch("zvec.extension.embedding.require_module") + @patch("zvec.extension.qwen_function.require_module") def test_embed_http_error(self, mock_require_module): # Test embedding with HTTP error mock_dashscope = MagicMock() @@ -103,29 +111,1989 @@ def test_embed_http_error(self, mock_require_module): mock_dashscope.TextEmbedding.call.return_value = mock_response mock_require_module.return_value = mock_dashscope - embedding_func = 
QwenEmbeddingFunction(dimension=128, api_key="test_key") + embedding_func = QwenDenseEmbedding(dimension=128, api_key="test_key") + embedding_func.embed.cache_clear() with pytest.raises(ValueError): embedding_func.embed("test text") - @patch("zvec.extension.embedding.require_module") + @patch("zvec.extension.qwen_function.require_module") def test_embed_invalid_response(self, mock_require_module): # Test embedding with invalid response (wrong number of embeddings) mock_dashscope = MagicMock() mock_response = MagicMock() mock_response.status_code = HTTPStatus.OK - mock_response.output.embeddings = [] + mock_response.output = {"embeddings": []} mock_dashscope.TextEmbedding.call.return_value = mock_response mock_require_module.return_value = mock_dashscope - embedding_func = QwenEmbeddingFunction(dimension=128, api_key="test_key") + embedding_func = QwenDenseEmbedding(dimension=128, api_key="test_key") + embedding_func.embed.cache_clear() with pytest.raises(ValueError): embedding_func.embed("test text") - @pytest.mark.skip(reason="Qwen Embedding is not available in CI") - def test_embed(self): - # Test embedding with invalid dimension - embedding_func = QwenEmbeddingFunction(dimension=128, api_key="xxx") + @pytest.mark.skipif( + not RUN_INTEGRATION_TESTS, + reason="Integration test skipped. Set ZVEC_RUN_INTEGRATION_TESTS=1 to run.", + ) + def test_real_embed_success(self): + """Integration test with real DashScope API. 
+ + To run this test, set environment variable: + export ZVEC_RUN_INTEGRATION_TESTS=1 + export DASHSCOPE_API_KEY=your-api-key + """ + embedding_func = QwenDenseEmbedding(dimension=128) dense = embedding_func("test text") assert len(dense) == 128 + + +# ---------------------------- +# QwenSparseEmbedding Test Case +# ---------------------------- +class TestQwenSparseEmbedding: + """Test suite for QwenSparseEmbedding (Qwen sparse embedding via DashScope API).""" + + def test_init_with_api_key(self): + """Test initialization with explicit API key.""" + embedding_func = QwenSparseEmbedding(dimension=1024, api_key="test_key") + assert embedding_func._dimension == 1024 + assert embedding_func.model == "text-embedding-v4" + assert embedding_func._api_key == "test_key" + # encoding_type defaults to "query" via extra_params + assert embedding_func.extra_params.get("encoding_type", "query") == "query" + + def test_init_with_custom_encoding_type(self): + """Test initialization with custom encoding type.""" + embedding_func = QwenSparseEmbedding( + dimension=1024, encoding_type="document", api_key="test_key" + ) + assert embedding_func.extra_params.get("encoding_type") == "document" + + @patch.dict(os.environ, {"DASHSCOPE_API_KEY": "env_key"}) + def test_init_with_env_api_key(self): + """Test initialization with API key from environment.""" + embedding_func = QwenSparseEmbedding(dimension=1024) + assert embedding_func._api_key == "env_key" + + @patch.dict(os.environ, {"DASHSCOPE_API_KEY": ""}) + def test_init_without_api_key(self): + """Test initialization fails without API key.""" + with pytest.raises(ValueError, match="DashScope API key is required"): + QwenSparseEmbedding(dimension=1024) + + def test_model_property(self): + """Test model property.""" + embedding_func = QwenSparseEmbedding(dimension=1024, api_key="test_key") + assert embedding_func.model == "text-embedding-v4" + + embedding_func = QwenSparseEmbedding( + dimension=1024, model="text-embedding-v3", 
api_key="test_key" + ) + assert embedding_func.model == "text-embedding-v3" + + def test_encoding_type_property(self): + """Test encoding_type via extra_params.""" + query_emb = QwenSparseEmbedding( + dimension=1024, encoding_type="query", api_key="test_key" + ) + assert query_emb.extra_params.get("encoding_type") == "query" + + doc_emb = QwenSparseEmbedding( + dimension=1024, encoding_type="document", api_key="test_key" + ) + assert doc_emb.extra_params.get("encoding_type") == "document" + + @patch("zvec.extension.qwen_function.require_module") + def test_embed_with_empty_text(self, mock_require_module): + """Test embed method with empty text raises ValueError.""" + embedding_func = QwenSparseEmbedding(dimension=1024, api_key="test_key") + + with pytest.raises( + ValueError, match="Input text cannot be empty or whitespace only" + ): + embedding_func.embed("") + + with pytest.raises( + ValueError, match="Input text cannot be empty or whitespace only" + ): + embedding_func.embed(" ") + + @patch("zvec.extension.qwen_function.require_module") + def test_embed_with_non_string_input(self, mock_require_module): + """Test embed method with non-string input raises TypeError.""" + embedding_func = QwenSparseEmbedding(dimension=1024, api_key="test_key") + + with pytest.raises(TypeError, match="Expected 'input' to be str"): + embedding_func.embed(123) + + with pytest.raises(TypeError, match="Expected 'input' to be str"): + embedding_func.embed(None) + + @patch("zvec.extension.qwen_function.require_module") + def test_embed_success(self, mock_require_module): + """Test successful sparse embedding generation.""" + mock_dashscope = MagicMock() + mock_response = MagicMock() + mock_response.status_code = HTTPStatus.OK + # Sparse embedding returns array of {index, value, token} objects + mock_response.output = { + "embeddings": [ + { + "sparse_embedding": [ + {"index": 10, "value": 0.5, "token": "机器"}, + {"index": 245, "value": 0.8, "token": "学习"}, + {"index": 1023, "value": 1.2, 
"token": "算法"}, + ] + } + ] + } + mock_dashscope.TextEmbedding.call.return_value = mock_response + mock_require_module.return_value = mock_dashscope + + embedding_func = QwenSparseEmbedding(dimension=1024, api_key="test_key") + # Clear cache to avoid interference + embedding_func.embed.cache_clear() + result = embedding_func.embed("test text") + + # Verify result is a dict + assert isinstance(result, dict) + # Verify keys are integers + assert all(isinstance(k, int) for k in result.keys()) + # Verify values are floats + assert all(isinstance(v, float) for v in result.values()) + # Verify all values are positive + assert all(v > 0 for v in result.values()) + # Verify sorted by indices + keys = list(result.keys()) + assert keys == sorted(keys) + # Verify specific keys + assert keys == [10, 245, 1023] + + mock_dashscope.TextEmbedding.call.assert_called_once_with( + model="text-embedding-v4", + input="test text", + dimension=1024, + output_type="sparse", + text_type="query", + ) + + @patch("zvec.extension.qwen_function.require_module") + def test_embed_with_document_encoding_type(self, mock_require_module): + """Test embedding with document encoding type.""" + mock_dashscope = MagicMock() + mock_response = MagicMock() + mock_response.status_code = HTTPStatus.OK + mock_response.output = { + "embeddings": [ + { + "sparse_embedding": [ + {"index": 5, "value": 0.3, "token": "文档"}, + {"index": 100, "value": 0.7, "token": "内容"}, + {"index": 500, "value": 0.9, "token": "检索"}, + ] + } + ] + } + mock_dashscope.TextEmbedding.call.return_value = mock_response + mock_require_module.return_value = mock_dashscope + + embedding_func = QwenSparseEmbedding( + dimension=1024, encoding_type="document", api_key="test_key" + ) + embedding_func.embed.cache_clear() + result = embedding_func.embed("test document") + + assert isinstance(result, dict) + assert list(result.keys()) == [5, 100, 500] + + # Verify text_type parameter is "document" + call_args = 
mock_dashscope.TextEmbedding.call.call_args + assert call_args[1]["text_type"] == "document" + assert call_args[1]["output_type"] == "sparse" + + @patch("zvec.extension.qwen_function.require_module") + def test_embed_output_sorted_by_indices(self, mock_require_module): + """Test that output is always sorted by indices in ascending order.""" + mock_dashscope = MagicMock() + mock_response = MagicMock() + mock_response.status_code = HTTPStatus.OK + # Return unsorted indices + mock_response.output = { + "embeddings": [ + { + "sparse_embedding": [ + {"index": 9999, "value": 1.5, "token": "A"}, + {"index": 5, "value": 2.0, "token": "B"}, + {"index": 1234, "value": 0.8, "token": "C"}, + {"index": 77, "value": 3.2, "token": "D"}, + {"index": 500, "value": 1.1, "token": "E"}, + ] + } + ] + } + mock_dashscope.TextEmbedding.call.return_value = mock_response + mock_require_module.return_value = mock_dashscope + + embedding_func = QwenSparseEmbedding(dimension=1024, api_key="test_key") + embedding_func.embed.cache_clear() + result = embedding_func.embed("test sorting") + + # Verify keys are sorted + result_keys = list(result.keys()) + assert result_keys == sorted(result_keys) + # Verify expected sorted order + assert result_keys == [5, 77, 500, 1234, 9999] + + @patch("zvec.extension.qwen_function.require_module") + def test_embed_filters_zero_values(self, mock_require_module): + """Test that zero and negative values are filtered out.""" + mock_dashscope = MagicMock() + mock_response = MagicMock() + mock_response.status_code = HTTPStatus.OK + # Include zero and negative values + mock_response.output = { + "embeddings": [ + { + "sparse_embedding": [ + {"index": 10, "value": 0.5, "token": "正"}, + { + "index": 20, + "value": 0.0, + "token": "零", + }, # Should be filtered + { + "index": 30, + "value": -0.3, + "token": "负", + }, # Should be filtered + {"index": 40, "value": 0.8, "token": "正"}, + { + "index": 50, + "value": 0.0, + "token": "零", + }, # Should be filtered + ] + } + ] + 
} + mock_dashscope.TextEmbedding.call.return_value = mock_response + mock_require_module.return_value = mock_dashscope + + embedding_func = QwenSparseEmbedding(dimension=1024, api_key="test_key") + embedding_func.embed.cache_clear() + result = embedding_func.embed("test filtering") + + # Only positive values should remain + assert list(result.keys()) == [10, 40] + assert all(v > 0 for v in result.values()) + + @patch("zvec.extension.qwen_function.require_module") + def test_embed_http_error(self, mock_require_module): + """Test embedding with HTTP error.""" + mock_dashscope = MagicMock() + mock_response = MagicMock() + mock_response.status_code = HTTPStatus.BAD_REQUEST + mock_response.message = "Bad Request" + mock_dashscope.TextEmbedding.call.return_value = mock_response + mock_require_module.return_value = mock_dashscope + + embedding_func = QwenSparseEmbedding(dimension=1024, api_key="test_key") + embedding_func.embed.cache_clear() + + with pytest.raises(ValueError, match="DashScope API error"): + embedding_func.embed("test text") + + @patch("zvec.extension.qwen_function.require_module") + def test_embed_invalid_response_no_embeddings(self, mock_require_module): + """Test embedding with invalid response (no embeddings).""" + mock_dashscope = MagicMock() + mock_response = MagicMock() + mock_response.status_code = HTTPStatus.OK + mock_response.output = {"embeddings": []} + mock_dashscope.TextEmbedding.call.return_value = mock_response + mock_require_module.return_value = mock_dashscope + + embedding_func = QwenSparseEmbedding(dimension=1024, api_key="test_key") + embedding_func.embed.cache_clear() + + with pytest.raises(ValueError, match="Expected exactly 1 embedding"): + embedding_func.embed("test text") + + @patch("zvec.extension.qwen_function.require_module") + def test_embed_invalid_response_not_dict(self, mock_require_module): + """Test embedding with invalid response (sparse_embedding not list).""" + mock_dashscope = MagicMock() + mock_response = MagicMock() 
+ mock_response.status_code = HTTPStatus.OK + # sparse_embedding should be list, not dict + mock_response.output = { + "embeddings": [{"sparse_embedding": {"index": 10, "value": 0.5}}] + } + mock_dashscope.TextEmbedding.call.return_value = mock_response + mock_require_module.return_value = mock_dashscope + + embedding_func = QwenSparseEmbedding(dimension=1024, api_key="test_key") + embedding_func.embed.cache_clear() + + with pytest.raises( + ValueError, match="'sparse_embedding' field is missing or not a list" + ): + embedding_func.embed("test text") + + @patch("zvec.extension.qwen_function.require_module") + def test_embed_callable_interface(self, mock_require_module): + """Test that embedding function is callable.""" + mock_dashscope = MagicMock() + mock_response = MagicMock() + mock_response.status_code = HTTPStatus.OK + mock_response.output = { + "embeddings": [ + { + "sparse_embedding": [ + {"index": 100, "value": 1.0, "token": "测试"}, + {"index": 200, "value": 0.5, "token": "调用"}, + ] + } + ] + } + mock_dashscope.TextEmbedding.call.return_value = mock_response + mock_require_module.return_value = mock_dashscope + + embedding_func = QwenSparseEmbedding(dimension=1024, api_key="test_key") + embedding_func.embed.cache_clear() + + # Test calling the function directly + result = embedding_func("test text") + assert isinstance(result, dict) + assert list(result.keys()) == [100, 200] + + @patch("zvec.extension.qwen_function.require_module") + def test_embed_api_connection_error(self, mock_require_module): + """Test handling of API connection errors.""" + mock_dashscope = MagicMock() + mock_dashscope.TextEmbedding.call.side_effect = Exception("Connection timeout") + mock_require_module.return_value = mock_dashscope + + embedding_func = QwenSparseEmbedding(dimension=1024, api_key="test_key") + embedding_func.embed.cache_clear() + + with pytest.raises(RuntimeError, match="Failed to call DashScope API"): + embedding_func.embed("test text") + + @pytest.mark.skipif( + not 
RUN_INTEGRATION_TESTS, + reason="Integration test skipped. Set ZVEC_RUN_INTEGRATION_TESTS=1 to run.", + ) + def test_real_embed_success(self): + """Integration test with real DashScope API. + + To run this test, set environment variable: + export ZVEC_RUN_INTEGRATION_TESTS=1 + export DASHSCOPE_API_KEY=your-api-key + """ + # Test query embedding + query_emb = QwenSparseEmbedding(dimension=1024, encoding_type="query") + query_vec = query_emb.embed("machine learning") + + assert isinstance(query_vec, dict) + assert len(query_vec) > 0 + assert all(isinstance(k, int) for k in query_vec.keys()) + assert all(isinstance(v, float) and v > 0 for v in query_vec.values()) + + # Verify sorted output + keys = list(query_vec.keys()) + assert keys == sorted(keys) + + # Test document embedding + doc_emb = QwenSparseEmbedding(dimension=1024, encoding_type="document") + doc_vec = doc_emb.embed("Machine learning is a subset of AI") + + assert isinstance(doc_vec, dict) + assert len(doc_vec) > 0 + + # Verify sorted output + doc_keys = list(doc_vec.keys()) + assert doc_keys == sorted(doc_keys) + + +# ---------------------------- +# OpenAIDenseEmbedding Test Case +# ---------------------------- +class TestOpenAIDenseEmbedding: + def test_init_with_api_key(self): + """Test initialization with explicit API key.""" + embedding_func = OpenAIDenseEmbedding(api_key="sk-test-key") + assert embedding_func.dimension == 1536 # Default for text-embedding-3-small + assert embedding_func.model == "text-embedding-3-small" + assert embedding_func._api_key == "sk-test-key" + + @patch.dict(os.environ, {"OPENAI_API_KEY": "sk-env-key"}) + def test_init_with_env_api_key(self): + """Test initialization with API key from environment.""" + embedding_func = OpenAIDenseEmbedding() + assert embedding_func._api_key == "sk-env-key" + + @patch.dict(os.environ, {"OPENAI_API_KEY": ""}) + def test_init_without_api_key(self): + """Test initialization fails without API key.""" + with pytest.raises(ValueError, 
match="OpenAI API key is required"): + OpenAIDenseEmbedding() + + def test_init_with_custom_dimension(self): + """Test initialization with custom dimension.""" + embedding_func = OpenAIDenseEmbedding( + model="text-embedding-3-large", dimension=1024, api_key="sk-test" + ) + assert embedding_func.dimension == 1024 + assert embedding_func.model == "text-embedding-3-large" + + def test_init_with_base_url(self): + """Test initialization with custom base URL.""" + embedding_func = OpenAIDenseEmbedding( + api_key="sk-test", base_url="https://custom.openai.com/" + ) + assert embedding_func._base_url == "https://custom.openai.com/" + + def test_model_property(self): + """Test model property.""" + embedding_func = OpenAIDenseEmbedding(api_key="sk-test") + assert embedding_func.model == "text-embedding-3-small" + + embedding_func = OpenAIDenseEmbedding( + model="text-embedding-ada-002", api_key="sk-test" + ) + assert embedding_func.model == "text-embedding-ada-002" + + def test_extra_params(self): + """Test extra_params property.""" + # Test without extra params + embedding_func = OpenAIDenseEmbedding(api_key="sk-test") + assert embedding_func.extra_params == {} + + # Test with extra params + embedding_func = OpenAIDenseEmbedding( + api_key="sk-test", + encoding_format="float", + user="test-user", + ) + assert embedding_func.extra_params == { + "encoding_format": "float", + "user": "test-user", + } + + @patch("zvec.extension.openai_function.require_module") + def test_embed_with_empty_text(self, mock_require_module): + """Test embed method with empty text raises ValueError.""" + embedding_func = OpenAIDenseEmbedding(api_key="sk-test") + + with pytest.raises( + ValueError, match="Input text cannot be empty or whitespace only" + ): + embedding_func.embed("") + + with pytest.raises( + ValueError, match="Input text cannot be empty or whitespace only" + ): + embedding_func.embed(" ") + + @patch("zvec.extension.openai_function.require_module") + def 
test_embed_with_non_string_input(self, mock_require_module): + """Test embed method with non-string input raises TypeError.""" + embedding_func = OpenAIDenseEmbedding(api_key="sk-test") + + with pytest.raises(TypeError, match="Expected 'input' to be str"): + embedding_func.embed(123) + + with pytest.raises(TypeError, match="Expected 'input' to be str"): + embedding_func.embed(None) + + @patch("zvec.extension.openai_function.require_module") + def test_embed_success(self, mock_require_module): + """Test successful embedding generation.""" + # Mock OpenAI client + mock_openai = Mock() + mock_client = Mock() + mock_response = Mock() + + # Create mock embedding data + fake_embedding = [0.1, 0.2, 0.3] + mock_embedding_obj = Mock() + mock_embedding_obj.embedding = fake_embedding + mock_response.data = [mock_embedding_obj] + + mock_client.embeddings.create.return_value = mock_response + mock_openai.OpenAI.return_value = mock_client + mock_require_module.return_value = mock_openai + + embedding_func = OpenAIDenseEmbedding(dimension=3, api_key="sk-test") + embedding_func.embed.cache_clear() + result = embedding_func.embed("test text") + + assert result == [0.1, 0.2, 0.3] + mock_client.embeddings.create.assert_called_once_with( + model="text-embedding-3-small", input="test text", dimensions=3 + ) + + @patch("zvec.extension.openai_function.require_module") + def test_embed_with_custom_model(self, mock_require_module): + """Test embedding with custom model.""" + mock_openai = Mock() + mock_client = Mock() + mock_response = Mock() + + fake_embedding = [0.1] * 1536 + mock_embedding_obj = Mock() + mock_embedding_obj.embedding = fake_embedding + mock_response.data = [mock_embedding_obj] + + mock_client.embeddings.create.return_value = mock_response + mock_openai.OpenAI.return_value = mock_client + mock_require_module.return_value = mock_openai + + embedding_func = OpenAIDenseEmbedding( + model="text-embedding-ada-002", api_key="sk-test" + ) + embedding_func.embed.cache_clear() + 
result = embedding_func.embed("test text") + + assert len(result) == 1536 + mock_client.embeddings.create.assert_called_once_with( + model="text-embedding-ada-002", input="test text" + ) + + @patch("zvec.extension.openai_function.require_module") + def test_embed_api_error(self, mock_require_module): + """Test handling of API errors.""" + mock_openai = Mock() + mock_client = Mock() + + # Simulate API error + api_error = Mock() + api_error.__class__.__name__ = "APIError" + mock_openai.APIError = type("APIError", (Exception,), {}) + mock_openai.APIConnectionError = type("APIConnectionError", (Exception,), {}) + + mock_client.embeddings.create.side_effect = mock_openai.APIError( + "Rate limit exceeded" + ) + mock_openai.OpenAI.return_value = mock_client + mock_require_module.return_value = mock_openai + + embedding_func = OpenAIDenseEmbedding(api_key="sk-test") + embedding_func.embed.cache_clear() + + with pytest.raises(RuntimeError, match="Failed to call OpenAI API"): + embedding_func.embed("test text") + + @patch("zvec.extension.openai_function.require_module") + def test_embed_invalid_response(self, mock_require_module): + """Test handling of invalid API response.""" + mock_openai = Mock() + mock_client = Mock() + mock_response = Mock() + + # Empty response data + mock_response.data = [] + + mock_client.embeddings.create.return_value = mock_response + mock_openai.OpenAI.return_value = mock_client + mock_openai.APIError = type("APIError", (Exception,), {}) + mock_openai.APIConnectionError = type("APIConnectionError", (Exception,), {}) + mock_require_module.return_value = mock_openai + + embedding_func = OpenAIDenseEmbedding(api_key="sk-test") + embedding_func.embed.cache_clear() + + with pytest.raises(ValueError, match="no embedding data returned"): + embedding_func.embed("test text") + + @patch("zvec.extension.openai_function.require_module") + def test_embed_dimension_mismatch(self, mock_require_module): + """Test handling of dimension mismatch.""" + mock_openai = 
Mock() + mock_client = Mock() + mock_response = Mock() + + # Return embedding with wrong dimension + fake_embedding = [0.1] * 512 + mock_embedding_obj = Mock() + mock_embedding_obj.embedding = fake_embedding + mock_response.data = [mock_embedding_obj] + + mock_client.embeddings.create.return_value = mock_response + mock_openai.OpenAI.return_value = mock_client + mock_openai.APIError = type("APIError", (Exception,), {}) + mock_openai.APIConnectionError = type("APIConnectionError", (Exception,), {}) + mock_require_module.return_value = mock_openai + + embedding_func = OpenAIDenseEmbedding(dimension=1536, api_key="sk-test") + embedding_func.embed.cache_clear() + + with pytest.raises(ValueError, match="Dimension mismatch"): + embedding_func.embed("test text") + + @patch("zvec.extension.openai_function.require_module") + def test_embed_callable(self, mock_require_module): + """Test that embedding function is callable.""" + mock_openai = Mock() + mock_client = Mock() + mock_response = Mock() + + fake_embedding = [0.1] * 1536 + mock_embedding_obj = Mock() + mock_embedding_obj.embedding = fake_embedding + mock_response.data = [mock_embedding_obj] + + mock_client.embeddings.create.return_value = mock_response + mock_openai.OpenAI.return_value = mock_client + mock_openai.APIError = type("APIError", (Exception,), {}) + mock_openai.APIConnectionError = type("APIConnectionError", (Exception,), {}) + mock_require_module.return_value = mock_openai + + embedding_func = OpenAIDenseEmbedding(api_key="sk-test") + embedding_func.embed.cache_clear() + + # Test calling the function directly + result = embedding_func("test text") + assert isinstance(result, list) + assert len(result) == 1536 + + @patch("zvec.extension.openai_function.require_module") + def test_embed_with_base_url(self, mock_require_module): + """Test embedding with custom base URL.""" + mock_openai = Mock() + mock_client = Mock() + mock_response = Mock() + + fake_embedding = [0.1] * 1536 + mock_embedding_obj = Mock() + 
mock_embedding_obj.embedding = fake_embedding + mock_response.data = [mock_embedding_obj] + + mock_client.embeddings.create.return_value = mock_response + mock_openai.OpenAI.return_value = mock_client + mock_openai.APIError = type("APIError", (Exception,), {}) + mock_openai.APIConnectionError = type("APIConnectionError", (Exception,), {}) + mock_require_module.return_value = mock_openai + + embedding_func = OpenAIDenseEmbedding( + api_key="sk-test", base_url="https://custom.openai.com/" + ) + embedding_func.embed.cache_clear() + result = embedding_func.embed("test text") + + # Verify client was created with custom base URL + mock_openai.OpenAI.assert_called_once_with( + api_key="sk-test", base_url="https://custom.openai.com/" + ) + assert len(result) == 1536 + + @pytest.mark.skipif( + not RUN_INTEGRATION_TESTS, + reason="Integration test skipped. Set ZVEC_RUN_INTEGRATION_TESTS=1 to run.", + ) + def test_real_embed_success(self): + """Integration test with real OpenAI API. + + To run this test, set environment variable: + export ZVEC_RUN_INTEGRATION_TESTS=1 + export OPENAI_API_KEY=sk-... 
+ """ + embedding_func = OpenAIDenseEmbedding( + model="text-embedding-v4", + dimension=256, + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + ) + vector = embedding_func.embed("Hello, world!") + assert len(vector) == 256 + assert isinstance(vector, list) + assert all(isinstance(x, float) for x in vector) + + +# ---------------------------- +# DefaultLocalDenseEmbedding Test Case +# ---------------------------- +class TestDefaultLocalDenseEmbedding: + """Test cases for DefaultLocalDenseEmbedding.""" + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_init_success(self, mock_require_module): + """Test successful initialization with mocked model.""" + # Mock sentence_transformers module + mock_st = Mock() + mock_model = Mock() + mock_model.get_sentence_embedding_dimension.return_value = 384 + mock_model.device = "cpu" + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + # Initialize embedding function + emb_func = DefaultLocalDenseEmbedding() + + # Assertions + assert emb_func.dimension == 384 + assert emb_func.model_name == "all-MiniLM-L6-v2" + assert emb_func.model_source == "huggingface" + assert emb_func.device == "cpu" + mock_st.SentenceTransformer.assert_called_once_with( + "all-MiniLM-L6-v2", device=None, trust_remote_code=True + ) + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_init_with_custom_device(self, mock_require_module): + """Test initialization with custom device.""" + mock_st = Mock() + mock_model = Mock() + mock_model.get_sentence_embedding_dimension.return_value = 384 + mock_model.device = "cuda" + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + emb_func = DefaultLocalDenseEmbedding(device="cuda") + + assert emb_func.device == "cuda" + mock_st.SentenceTransformer.assert_called_once_with( + "all-MiniLM-L6-v2", device="cuda", trust_remote_code=True + ) + + 
@pytest.mark.skipif( + not RUN_INTEGRATION_TESTS, + reason="Integration test skipped. Set ZVEC_RUN_INTEGRATION_TESTS=1 to run.", + ) + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_init_with_modelscope(self, mock_require_module): + """Test initialization with ModelScope as model source.""" + mock_st = Mock() + mock_ms = Mock() + mock_model = Mock() + mock_model.get_sentence_embedding_dimension.return_value = 384 + mock_model.device = "cpu" + mock_st.SentenceTransformer.return_value = mock_model + + def require_module_side_effect(module_name): + if module_name == "sentence_transformers": + return mock_st + elif module_name == "modelscope": + return mock_ms + raise ImportError(f"No module named '{module_name}'") + + mock_require_module.side_effect = require_module_side_effect + + # Mock snapshot_download at the correct import location + with patch( + "modelscope.hub.snapshot_download.snapshot_download", + return_value="/path/to/cached/model", + ): + emb_func = DefaultLocalDenseEmbedding(model_source="modelscope") + + # Assertions + assert emb_func.dimension == 384 + assert emb_func.model_name == "iic/nlp_gte_sentence-embedding_chinese-small" + assert emb_func.model_source == "modelscope" + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_init_with_invalid_model_source(self, mock_require_module): + """Test initialization with invalid model_source raises ValueError.""" + mock_st = Mock() + mock_model = Mock() + mock_model.get_sentence_embedding_dimension.return_value = 384 + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + with pytest.raises(ValueError, match="Invalid model_source"): + DefaultLocalDenseEmbedding(model_source="invalid_source") + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_embed_success(self, mock_require_module): + """Test successful embedding generation.""" + # Mock embedding output + 
fake_embedding = np.random.rand(384).astype(np.float32) + + mock_st = Mock() + mock_model = Mock() + mock_model.get_sentence_embedding_dimension.return_value = 384 + + # Configure encode method + mock_model.encode = Mock(return_value=fake_embedding) + + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + emb_func = DefaultLocalDenseEmbedding() + result = emb_func.embed("Hello, world!") + + # Assertions + assert isinstance(result, list) + assert len(result) == 384 + assert all(isinstance(x, float) for x in result) + mock_model.encode.assert_called_once_with( + "Hello, world!", + convert_to_numpy=True, + normalize_embeddings=True, + batch_size=32, + ) + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_embed_with_normalization(self, mock_require_module): + """Test embedding with L2 normalization.""" + # Create a normalized vector + fake_embedding = np.random.rand(384).astype(np.float32) + fake_embedding = fake_embedding / np.linalg.norm(fake_embedding) + + mock_st = Mock() + mock_model = Mock() + mock_model.get_sentence_embedding_dimension.return_value = 384 + + # Configure encode method + mock_model.encode = Mock(return_value=fake_embedding) + + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + emb_func = DefaultLocalDenseEmbedding(normalize_embeddings=True) + result = emb_func.embed("Test sentence") + + # Check if vector is normalized (L2 norm should be close to 1.0) + result_array = np.array(result) + norm = np.linalg.norm(result_array) + assert abs(norm - 1.0) < 1e-5 + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_embed_empty_string(self, mock_require_module): + """Test embedding with empty string raises ValueError.""" + mock_st = Mock() + mock_model = Mock() + mock_model.get_sentence_embedding_dimension.return_value = 384 + mock_st.SentenceTransformer.return_value = mock_model + 
mock_require_module.return_value = mock_st + + emb_func = DefaultLocalDenseEmbedding() + + with pytest.raises(ValueError, match="Input text cannot be empty"): + emb_func.embed("") + + with pytest.raises(ValueError, match="Input text cannot be empty"): + emb_func.embed(" ") + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_embed_non_string_input(self, mock_require_module): + """Test embedding with non-string input raises TypeError.""" + mock_st = Mock() + mock_model = Mock() + mock_model.get_sentence_embedding_dimension.return_value = 384 + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + emb_func = DefaultLocalDenseEmbedding() + + with pytest.raises(TypeError, match="Expected 'input' to be str"): + emb_func.embed(123) + + with pytest.raises(TypeError, match="Expected 'input' to be str"): + emb_func.embed(None) + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_embed_callable(self, mock_require_module): + """Test that embedding function is callable.""" + fake_embedding = np.random.rand(384).astype(np.float32) + + mock_st = Mock() + mock_model = Mock() + mock_model.get_sentence_embedding_dimension.return_value = 384 + + # Configure encode method + mock_model.encode = Mock(return_value=fake_embedding) + + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + emb_func = DefaultLocalDenseEmbedding() + + # Test calling the function directly + result = emb_func("Test text") + assert isinstance(result, list) + assert len(result) == 384 + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_semantic_similarity(self, mock_require_module): + """Test semantic similarity between similar and different texts.""" + # Create mock embeddings for similar and different texts + similar_emb_1 = np.array([1.0, 0.0, 0.0] + [0.0] * 381, dtype=np.float32) + similar_emb_2 = np.array([0.9, 0.1, 
0.0] + [0.0] * 381, dtype=np.float32) + different_emb = np.array([0.0, 0.0, 1.0] + [0.0] * 381, dtype=np.float32) + + # Normalize + similar_emb_1 = similar_emb_1 / np.linalg.norm(similar_emb_1) + similar_emb_2 = similar_emb_2 / np.linalg.norm(similar_emb_2) + different_emb = different_emb / np.linalg.norm(different_emb) + + mock_st = Mock() + mock_model = Mock() + mock_model.get_sentence_embedding_dimension.return_value = 384 + + # Configure encode method with side_effect for multiple calls + mock_model.encode = Mock( + side_effect=[similar_emb_1, similar_emb_2, different_emb] + ) + + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + emb_func = DefaultLocalDenseEmbedding() + + v1 = emb_func.embed("The cat sits on the mat") + v2 = emb_func.embed("A feline rests on a rug") + v3 = emb_func.embed("Python programming") + + # Calculate similarities + similarity_high = np.dot(v1, v2) + similarity_low = np.dot(v1, v3) + + assert similarity_high > similarity_low + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_model_loading_error(self, mock_require_module): + """Test handling of model loading failure.""" + # Clear model cache + from zvec.extension.sentence_transformer_embedding_function import ( + DefaultLocalSparseEmbedding, + ) + + DefaultLocalSparseEmbedding.clear_cache() + mock_st = Mock() + mock_st.SentenceTransformer.side_effect = Exception("Model not found") + mock_require_module.return_value = mock_st + + with pytest.raises( + ValueError, match="Failed to load Sentence Transformer model" + ): + DefaultLocalDenseEmbedding() + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_modelscope_import_error(self, mock_require_module): + """Test handling of ModelScope import error.""" + mock_st = Mock() + + def require_module_side_effect(module_name): + if module_name == "sentence_transformers": + return mock_st + elif module_name == "modelscope": + raise 
ImportError("No module named 'modelscope'") + + mock_require_module.side_effect = require_module_side_effect + + with pytest.raises( + ImportError, match="ModelScope support requires the 'modelscope' package" + ): + DefaultLocalDenseEmbedding(model_source="modelscope") + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_embed_dimension_mismatch(self, mock_require_module): + """Test handling of dimension mismatch in embedding output.""" + # Return embedding with wrong dimension + fake_embedding = np.random.rand(256).astype(np.float32) + + mock_st = Mock() + mock_model = Mock() + mock_model.get_sentence_embedding_dimension.return_value = 384 + + # Configure encode method + mock_model.encode = Mock(return_value=fake_embedding) + + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + emb_func = DefaultLocalDenseEmbedding() + + with pytest.raises(ValueError, match="Dimension mismatch"): + emb_func.embed("Test text") + + @pytest.mark.skipif( + not RUN_INTEGRATION_TESTS, + reason="Integration test skipped. Set ZVEC_RUN_INTEGRATION_TESTS=1 to run.", + ) + def test_real_embedding_generation(self): + """Integration test with real model (requires sentence-transformers). + + To run this test, set environment variable: + export ZVEC_RUN_INTEGRATION_TESTS=1 + + Note: First run will download the model (~80MB). 
+ """ + emb_func = DefaultLocalDenseEmbedding() + + # Test basic embedding + vector = emb_func.embed("Hello, world!") + assert len(vector) == 384 + assert isinstance(vector, list) + assert all(isinstance(x, float) for x in vector) + + # Test normalization + norm = np.linalg.norm(vector) + assert abs(norm - 1.0) < 1e-5 + + # Test semantic similarity + v1 = emb_func.embed("The cat sits on the mat") + v2 = emb_func.embed("A feline rests on a rug") + v3 = emb_func.embed("Python programming language") + + similarity_high = np.dot(v1, v2) + similarity_low = np.dot(v1, v3) + assert similarity_high > similarity_low + + @pytest.mark.skipif( + not RUN_INTEGRATION_TESTS, + reason="Integration test skipped. Set ZVEC_RUN_INTEGRATION_TESTS=1 to run.", + ) + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_model_properties(self, mock_require_module): + """Test model_name and model_source properties.""" + mock_st = Mock() + mock_model = Mock() + mock_model.get_sentence_embedding_dimension.return_value = 384 + mock_model.device = "cpu" + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + # Test Hugging Face + emb_func_hf = DefaultLocalDenseEmbedding(model_source="huggingface") + assert emb_func_hf.model_name == "all-MiniLM-L6-v2" + assert emb_func_hf.model_source == "huggingface" + + # Test ModelScope + with patch( + "modelscope.hub.snapshot_download.snapshot_download", + return_value="/path/to/model", + ): + mock_ms = Mock() + mock_require_module.side_effect = ( + lambda m: mock_st if m == "sentence_transformers" else mock_ms + ) + emb_func_ms = DefaultLocalDenseEmbedding(model_source="modelscope") + assert ( + emb_func_ms.model_name == "iic/nlp_gte_sentence-embedding_chinese-small" + ) + assert emb_func_ms.model_source == "modelscope" + + +# ----------------------------------- +# DefaultLocalSparseEmbedding Test Case +# ----------------------------------- +class TestDefaultLocalSparseEmbedding: + 
"""Test suite for DefaultLocalSparseEmbedding (SPLADE sparse embedding). + + Note: + DefaultLocalSparseEmbedding uses naver/splade-cocondenser-ensembledistil + instead of naver/splade-v3 because: + + - splade-v3 is a gated model requiring Hugging Face authentication + - cocondenser-ensembledistil is publicly accessible + - Performance difference is minimal (~2%) + - Avoids "Access to model is restricted" errors + + This allows all users to run tests without authentication setup. + """ + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_init_success(self, mock_require_module): + """Test successful initialization. + + Verifies that DefaultLocalSparseEmbedding initializes with the publicly + accessible naver/splade-cocondenser-ensembledistil model instead of + the gated naver/splade-v3 model. + """ + mock_st = Mock() + mock_model = Mock() + mock_model.device = "cpu" + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + sparse_emb = DefaultLocalSparseEmbedding() + + assert sparse_emb.model_name == "naver/splade-cocondenser-ensembledistil" + assert sparse_emb.model_source == "huggingface" + assert sparse_emb.device == "cpu" + mock_st.SentenceTransformer.assert_called_once_with( + "naver/splade-cocondenser-ensembledistil", + device=None, + trust_remote_code=True, + ) + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_init_with_custom_device(self, mock_require_module): + """Test initialization with custom device.""" + mock_st = Mock() + mock_model = Mock() + mock_model.device = "cuda" + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + sparse_emb = DefaultLocalSparseEmbedding(device="cuda") + + assert sparse_emb.device == "cuda" + mock_st.SentenceTransformer.assert_called_once_with( + "naver/splade-cocondenser-ensembledistil", + device="cuda", + trust_remote_code=True, + ) + + @pytest.mark.skipif( + not 
RUN_INTEGRATION_TESTS, + reason="Integration test skipped. Set ZVEC_RUN_INTEGRATION_TESTS=1 to run.", + ) + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_embed_success(self, mock_require_module): + """Test successful sparse embedding generation with official API.""" + import numpy as np + + # Clear model cache to ensure fresh mock + from zvec.extension.sentence_transformer_embedding_function import ( + DefaultLocalSparseEmbedding, + ) + + DefaultLocalSparseEmbedding.clear_cache() + + # Create a mock sparse matrix that simulates scipy.sparse behavior + # The code will call: sparse_matrix[0].toarray().flatten() + mock_sparse_matrix = Mock() + + # Create a dense array representation with vocab_size=30522 + vocab_size = 30522 + dense_array = np.zeros(vocab_size) + # Set specific non-zero values at indices [10, 245, 1023, 5678] + dense_array[10] = 0.5 + dense_array[245] = 0.8 + dense_array[1023] = 1.2 + dense_array[5678] = 0.3 + + # Mock the method chain: sparse_matrix[0].toarray().flatten() + mock_row = Mock() + mock_dense = Mock() + mock_row.toarray.return_value = mock_dense + mock_dense.flatten.return_value = dense_array + mock_sparse_matrix.__getitem__ = Mock(return_value=mock_row) + + # Also mock hasattr check for 'toarray' + mock_sparse_matrix.toarray = Mock() + + mock_st = Mock() + mock_model = Mock() + mock_model.device = "cpu" + + # Configure mock methods to return sparse matrix + # Must set return_value BEFORE hasattr() check in the code + mock_model.encode_query = Mock(return_value=mock_sparse_matrix) + mock_model.encode_document = Mock(return_value=mock_sparse_matrix) + + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + sparse_emb = DefaultLocalSparseEmbedding() + result = sparse_emb.embed("machine learning") + + # Verify result is a dictionary + assert isinstance(result, dict) + # Verify keys are integers and values are floats + assert all(isinstance(k, int) for k in 
result.keys()) + assert all(isinstance(v, float) for v in result.values()) + # Verify all values are positive + assert all(v > 0 for v in result.values()) + # Sparse vectors should have specific dimensions + assert len(result) == 4 + + # Verify output is sorted by indices (keys) + keys = list(result.keys()) + assert keys == sorted(keys), ( + "Sparse vector keys must be sorted in ascending order" + ) + + # Verify expected keys + assert keys == [10, 245, 1023, 5678] + + # Verify encode_query was called with a list + mock_model.encode_query.assert_called_once() + call_args = mock_model.encode_query.call_args[0][0] + assert isinstance(call_args, list) + assert call_args == ["machine learning"] + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_embed_empty_input(self, mock_require_module): + """Test embedding with empty input.""" + mock_st = Mock() + mock_model = Mock() + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + sparse_emb = DefaultLocalSparseEmbedding() + + with pytest.raises(ValueError, match="Input text cannot be empty"): + sparse_emb.embed("") + + with pytest.raises(ValueError, match="Input text cannot be empty"): + sparse_emb.embed(" ") + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_embed_non_string_input(self, mock_require_module): + """Test embedding with non-string input.""" + mock_st = Mock() + mock_model = Mock() + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + sparse_emb = DefaultLocalSparseEmbedding() + + with pytest.raises(TypeError, match="Expected 'input' to be str"): + sparse_emb.embed(123) + + with pytest.raises(TypeError, match="Expected 'input' to be str"): + sparse_emb.embed(["text"]) + + @pytest.mark.skipif( + not RUN_INTEGRATION_TESTS, + reason="Integration test skipped. 
Set ZVEC_RUN_INTEGRATION_TESTS=1 to run.", + ) + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_callable_interface(self, mock_require_module): + """Test that DefaultSparseEmbedding is callable.""" + import numpy as np + + # Clear model cache + from zvec.extension.sentence_transformer_embedding_function import ( + DefaultLocalSparseEmbedding, + ) + + DefaultLocalSparseEmbedding.clear_cache() + + # Create a mock sparse matrix + mock_sparse_matrix = Mock() + + # Create a dense array representation with vocab_size=30522 + vocab_size = 30522 + dense_array = np.zeros(vocab_size) + # Set specific non-zero values at indices [100, 200, 300] + dense_array[100] = 1.0 + dense_array[200] = 0.5 + dense_array[300] = 0.8 + + # Mock the method chain: sparse_matrix[0].toarray().flatten() + mock_row = Mock() + mock_dense = Mock() + mock_row.toarray.return_value = mock_dense + mock_dense.flatten.return_value = dense_array + mock_sparse_matrix.__getitem__ = Mock(return_value=mock_row) + + # Also mock hasattr check for 'toarray' + mock_sparse_matrix.toarray = Mock() + + mock_st = Mock() + mock_model = Mock() + mock_model.device = "cpu" + + # Configure mock methods + mock_model.encode_query = Mock(return_value=mock_sparse_matrix) + mock_model.encode_document = Mock(return_value=mock_sparse_matrix) + + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + sparse_emb = DefaultLocalSparseEmbedding() + + # Test callable interface + result = sparse_emb("test input") + assert isinstance(result, dict) + assert all(isinstance(k, int) for k in result.keys()) + + # Verify sorted output + keys = list(result.keys()) + assert keys == sorted(keys), "Callable interface must also return sorted keys" + assert keys == [100, 200, 300] + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_model_loading_failure(self, mock_require_module): + """Test handling of model loading failure.""" + # Clear 
model cache to ensure the test actually tries to load the model + from zvec.extension.sentence_transformer_embedding_function import ( + DefaultLocalSparseEmbedding, + ) + + DefaultLocalSparseEmbedding.clear_cache() + + mock_st = Mock() + mock_st.SentenceTransformer.side_effect = Exception("Model not found") + mock_require_module.return_value = mock_st + + with pytest.raises( + ValueError, match="Failed to load Sentence Transformer model" + ): + DefaultLocalSparseEmbedding() + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_inference_failure(self, mock_require_module): + """Test handling of inference failure.""" + # Clear model cache + from zvec.extension.sentence_transformer_embedding_function import ( + DefaultLocalSparseEmbedding, + ) + + DefaultLocalSparseEmbedding.clear_cache() + + mock_st = Mock() + mock_model = Mock() + mock_model.device = "cpu" + + # Configure mock methods to raise RuntimeError + mock_model.encode_query = Mock(side_effect=RuntimeError("CUDA out of memory")) + mock_model.encode_document = Mock( + side_effect=RuntimeError("CUDA out of memory") + ) + + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + sparse_emb = DefaultLocalSparseEmbedding() + + with pytest.raises(RuntimeError, match="Failed to generate sparse embedding"): + sparse_emb.embed("test input") + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_sparse_vector_properties(self, mock_require_module): + """Test properties of sparse vectors (sparsity, non-zero values, sorted order).""" + import numpy as np + + # Clear model cache + from zvec.extension.sentence_transformer_embedding_function import ( + DefaultLocalSparseEmbedding, + ) + + DefaultLocalSparseEmbedding.clear_cache() + + # Create a mock sparse matrix that simulates scipy.sparse behavior + # The code will call: sparse_matrix[0].toarray().flatten() + mock_sparse_matrix = Mock() + + # Create a dense array 
representation with vocab_size=30522 + vocab_size = 30522 + dense_array = np.zeros(vocab_size) + # Set specific non-zero values at indices [50, 100, 200, 400, 500] + dense_array[50] = 3.0 + dense_array[100] = 2.0 + dense_array[200] = 1.5 + dense_array[400] = 2.5 + dense_array[500] = 1.8 + + # Mock the method chain: sparse_matrix[0].toarray().flatten() + mock_row = Mock() + mock_dense = Mock() + mock_row.toarray.return_value = mock_dense + mock_dense.flatten.return_value = dense_array + mock_sparse_matrix.__getitem__ = Mock(return_value=mock_row) + + # Also mock hasattr check for 'toarray' + mock_sparse_matrix.toarray = Mock() + + mock_st = Mock() + mock_model = Mock() + mock_model.device = "cpu" + + # Configure mock methods + mock_model.encode_query = Mock(return_value=mock_sparse_matrix) + mock_model.encode_document = Mock(return_value=mock_sparse_matrix) + + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + sparse_emb = DefaultLocalSparseEmbedding() + result = sparse_emb.embed("test") + + # Verify sparsity: result should have much fewer dimensions than vocab_size + assert len(result) < vocab_size + # All values should be positive + assert all(v > 0 for v in result.values()) + + # Verify keys are sorted in ascending order + keys = list(result.keys()) + assert keys == sorted(keys), "Sparse vector keys must be sorted" + + # Verify the specific non-zero indices are present and sorted + # Expected order: [50, 100, 200, 400, 500] (sorted) + expected_keys = [50, 100, 200, 400, 500] + assert keys == expected_keys, f"Expected {expected_keys}, got {keys}" + + # First key should be smallest + if len(result) > 0: + first_key = next(iter(result.keys())) + assert first_key == min(result.keys()), "First key must be the smallest" + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_output_sorted_by_indices(self, mock_require_module): + """Test that output dictionary is always sorted by indices 
(keys) in ascending order.""" + import numpy as np + + # Clear model cache + from zvec.extension.sentence_transformer_embedding_function import ( + DefaultLocalSparseEmbedding, + ) + + DefaultLocalSparseEmbedding.clear_cache() + + # Create sparse output with deliberately out-of-order indices + # Non-sequential indices: 9999, 5, 1234, 77, 500 + mock_sparse_matrix = Mock() + + # Create a dense array representation with vocab_size=30522 + vocab_size = 30522 + dense_array = np.zeros(vocab_size) + # Set specific non-zero values at out-of-order indices + dense_array[9999] = 1.5 + dense_array[5] = 2.0 + dense_array[1234] = 0.8 + dense_array[77] = 3.2 + dense_array[500] = 1.1 + + # Mock the method chain: sparse_matrix[0].toarray().flatten() + mock_row = Mock() + mock_dense = Mock() + mock_row.toarray.return_value = mock_dense + mock_dense.flatten.return_value = dense_array + mock_sparse_matrix.__getitem__ = Mock(return_value=mock_row) + + # Also mock hasattr check for 'toarray' + mock_sparse_matrix.toarray = Mock() + + mock_st = Mock() + mock_model = Mock() + mock_model.device = "cpu" + + # Configure mock methods + mock_model.encode_query = Mock(return_value=mock_sparse_matrix) + mock_model.encode_document = Mock(return_value=mock_sparse_matrix) + + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + sparse_emb = DefaultLocalSparseEmbedding() + result = sparse_emb.embed("test sorting") + + # Extract keys from result + result_keys = list(result.keys()) + + # Verify keys are sorted + assert result_keys == sorted(result_keys), ( + f"Keys must be sorted in ascending order. " + f"Got: {result_keys}, Expected: {sorted(result_keys)}" + ) + + # Verify expected keys are present and in correct order + # Expected sorted order: [5, 77, 500, 1234, 9999] + expected_sorted_keys = [5, 77, 500, 1234, 9999] + assert result_keys == expected_sorted_keys, ( + f"All expected keys should be present in sorted order. 
" + f"Expected: {expected_sorted_keys}, Got: {result_keys}" + ) + + # Verify first and last keys + assert result_keys[0] == 5, "First key must be minimum" + assert result_keys[-1] == 9999, "Last key must be maximum" + + # Verify iteration order matches sorted order + for i, (key, value) in enumerate(result.items()): + if i > 0: + prev_key = list(result.keys())[i - 1] + assert key > prev_key, ( + f"Key at position {i} must be greater than previous key" + ) + + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_device_property(self, mock_require_module): + """Test device property returns correct device.""" + mock_st = Mock() + mock_model = Mock() + mock_model.device = "cuda" + mock_st.SentenceTransformer.return_value = mock_model + mock_require_module.return_value = mock_st + + sparse_emb = DefaultLocalSparseEmbedding(device="cuda") + assert sparse_emb.device == "cuda" + + @pytest.mark.skipif( + not RUN_INTEGRATION_TESTS, + reason="Integration test: requires ZVEC_RUN_INTEGRATION_TESTS=1 and model download", + ) + @patch("zvec.extension.sentence_transformer_function.require_module") + def test_modelscope_source(self, mock_require_module): + """Test initialization with ModelScope source.""" + mock_st = Mock() + mock_ms = Mock() + mock_model = Mock() + mock_model.device = "cpu" + mock_st.SentenceTransformer.return_value = mock_model + + # Mock ModelScope snapshot_download + with patch( + "modelscope.hub.snapshot_download.snapshot_download", + return_value="/cache/splade-cocondenser", + ): + mock_require_module.side_effect = ( + lambda m: mock_st if m == "sentence_transformers" else mock_ms + ) + + sparse_emb = DefaultLocalSparseEmbedding(model_source="modelscope") + + assert sparse_emb.model_name == "naver/splade-cocondenser-ensembledistil" + assert sparse_emb.model_source == "modelscope" + + @pytest.mark.skipif( + not RUN_INTEGRATION_TESTS, + reason="Integration test: requires ZVEC_RUN_INTEGRATION_TESTS=1 and model download", + ) + def 
test_integration_real_model(self): + """Integration test with real SPLADE model (requires model download). + + This test uses naver/splade-cocondenser-ensembledistil instead of + naver/splade-v3 because splade-v3 requires Hugging Face authentication. + The cocondenser-ensembledistil model is publicly accessible and provides + comparable performance. + + To run this test: + export ZVEC_RUN_INTEGRATION_TESTS=1 + pytest tests/test_embedding.py::TestDefaultSparseEmbedding::test_integration_real_model -v + + Note: First run will download ~100MB model from Hugging Face. + + Alternative models: + If you have access to splade-v3, you can create a custom embedding + class following the example in DefaultSparseEmbedding docstring. + """ + # Clear model cache to ensure fresh load + from zvec.extension.sentence_transformer_embedding_function import ( + DefaultLocalSparseEmbedding, + ) + + DefaultLocalSparseEmbedding.clear_cache() + + sparse_emb = DefaultLocalSparseEmbedding() + + # Test with real input + text = "machine learning and artificial intelligence" + result = sparse_emb.embed(text) + + # Verify result structure + assert isinstance(result, dict) + assert len(result) > 0 + assert all(isinstance(k, int) and k >= 0 for k in result.keys()) + assert all(isinstance(v, float) and v > 0 for v in result.values()) + + # SPLADE typically produces 100-300 non-zero dimensions + assert 50 < len(result) < 500 + + # Verify keys are sorted in ascending order + keys = list(result.keys()) + assert keys == sorted(keys), "Real model output must be sorted by indices" + + # Test callable interface + result2 = sparse_emb(text) + assert result == result2 + + @pytest.mark.skipif( + not RUN_INTEGRATION_TESTS, + reason="Integration test: requires ZVEC_RUN_INTEGRATION_TESTS=1", + ) + def test_integration_multiple_inputs(self): + """Integration test with multiple different inputs.""" + # Clear model cache + from zvec.extension.sentence_transformer_embedding_function import ( + 
DefaultLocalSparseEmbedding, + ) + + DefaultLocalSparseEmbedding.clear_cache() + + sparse_emb = DefaultLocalSparseEmbedding() + + texts = [ + "Hello, world!", + "Machine learning is fascinating", + "Python programming language", + ] + + results = [sparse_emb.embed(text) for text in texts] + + # All results should be different + assert len(results) == 3 + assert all(isinstance(r, dict) for r in results) + + # Different inputs should produce different sparse vectors + assert results[0] != results[1] + assert results[1] != results[2] + + # All results must be sorted by indices + for i, result in enumerate(results): + keys = list(result.keys()) + assert keys == sorted(keys), f"Result {i} must have sorted keys" + + +# ---------------------------- +# BM25EmbeddingFunction Test Case +# ---------------------------- +class TestBM25EmbeddingFunction: + """Test suite for BM25EmbeddingFunction (BM25-based sparse embedding using DashText SDK).""" + + def test_init_with_built_in_encoder(self): + """Test successful initialization with built-in encoder (no corpus).""" + with patch( + "zvec.extension.bm25_embedding_function.require_module" + ) as mock_require: + mock_dashtext = Mock() + mock_encoder = Mock() + mock_dashtext.SparseVectorEncoder.default.return_value = mock_encoder + mock_require.return_value = mock_dashtext + + # Test with default language (Chinese) + bm25 = BM25EmbeddingFunction() + + assert bm25.corpus_size == 0 + assert bm25.encoding_type == "query" + assert bm25.language == "zh" + mock_dashtext.SparseVectorEncoder.default.assert_called_once_with(name="zh") + + def test_init_with_custom_encoder(self): + """Test successful initialization with custom encoder (with corpus).""" + corpus = [ + "a cat is a feline and likes to purr", + "a dog is the human's best friend", + "a bird is a beautiful animal that can fly", + ] + + with patch( + "zvec.extension.bm25_embedding_function.require_module" + ) as mock_require: + mock_dashtext = Mock() + mock_encoder = Mock() + 
mock_dashtext.SparseVectorEncoder.return_value = mock_encoder + mock_require.return_value = mock_dashtext + + bm25 = BM25EmbeddingFunction(corpus=corpus, b=0.75, k1=1.2) + + assert bm25.corpus_size == 3 + assert bm25.encoding_type == "query" + mock_dashtext.SparseVectorEncoder.assert_called_once_with(b=0.75, k1=1.2) + mock_encoder.train.assert_called_once_with(corpus) + + def test_init_with_empty_corpus(self): + """Test initialization with empty corpus raises ValueError.""" + with pytest.raises(ValueError, match="Corpus must be a non-empty list"): + BM25EmbeddingFunction(corpus=[]) + + def test_init_with_invalid_corpus(self): + """Test initialization with invalid corpus elements.""" + with pytest.raises(ValueError, match="All corpus documents must be strings"): + BM25EmbeddingFunction(corpus=["text", 123, "another"]) + + with pytest.raises(ValueError, match="All corpus documents must be strings"): + BM25EmbeddingFunction(corpus=[None, "text"]) + + def test_init_with_language_parameter(self): + """Test initialization with different language settings.""" + with patch( + "zvec.extension.bm25_embedding_function.require_module" + ) as mock_require: + mock_dashtext = Mock() + mock_encoder = Mock() + mock_dashtext.SparseVectorEncoder.default.return_value = mock_encoder + mock_require.return_value = mock_dashtext + + # Test English language + bm25_en = BM25EmbeddingFunction(language="en") + assert bm25_en.language == "en" + mock_dashtext.SparseVectorEncoder.default.assert_called_with(name="en") + + def test_init_with_encoding_type(self): + """Test initialization with different encoding types.""" + with patch( + "zvec.extension.bm25_embedding_function.require_module" + ) as mock_require: + mock_dashtext = Mock() + mock_encoder = Mock() + mock_dashtext.SparseVectorEncoder.default.return_value = mock_encoder + mock_require.return_value = mock_dashtext + + # Test document encoding type + bm25_doc = BM25EmbeddingFunction(encoding_type="document") + assert bm25_doc.encoding_type 
== "document" + + def test_init_with_missing_dashtext_library(self): + """Test initialization fails when dashtext library is not installed.""" + with patch( + "zvec.extension.bm25_embedding_function.require_module" + ) as mock_require: + mock_require.side_effect = ImportError("dashtext package is required") + + with pytest.raises(ImportError, match="dashtext package is required"): + BM25EmbeddingFunction() + + def test_embed_with_query_encoding(self): + """Test successful sparse embedding generation with query encoding.""" + with patch( + "zvec.extension.bm25_embedding_function.require_module" + ) as mock_require: + mock_dashtext = Mock() + mock_encoder = Mock() + + # Mock encode_queries to return sparse vector + mock_encoder.encode_queries.return_value = { + 5: 0.89, + 12: 1.45, + 23: 0.67, + 45: 1.12, + } + + mock_dashtext.SparseVectorEncoder.default.return_value = mock_encoder + mock_require.return_value = mock_dashtext + + bm25 = BM25EmbeddingFunction(encoding_type="query") + # Clear LRU cache to ensure fresh call + bm25.embed.cache_clear() + result = bm25.embed("cat purr loud") + + # Verify result structure + assert isinstance(result, dict) + assert all(isinstance(k, int) for k in result.keys()) + assert all(isinstance(v, float) for v in result.values()) + + # Verify all values are positive + assert all(v > 0 for v in result.values()) + + # Verify output is sorted by indices + keys = list(result.keys()) + assert keys == sorted(keys), "Output must be sorted by indices" + + # Verify expected keys from mock response + assert result == {5: 0.89, 12: 1.45, 23: 0.67, 45: 1.12} + + # Verify encode_queries was called + mock_encoder.encode_queries.assert_called_once_with("cat purr loud") + + def test_embed_with_document_encoding(self): + """Test successful sparse embedding generation with document encoding.""" + with patch( + "zvec.extension.bm25_embedding_function.require_module" + ) as mock_require: + mock_dashtext = Mock() + mock_encoder = Mock() + + # Mock 
encode_documents to return sparse vector + mock_encoder.encode_documents.return_value = {10: 1.5, 20: 2.3} + + mock_dashtext.SparseVectorEncoder.default.return_value = mock_encoder + mock_require.return_value = mock_dashtext + + bm25 = BM25EmbeddingFunction(encoding_type="document") + bm25.embed.cache_clear() + result = bm25.embed("document text") + + assert result == {10: 1.5, 20: 2.3} + mock_encoder.encode_documents.assert_called_once_with("document text") + + def test_embed_with_empty_input(self): + """Test embedding with empty input raises ValueError.""" + with patch( + "zvec.extension.bm25_embedding_function.require_module" + ) as mock_require: + mock_dashtext = Mock() + mock_encoder = Mock() + mock_dashtext.SparseVectorEncoder.default.return_value = mock_encoder + mock_require.return_value = mock_dashtext + + bm25 = BM25EmbeddingFunction() + + with pytest.raises(ValueError, match="Input text cannot be empty"): + bm25.embed("") + + with pytest.raises(ValueError, match="Input text cannot be empty"): + bm25.embed(" ") + + def test_embed_with_non_string_input(self): + """Test embedding with non-string input raises TypeError.""" + with patch( + "zvec.extension.bm25_embedding_function.require_module" + ) as mock_require: + mock_dashtext = Mock() + mock_encoder = Mock() + mock_dashtext.SparseVectorEncoder.default.return_value = mock_encoder + mock_require.return_value = mock_dashtext + + bm25 = BM25EmbeddingFunction() + + # Test with hashable non-string types - should get our custom error message + with pytest.raises(TypeError, match="Expected 'input' to be str"): + bm25.embed(123) + + with pytest.raises(TypeError, match="Expected 'input' to be str"): + bm25.embed(None) + + # Test with unhashable type (list) + # Note: lru_cache raises TypeError("unhashable type: 'list'") before our type check + # This is still a valid type error, just caught at a different layer + with pytest.raises(TypeError, match="unhashable type"): + bm25.embed(["text"]) + + def 
test_embed_callable_interface(self): + """Test that BM25EmbeddingFunction is callable.""" + with patch( + "zvec.extension.bm25_embedding_function.require_module" + ) as mock_require: + mock_dashtext = Mock() + mock_encoder = Mock() + mock_encoder.encode_queries.return_value = {10: 1.5} + mock_dashtext.SparseVectorEncoder.default.return_value = mock_encoder + mock_require.return_value = mock_dashtext + + bm25 = BM25EmbeddingFunction() + bm25.embed.cache_clear() + + # Test callable interface + result = bm25("test query") + assert isinstance(result, dict) + assert 10 in result + + def test_embed_output_sorted_by_indices(self): + """Test that output is always sorted by indices in ascending order.""" + with patch( + "zvec.extension.bm25_embedding_function.require_module" + ) as mock_require: + mock_dashtext = Mock() + mock_encoder = Mock() + + # Mock encode_queries with unsorted indices + mock_encoder.encode_queries.return_value = { + 9999: 1.5, + 5: 2.0, + 1234: 0.8, + 77: 3.2, + 500: 1.1, + } + + mock_dashtext.SparseVectorEncoder.default.return_value = mock_encoder + mock_require.return_value = mock_dashtext + + bm25 = BM25EmbeddingFunction() + bm25.embed.cache_clear() + result = bm25.embed("test query") + + # Verify keys are sorted + result_keys = list(result.keys()) + assert result_keys == sorted(result_keys), ( + f"Keys must be sorted. 
Got: {result_keys}, Expected: {sorted(result_keys)}" + ) + + # Verify expected sorted order: [5, 77, 500, 1234, 9999] + expected_keys = [5, 77, 500, 1234, 9999] + assert result_keys == expected_keys + + def test_embed_filters_zero_values(self): + """Test that zero and negative values are filtered out.""" + with patch( + "zvec.extension.bm25_embedding_function.require_module" + ) as mock_require: + mock_dashtext = Mock() + mock_encoder = Mock() + + # Mock encode_queries with zero and negative values + mock_encoder.encode_queries.return_value = { + 0: 1.5, # Positive - should be included + 1: 0.0, # Zero - should be filtered + 2: -0.5, # Negative - should be filtered + } + + mock_dashtext.SparseVectorEncoder.default.return_value = mock_encoder + mock_require.return_value = mock_dashtext + + bm25 = BM25EmbeddingFunction() + bm25.embed.cache_clear() + result = bm25.embed("test") + + # Only positive token should be in result + assert 0 in result + assert 1 not in result # Zero value filtered + assert 2 not in result # Negative value filtered + assert all(v > 0 for v in result.values()) + + def test_properties(self): + """Test property accessors.""" + corpus = ["doc1", "doc2", "doc3"] + + with patch( + "zvec.extension.bm25_embedding_function.require_module" + ) as mock_require: + mock_dashtext = Mock() + mock_encoder = Mock() + mock_dashtext.SparseVectorEncoder.return_value = mock_encoder + mock_require.return_value = mock_dashtext + + bm25 = BM25EmbeddingFunction( + corpus=corpus, + encoding_type="document", + language="en", + b=0.8, + k1=1.5, + custom_param="test", + ) + + assert bm25.corpus_size == 3 + assert bm25.encoding_type == "document" + assert bm25.language == "en" + assert bm25.extra_params == {"custom_param": "test"} + + @pytest.mark.skipif( + not RUN_INTEGRATION_TESTS, + reason="Integration test skipped. Set ZVEC_RUN_INTEGRATION_TESTS=1 to run.", + ) + def test_real_dashtext_bm25_embedding(self): + """Integration test with real DashText library. 
+ + To run this test: + export ZVEC_RUN_INTEGRATION_TESTS=1 + pip install dashtext + + Note: This test requires the dashtext package to be installed. + """ + # Test built-in encoder (Chinese) + bm25_zh = BM25EmbeddingFunction(language="zh", encoding_type="query") + + query_zh = "什么是向量检索服务" + result_zh = bm25_zh.embed(query_zh) + + assert isinstance(result_zh, dict) + assert len(result_zh) > 0 + assert all(isinstance(k, int) for k in result_zh.keys()) + assert all(isinstance(v, float) and v > 0 for v in result_zh.values()) + + # Verify sorted output + keys = list(result_zh.keys()) + assert keys == sorted(keys), "Real DashText BM25 output must be sorted" + + # Test custom corpus + corpus = [ + "The cat sits on the mat", + "The dog plays in the garden", + "Birds fly in the sky", + "Fish swim in the water", + ] + + bm25_custom = BM25EmbeddingFunction(corpus=corpus, encoding_type="query") + + query_en = "cat on mat" + result_en = bm25_custom.embed(query_en) + + assert isinstance(result_en, dict) + assert len(result_en) > 0 + assert all(isinstance(k, int) for k in result_en.keys()) + assert all(isinstance(v, float) and v > 0 for v in result_en.values()) + + # Test callable interface + result2 = bm25_custom(query_en) + assert result_en == result2 + + # Verify properties + assert bm25_custom.corpus_size == 4 diff --git a/python/tests/test_reranker.py b/python/tests/test_reranker.py index 5b2c177d..dced1dd7 100644 --- a/python/tests/test_reranker.py +++ b/python/tests/test_reranker.py @@ -13,11 +13,23 @@ # limitations under the License. 
from __future__ import annotations -from unittest.mock import patch +from unittest.mock import patch, MagicMock import pytest import math +import os -from zvec import RrfReRanker, WeightedReRanker, Doc, MetricType +from zvec import Doc, MetricType +from zvec.extension.multi_vector_reranker import ( + RrfReRanker, + WeightedReRanker, +) +from zvec.extension.sentence_transformer_rerank_function import ( + DefaultLocalReRanker, +) +from zvec.extension.qwen_rerank_function import QwenReRanker + +# Set ZVEC_RUN_INTEGRATION_TESTS=1 to run real API tests +RUN_INTEGRATION_TESTS = os.environ.get("ZVEC_RUN_INTEGRATION_TESTS", "0") == "1" # ---------------------------- @@ -25,23 +37,20 @@ # ---------------------------- class TestRrfReRanker: def test_init(self): - reranker = RrfReRanker( - query="test", topn=5, rerank_field="content", rank_constant=100 - ) - assert reranker.query == "test" + reranker = RrfReRanker(topn=5, rerank_field="content", rank_constant=100) assert reranker.topn == 5 assert reranker.rerank_field == "content" assert reranker.rank_constant == 100 def test_rrf_score(self): - reranker = RrfReRanker(query="test", rank_constant=60) + reranker = RrfReRanker(rank_constant=60) # 根据公式 1.0 / (k + rank + 1),其中k=60 assert reranker._rrf_score(0) == 1.0 / (60 + 0 + 1) assert reranker._rrf_score(1) == 1.0 / (60 + 1 + 1) assert reranker._rrf_score(10) == 1.0 / (60 + 10 + 1) def test_rerank(self): - reranker = RrfReRanker(query="test", topn=3) + reranker = RrfReRanker(topn=3) doc1 = Doc(id="1", score=0.8) doc2 = Doc(id="2", score=0.7) @@ -68,20 +77,18 @@ class TestWeightedReRanker: def test_init(self): weights = {"vector1": 0.7, "vector2": 0.3} reranker = WeightedReRanker( - query="test", topn=5, rerank_field="content", metric=MetricType.L2, weights=weights, ) - assert reranker.query == "test" assert reranker.topn == 5 assert reranker.rerank_field == "content" assert reranker.metric == MetricType.L2 assert reranker.weights == weights def test_normalize_score(self): - 
reranker = WeightedReRanker(query="test") + reranker = WeightedReRanker() score = reranker._normalize_score(1.0, MetricType.L2) expected = 1.0 - 2 * math.atan(1.0) / math.pi @@ -100,9 +107,7 @@ def test_normalize_score(self): def test_rerank(self): weights = {"vector1": 0.7, "vector2": 0.3} - reranker = WeightedReRanker( - query="test", topn=3, weights=weights, metric=MetricType.L2 - ) + reranker = WeightedReRanker(topn=3, weights=weights, metric=MetricType.L2) doc1 = Doc(id="1", score=0.8) doc2 = Doc(id="2", score=0.7) @@ -121,64 +126,843 @@ def test_rerank(self): assert scores == sorted(scores, reverse=True) -# # ---------------------------- -# # QwenReRanker Test Case -# # ---------------------------- -# class TestQwenReRanker: -# def test_init_without_query(self): -# with pytest.raises(ValueError): -# QwenReRanker() -# -# def test_init_without_api_key(self): -# with patch.dict(os.environ, {"DASHSCOPE_API_KEY": ""}): -# with pytest.raises(ValueError, match="DashScope API key is required"): -# QwenReRanker(query="test") -# -# @patch.dict(os.environ, {"DASHSCOPE_API_KEY": "test_key"}) -# def test_init_with_env_api_key(self): -# reranker = QwenReRanker(query="test") -# assert reranker.query == "test" -# assert reranker._api_key == "test_key" -# -# def test_model_property(self): -# reranker = QwenReRanker(query="test", api_key="test_key") -# assert reranker.model == "gte-rerank-v2" -# -# reranker = QwenReRanker(query="test", model="custom-model", api_key="test_key") -# assert reranker.model == "custom-model" -# -# def test_rerank_empty_results(self): -# reranker = QwenReRanker(query="test", api_key="test_key") -# results = reranker.rerank({}) -# assert results == [] -# -# def test_rerank_no_documents(self): -# reranker = QwenReRanker(query="test", api_key="test_key") -# query_results = {"vector1": [Doc(id="1")]} -# with pytest.raises(ValueError, match="No documents to rerank"): -# reranker.rerank(query_results) -# -# @pytest.mark.skip(reason="Qwen ReRanker is not 
available in CI") -# def test_rerank_success(self): -# reranker = QwenReRanker( -# topn=3, -# query="test", -# api_key="*", -# rerank_field="content", -# ) -# query_results = { -# "vector1": [ -# Doc(id="1", fields={"content": "This is a test document."}), -# Doc(id="2", fields={"content": "Another test document."}), -# Doc(id="3", fields={"content": "Yet another test document."}), -# Doc(id="4", fields={"content": "One more test document."}), -# ], -# "vector2": [ -# Doc(id="5", fields={"content": "This is a test document2."}), -# Doc(id="6", fields={"content": "Another test document2."}), -# Doc(id="7", fields={"content": "Yet another test document2."}), -# Doc(id="8", fields={"content": "One more test document2."}), -# ], -# } -# results = reranker.rerank(query_results) -# assert len(results) == 3 +# ---------------------------- +# QwenReRanker Test Case +# ---------------------------- +class TestQwenReRanker: + def test_init_without_query(self): + with pytest.raises(ValueError, match="Query is required for QwenReRanker"): + QwenReRanker(api_key="test_key") + + def test_init_without_api_key(self): + with patch.dict(os.environ, {}, clear=True): + with pytest.raises(ValueError, match="DashScope API key is required"): + QwenReRanker(query="test") + + @patch.dict(os.environ, {"DASHSCOPE_API_KEY": "test_key"}) + def test_init_with_env_api_key(self): + reranker = QwenReRanker(query="test", rerank_field="content") + assert reranker.query == "test" + assert reranker._api_key == "test_key" + assert reranker.rerank_field == "content" + + def test_init_with_explicit_api_key(self): + reranker = QwenReRanker( + query="test", api_key="explicit_key", rerank_field="content" + ) + assert reranker.query == "test" + assert reranker._api_key == "explicit_key" + + def test_model_property(self): + reranker = QwenReRanker( + query="test", api_key="test_key", rerank_field="content" + ) + assert reranker.model == "gte-rerank-v2" + + reranker = QwenReRanker( + query="test", + 
model="custom-model", + api_key="test_key", + rerank_field="content", + ) + assert reranker.model == "custom-model" + + def test_query_property(self): + reranker = QwenReRanker( + query="test query", api_key="test_key", rerank_field="content" + ) + assert reranker.query == "test query" + + def test_topn_property(self): + reranker = QwenReRanker( + query="test", topn=5, api_key="test_key", rerank_field="content" + ) + assert reranker.topn == 5 + + def test_rerank_field_property(self): + reranker = QwenReRanker(query="test", api_key="test_key", rerank_field="title") + assert reranker.rerank_field == "title" + + def test_rerank_empty_results(self): + reranker = QwenReRanker( + query="test", api_key="test_key", rerank_field="content" + ) + results = reranker.rerank({}) + assert results == [] + + def test_rerank_no_valid_documents(self): + reranker = QwenReRanker( + query="test", api_key="test_key", rerank_field="content" + ) + # Document without the rerank_field + query_results = {"vector1": [Doc(id="1")]} + with pytest.raises(ValueError, match="No documents to rerank"): + reranker.rerank(query_results) + + def test_rerank_skip_empty_content(self): + reranker = QwenReRanker( + query="test", api_key="test_key", rerank_field="content" + ) + query_results = { + "vector1": [ + Doc(id="1", fields={"content": ""}), + Doc(id="2", fields={"content": " "}), + ] + } + with pytest.raises(ValueError, match="No documents to rerank"): + reranker.rerank(query_results) + + @patch("zvec.extension.qwen_function.require_module") + def test_rerank_success(self, mock_require_module): + # Mock dashscope module + mock_dashscope = MagicMock() + mock_require_module.return_value = mock_dashscope + + # Mock API response + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.output = { + "results": [ + {"index": 0, "relevance_score": 0.95}, + {"index": 1, "relevance_score": 0.85}, + ] + } + mock_dashscope.TextReRank.call.return_value = mock_response + + reranker = 
QwenReRanker( + query="test query", topn=2, api_key="test_key", rerank_field="content" + ) + + query_results = { + "vector1": [ + Doc(id="1", fields={"content": "Document 1"}), + Doc(id="2", fields={"content": "Document 2"}), + ] + } + + results = reranker.rerank(query_results) + + assert len(results) == 2 + assert results[0].id == "1" + assert results[0].score == 0.95 + assert results[1].id == "2" + assert results[1].score == 0.85 + + # Verify API call + mock_dashscope.TextReRank.call.assert_called_once_with( + model="gte-rerank-v2", + query="test query", + documents=["Document 1", "Document 2"], + top_n=2, + return_documents=False, + ) + + @patch("zvec.extension.qwen_function.require_module") + def test_rerank_deduplicate_documents(self, mock_require_module): + # Mock dashscope module + mock_dashscope = MagicMock() + mock_require_module.return_value = mock_dashscope + + # Mock API response + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.output = { + "results": [ + {"index": 0, "relevance_score": 0.9}, + ] + } + mock_dashscope.TextReRank.call.return_value = mock_response + + reranker = QwenReRanker( + query="test", topn=5, api_key="test_key", rerank_field="content" + ) + + # Same document in multiple vector results + doc1 = Doc(id="1", fields={"content": "Document 1"}) + query_results = {"vector1": [doc1], "vector2": [doc1]} + + results = reranker.rerank(query_results) + + # Should only call API with document once + call_args = mock_dashscope.TextReRank.call.call_args + assert len(call_args[1]["documents"]) == 1 + + @patch("zvec.extension.qwen_function.require_module") + def test_rerank_api_error(self, mock_require_module): + # Mock dashscope module + mock_dashscope = MagicMock() + mock_require_module.return_value = mock_dashscope + + # Mock API error response + mock_response = MagicMock() + mock_response.status_code = 400 + mock_response.message = "Invalid request" + mock_response.code = "InvalidParameter" + 
mock_dashscope.TextReRank.call.return_value = mock_response + + reranker = QwenReRanker( + query="test", api_key="test_key", rerank_field="content" + ) + + query_results = {"vector1": [Doc(id="1", fields={"content": "Document 1"})]} + + with pytest.raises(ValueError, match="DashScope API error"): + reranker.rerank(query_results) + + @patch("zvec.extension.qwen_function.require_module") + def test_rerank_runtime_error(self, mock_require_module): + # Mock dashscope module that raises exception + mock_dashscope = MagicMock() + mock_require_module.return_value = mock_dashscope + mock_dashscope.TextReRank.call.side_effect = Exception("Network error") + + reranker = QwenReRanker( + query="test", api_key="test_key", rerank_field="content" + ) + + query_results = {"vector1": [Doc(id="1", fields={"content": "Document 1"})]} + + with pytest.raises(RuntimeError, match="Failed to call DashScope API"): + reranker.rerank(query_results) + + @pytest.mark.skipif( + not RUN_INTEGRATION_TESTS, + reason="Integration test skipped. Set ZVEC_RUN_INTEGRATION_TESTS=1 to run.", + ) + def test_real_qwen_rerank(self): + """Integration test with real DashScope TextReRank API. + + To run this test, set environment variables: + export ZVEC_RUN_INTEGRATION_TESTS=1 + export DASHSCOPE_API_KEY=your-api-key + """ + # Create reranker with real API + reranker = QwenReRanker( + query="What is machine learning?", + topn=3, + rerank_field="content", + model="gte-rerank-v2", + ) + + # Prepare test documents + query_results = { + "vector1": [ + Doc( + id="1", + score=0.8, + fields={ + "content": "Machine learning is a subset of artificial intelligence that focuses on building systems that can learn from data." + }, + ), + Doc( + id="2", + score=0.7, + fields={ + "content": "The weather is nice today with clear skies and sunshine." + }, + ), + Doc( + id="3", + score=0.75, + fields={ + "content": "Deep learning is a specialized branch of machine learning using neural networks with multiple layers." 
+ }, + ), + ], + "vector2": [ + Doc( + id="4", + score=0.6, + fields={ + "content": "Python is a popular programming language for data science and machine learning applications." + }, + ), + Doc( + id="5", + score=0.65, + fields={ + "content": "A recipe for chocolate cake includes flour, sugar, eggs, and cocoa powder." + }, + ), + ], + } + + # Call real API + results = reranker.rerank(query_results) + + # Verify results + assert len(results) <= 3, "Should return at most topn documents" + assert len(results) > 0, "Should return at least one document" + + # All results should have valid scores + for doc in results: + assert hasattr(doc, "score"), "Each document should have a score" + assert isinstance(doc.score, (int, float)), "Score should be numeric" + assert doc.score > 0, "Score should be positive" + + # Verify scores are in descending order + scores = [doc.score for doc in results] + assert scores == sorted(scores, reverse=True), ( + "Results should be sorted by score in descending order" + ) + + # Verify relevant documents are ranked higher + # Document 1 and 3 are about machine learning, should rank higher than weather/recipe docs + result_ids = [doc.id for doc in results] + + # At least one of the ML-related documents should be in top results + ml_related_docs = {"1", "3", "4"} + assert any(doc_id in ml_related_docs for doc_id in result_ids[:2]), ( + "ML-related documents should rank higher" + ) + + # Print results for manual verification (useful during development) + print("\nReranking results:") + for i, doc in enumerate(results, 1): + print(f"{i}. 
ID={doc.id}, Score={doc.score:.4f}") + if doc.fields: + content = doc.field("content") + if content: + print(f" Content: {content[:80]}...") + + +# ---------------------------- +# DefaultLocalReRanker Test Case +# ---------------------------- +class TestDefaultLocalReRanker: + """Test cases for DefaultLocalReRanker.""" + + def test_init_without_query(self): + """Test initialization fails without query.""" + with pytest.raises( + ValueError, match="Query is required for DefaultLocalReRanker" + ): + DefaultLocalReRanker(rerank_field="content") + + def test_init_with_empty_query(self): + """Test initialization fails with empty query.""" + with pytest.raises( + ValueError, match="Query is required for DefaultLocalReRanker" + ): + DefaultLocalReRanker(query="", rerank_field="content") + + @patch("zvec.extension.sentence_transformer_rerank_function.require_module") + def test_init_success(self, mock_require_module): + """Test successful initialization with mocked model.""" + # Mock sentence_transformers module + mock_st = MagicMock() + mock_model = MagicMock() + mock_model.predict = MagicMock() # Cross-encoder has predict method + mock_model.device = "cpu" + mock_st.CrossEncoder.return_value = mock_model + mock_require_module.return_value = mock_st + + reranker = DefaultLocalReRanker( + query="test query", + topn=5, + rerank_field="content", + model_name="cross-encoder/ms-marco-MiniLM-L6-v2", + ) + + assert reranker.query == "test query" + assert reranker.topn == 5 + assert reranker.rerank_field == "content" + assert reranker.model_name == "cross-encoder/ms-marco-MiniLM-L6-v2" + assert reranker.model_source == "huggingface" + assert reranker.batch_size == 32 + + @pytest.mark.skipif( + not RUN_INTEGRATION_TESTS, + reason="Integration test skipped. 
Set ZVEC_RUN_INTEGRATION_TESTS=1 to run.", + ) + @patch("zvec.extension.sentence_transformer_rerank_function.require_module") + def test_init_with_custom_params(self, mock_require_module): + """Test initialization with custom parameters.""" + mock_st = MagicMock() + mock_model = MagicMock() + mock_model.predict = MagicMock() + mock_model.device = "cuda" + mock_st.CrossEncoder.return_value = mock_model + mock_require_module.return_value = mock_st + + reranker = DefaultLocalReRanker( + query="custom query", + topn=10, + rerank_field="title", + model_name="cross-encoder/ms-marco-MiniLM-L12-v2", + model_source="modelscope", + device="cuda", + batch_size=64, + ) + + assert reranker.query == "custom query" + assert reranker.topn == 10 + assert reranker.rerank_field == "title" + assert reranker.model_name == "cross-encoder/ms-marco-MiniLM-L12-v2" + assert reranker.model_source == "modelscope" + assert reranker.batch_size == 64 + + @patch("zvec.extension.sentence_transformer_rerank_function.require_module") + def test_init_invalid_model(self, mock_require_module): + """Test initialization fails with non-cross-encoder model.""" + # Mock a model without predict method (not a cross-encoder) + mock_st = MagicMock() + mock_model = MagicMock(spec=[]) # No predict method + mock_st.CrossEncoder.return_value = mock_model + mock_require_module.return_value = mock_st + + with pytest.raises(ValueError, match="does not appear to be a cross-encoder"): + DefaultLocalReRanker(query="test", rerank_field="content") + + def test_query_property(self): + """Test query property.""" + mock_model = MagicMock() + mock_model.predict = MagicMock() + + mock_st = MagicMock() + mock_st.CrossEncoder.return_value = mock_model + + with patch( + "zvec.extension.sentence_transformer_rerank_function.require_module", + return_value=mock_st, + ): + reranker = DefaultLocalReRanker(query="test query", rerank_field="content") + assert reranker.query == "test query" + + def test_topn_property(self): + """Test topn 
property.""" + mock_model = MagicMock() + mock_model.predict = MagicMock() + + mock_st = MagicMock() + mock_st.CrossEncoder.return_value = mock_model + + with patch( + "zvec.extension.sentence_transformer_rerank_function.require_module", + return_value=mock_st, + ): + reranker = DefaultLocalReRanker( + query="test", topn=15, rerank_field="content" + ) + assert reranker.topn == 15 + + def test_rerank_field_property(self): + """Test rerank_field property.""" + mock_model = MagicMock() + mock_model.predict = MagicMock() + + mock_st = MagicMock() + mock_st.CrossEncoder.return_value = mock_model + + with patch( + "zvec.extension.sentence_transformer_rerank_function.require_module", + return_value=mock_st, + ): + reranker = DefaultLocalReRanker(query="test", rerank_field="title") + assert reranker.rerank_field == "title" + + def test_batch_size_property(self): + """Test batch_size property.""" + mock_model = MagicMock() + mock_model.predict = MagicMock() + + mock_st = MagicMock() + mock_st.CrossEncoder.return_value = mock_model + + with patch( + "zvec.extension.sentence_transformer_rerank_function.require_module", + return_value=mock_st, + ): + reranker = DefaultLocalReRanker( + query="test", rerank_field="content", batch_size=128 + ) + assert reranker.batch_size == 128 + + def test_rerank_empty_results(self): + """Test rerank with empty query_results.""" + mock_model = MagicMock() + mock_model.predict = MagicMock() + + mock_st = MagicMock() + mock_st.CrossEncoder.return_value = mock_model + + with patch( + "zvec.extension.sentence_transformer_rerank_function.require_module", + return_value=mock_st, + ): + reranker = DefaultLocalReRanker(query="test", rerank_field="content") + results = reranker.rerank({}) + assert results == [] + + def test_rerank_no_valid_documents(self): + """Test rerank with documents missing rerank_field.""" + mock_model = MagicMock() + mock_model.predict = MagicMock() + + mock_st = MagicMock() + mock_st.CrossEncoder.return_value = mock_model + + 
with patch( + "zvec.extension.sentence_transformer_rerank_function.require_module", + return_value=mock_st, + ): + reranker = DefaultLocalReRanker(query="test", rerank_field="content") + + # Document without the rerank_field + query_results = {"vector1": [Doc(id="1")]} + with pytest.raises(ValueError, match="No documents to rerank"): + reranker.rerank(query_results) + + def test_rerank_skip_empty_content(self): + """Test rerank skips documents with empty content.""" + mock_model = MagicMock() + mock_model.predict = MagicMock() + + mock_st = MagicMock() + mock_st.CrossEncoder.return_value = mock_model + + with patch( + "zvec.extension.sentence_transformer_rerank_function.require_module", + return_value=mock_st, + ): + reranker = DefaultLocalReRanker(query="test", rerank_field="content") + + query_results = { + "vector1": [ + Doc(id="1", fields={"content": ""}), + Doc(id="2", fields={"content": " "}), + ] + } + with pytest.raises(ValueError, match="No documents to rerank"): + reranker.rerank(query_results) + + def test_rerank_success(self): + """Test successful rerank with mocked model.""" + # Mock standard cross-encoder model + mock_model = MagicMock() + + # Mock predict method to return scores + import numpy as np + + mock_scores = np.array([0.95, 0.85, 0.75]) + mock_model.predict.return_value = mock_scores + mock_model.device = "cpu" + + # Mock sentence_transformers module + mock_st = MagicMock() + mock_st.CrossEncoder.return_value = mock_model + + with patch( + "zvec.extension.sentence_transformer_rerank_function.require_module", + return_value=mock_st, + ): + reranker = DefaultLocalReRanker( + query="test query", topn=3, rerank_field="content" + ) + + query_results = { + "vector1": [ + Doc(id="1", score=0.8, fields={"content": "Document 1"}), + Doc(id="2", score=0.7, fields={"content": "Document 2"}), + Doc(id="3", score=0.6, fields={"content": "Document 3"}), + ] + } + + results = reranker.rerank(query_results) + + # Verify results + assert len(results) == 3 + 
assert results[0].id == "1" + assert results[0].score == 0.95 + assert results[1].id == "2" + assert results[1].score == 0.85 + assert results[2].id == "3" + assert results[2].score == 0.75 + + # Verify model.predict was called correctly + assert mock_model.predict.called + call_args = mock_model.predict.call_args + pairs = call_args[0][0] + assert len(pairs) == 3 + assert pairs[0] == ["test query", "Document 1"] + assert pairs[1] == ["test query", "Document 2"] + assert pairs[2] == ["test query", "Document 3"] + assert call_args[1]["batch_size"] == 32 + assert call_args[1]["show_progress_bar"] is False + + def test_rerank_with_topn_limit(self): + """Test rerank respects topn limit.""" + mock_model = MagicMock() + + import numpy as np + + mock_scores = np.array([0.9, 0.8, 0.7, 0.6, 0.5]) + mock_model.predict.return_value = mock_scores + + # Mock sentence_transformers module + mock_st = MagicMock() + mock_st.CrossEncoder.return_value = mock_model + + with patch( + "zvec.extension.sentence_transformer_rerank_function.require_module", + return_value=mock_st, + ): + reranker = DefaultLocalReRanker( + query="test", topn=2, rerank_field="content" + ) + + query_results = { + "vector1": [ + Doc(id="1", fields={"content": "Doc 1"}), + Doc(id="2", fields={"content": "Doc 2"}), + Doc(id="3", fields={"content": "Doc 3"}), + Doc(id="4", fields={"content": "Doc 4"}), + Doc(id="5", fields={"content": "Doc 5"}), + ] + } + + results = reranker.rerank(query_results) + + # Should only return top 2 + assert len(results) == 2 + assert results[0].id == "1" + assert results[0].score == 0.9 + assert results[1].id == "2" + assert results[1].score == 0.8 + + def test_rerank_deduplicate_documents(self): + """Test rerank deduplicates documents across multiple vectors.""" + mock_model = MagicMock() + + import numpy as np + + mock_scores = np.array([0.95, 0.85]) + mock_model.predict.return_value = mock_scores + + # Mock sentence_transformers module + mock_st = MagicMock() + 
mock_st.CrossEncoder.return_value = mock_model + + with patch( + "zvec.extension.sentence_transformer_rerank_function.require_module", + return_value=mock_st, + ): + reranker = DefaultLocalReRanker( + query="test", topn=5, rerank_field="content" + ) + + # Same document in multiple vector results + doc1 = Doc(id="1", fields={"content": "Document 1"}) + doc2 = Doc(id="2", fields={"content": "Document 2"}) + + query_results = { + "vector1": [doc1, doc2], + "vector2": [doc1], # doc1 appears in both + } + + results = reranker.rerank(query_results) + + # Should only process each document once + assert len(results) == 2 + assert mock_model.predict.call_count == 1 + + call_args = mock_model.predict.call_args + pairs = call_args[0][0] + assert len(pairs) == 2 # Only 2 unique documents + + def test_rerank_sorting(self): + """Test rerank sorts documents by score in descending order.""" + mock_model = MagicMock() + + import numpy as np + + # Return scores in non-sorted order + mock_scores = np.array([0.6, 0.9, 0.7]) + mock_model.predict.return_value = mock_scores + + # Mock sentence_transformers module + mock_st = MagicMock() + mock_st.CrossEncoder.return_value = mock_model + + with patch( + "zvec.extension.sentence_transformer_rerank_function.require_module", + return_value=mock_st, + ): + reranker = DefaultLocalReRanker( + query="test", topn=3, rerank_field="content" + ) + + query_results = { + "vector1": [ + Doc(id="1", fields={"content": "Doc 1"}), + Doc(id="2", fields={"content": "Doc 2"}), + Doc(id="3", fields={"content": "Doc 3"}), + ] + } + + results = reranker.rerank(query_results) + + # Should be sorted by score (descending) + assert len(results) == 3 + assert results[0].id == "2" # score 0.9 + assert results[0].score == 0.9 + assert results[1].id == "3" # score 0.7 + assert results[1].score == 0.7 + assert results[2].id == "1" # score 0.6 + assert results[2].score == 0.6 + + def test_rerank_model_error(self): + """Test rerank handles model prediction errors.""" + 
mock_model = MagicMock() + + # Mock predict to raise exception + mock_model.predict.side_effect = Exception("Model inference error") + + # Mock sentence_transformers module + mock_st = MagicMock() + mock_st.CrossEncoder.return_value = mock_model + + with patch( + "zvec.extension.sentence_transformer_rerank_function.require_module", + return_value=mock_st, + ): + reranker = DefaultLocalReRanker(query="test", rerank_field="content") + + query_results = {"vector1": [Doc(id="1", fields={"content": "Document 1"})]} + + with pytest.raises(RuntimeError, match="Failed to compute rerank scores"): + reranker.rerank(query_results) + + def test_rerank_with_custom_batch_size(self): + """Test rerank uses custom batch_size.""" + mock_model = MagicMock() + + import numpy as np + + mock_scores = np.array([0.9, 0.8]) + mock_model.predict.return_value = mock_scores + + # Mock sentence_transformers module + mock_st = MagicMock() + mock_st.CrossEncoder.return_value = mock_model + + with patch( + "zvec.extension.sentence_transformer_rerank_function.require_module", + return_value=mock_st, + ): + reranker = DefaultLocalReRanker( + query="test", rerank_field="content", batch_size=64 + ) + + query_results = { + "vector1": [ + Doc(id="1", fields={"content": "Doc 1"}), + Doc(id="2", fields={"content": "Doc 2"}), + ] + } + + reranker.rerank(query_results) + + # Verify batch_size is passed to predict + call_args = mock_model.predict.call_args + assert call_args[1]["batch_size"] == 64 + + @pytest.mark.skipif( + not RUN_INTEGRATION_TESTS, + reason="Integration test skipped. Set ZVEC_RUN_INTEGRATION_TESTS=1 to run.", + ) + def test_real_sentence_transformer_rerank(self): + """Integration test with real SentenceTransformer cross-encoder model. + + To run this test, set environment variable: + export ZVEC_RUN_INTEGRATION_TESTS=1 + + Note: This test requires sentence-transformers package and will + download the MS MARCO MiniLM model (~80MB) on first run. 
+ """ + # Create reranker with real model (using default lightweight model) + reranker = DefaultLocalReRanker( + query="What is machine learning?", + topn=3, + rerank_field="content", + ) + + # Prepare test documents + query_results = { + "vector1": [ + Doc( + id="1", + score=0.8, + fields={ + "content": "Machine learning is a subset of artificial intelligence that focuses on building systems that can learn from data." + }, + ), + Doc( + id="2", + score=0.7, + fields={ + "content": "The weather is nice today with clear skies and sunshine." + }, + ), + Doc( + id="3", + score=0.75, + fields={ + "content": "Deep learning is a specialized branch of machine learning using neural networks with multiple layers." + }, + ), + ], + "vector2": [ + Doc( + id="4", + score=0.6, + fields={ + "content": "Python is a popular programming language for data science and machine learning applications." + }, + ), + Doc( + id="5", + score=0.65, + fields={ + "content": "A recipe for chocolate cake includes flour, sugar, eggs, and cocoa powder." 
+ }, + ), + ], + } + + # Call real model + results = reranker.rerank(query_results) + + # Verify results + assert len(results) <= 3, "Should return at most topn documents" + assert len(results) > 0, "Should return at least one document" + + # All results should have valid scores + for doc in results: + assert hasattr(doc, "score"), "Each document should have a score" + assert isinstance(doc.score, (int, float)), "Score should be numeric" + + # Verify scores are in descending order + scores = [doc.score for doc in results] + assert scores == sorted(scores, reverse=True), ( + "Results should be sorted by score in descending order" + ) + + # Verify relevant documents are ranked higher + # Documents 1, 3, and 4 are about machine learning, should rank higher + result_ids = [doc.id for doc in results] + + # At least one of the ML-related documents should be in top results + ml_related_docs = {"1", "3", "4"} + assert any(doc_id in ml_related_docs for doc_id in result_ids[:2]), ( + "ML-related documents should rank higher" + ) + + # Print results for manual verification (useful during development) + print("\nSentenceTransformer Reranking results:") + for i, doc in enumerate(results, 1): + print(f"{i}. ID={doc.id}, Score={doc.score:.4f}") + if doc.fields: + content = doc.field("content") + if content: + print(f" Content: {content[:80]}...") diff --git a/python/tests/test_util.py b/python/tests/test_util.py index bac8926a..c5a56c1b 100644 --- a/python/tests/test_util.py +++ b/python/tests/test_util.py @@ -87,8 +87,3 @@ def test_require_module_calls_importlib(mock_import_module): mock_import_module.assert_called_once_with("test_module") assert result is mock_module - - -def test_require_module_with_openai(): - with pytest.raises(ImportError) as exc_info: - require_module("openai") diff --git a/python/zvec/__init__.py b/python/zvec/__init__.py index ec35829d..1c8fdfc0 100644 --- a/python/zvec/__init__.py +++ b/python/zvec/__init__.py @@ -27,8 +27,27 @@ from . 
import model as model -# —— Extensions & typing —— -from .extension import DenseEmbeddingFunction, ReRanker, RrfReRanker, WeightedReRanker +# —— Extensions —— +from .extension import ( + BM25EmbeddingFunction, + DefaultLocalDenseEmbedding, + DefaultLocalReRanker, + DefaultLocalSparseEmbedding, + DenseEmbeddingFunction, + OpenAIDenseEmbedding, + OpenAIFunctionBase, + QwenDenseEmbedding, + QwenFunctionBase, + QwenReRanker, + QwenSparseEmbedding, + ReRanker, + RrfReRanker, + SentenceTransformerFunctionBase, + SparseEmbeddingFunction, + WeightedReRanker, +) + +# —— Typing —— from .model import param as param from .model import schema as schema @@ -100,10 +119,22 @@ "HnswQueryParam", "IVFQueryParam", # Extensions - "ReRanker", "DenseEmbeddingFunction", + "SparseEmbeddingFunction", + "QwenFunctionBase", + "OpenAIFunctionBase", + "SentenceTransformerFunctionBase", + "ReRanker", + "DefaultLocalDenseEmbedding", + "DefaultLocalSparseEmbedding", + "BM25EmbeddingFunction", + "OpenAIDenseEmbedding", + "QwenDenseEmbedding", + "QwenSparseEmbedding", "RrfReRanker", "WeightedReRanker", + "DefaultLocalReRanker", + "QwenReRanker", # Typing "DataType", "MetricType", diff --git a/python/zvec/common/constants.py b/python/zvec/common/constants.py index 56b82fde..c8da216c 100644 --- a/python/zvec/common/constants.py +++ b/python/zvec/common/constants.py @@ -16,7 +16,19 @@ from typing import Optional, Union import numpy as np +from typing_extensions import TypeVar +# VectorType: DenseVectorType | SparseVectorType DenseVectorType = Union[list[float], list[int], np.ndarray] SparseVectorType = dict[int, float] VectorType = Optional[Union[DenseVectorType, SparseVectorType]] + +# Embeddable: Text | Image | Audio +TEXT = str +IMAGE = Union[str, bytes, np.ndarray] # file path, raw bytes, or numpy array +AUDIO = Union[str, bytes, np.ndarray] # file path, raw bytes, or numpy array + +Embeddable = Optional[Union[TEXT, IMAGE, AUDIO]] + +# Multimodal Embeddable +MD = TypeVar("MD", bound=Embeddable, 
contravariant=True) diff --git a/python/zvec/extension/__init__.py b/python/zvec/extension/__init__.py index 83421b50..597f91be 100644 --- a/python/zvec/extension/__init__.py +++ b/python/zvec/extension/__init__.py @@ -13,14 +13,37 @@ # limitations under the License. from __future__ import annotations -from .embedding import DenseEmbeddingFunction, QwenEmbeddingFunction -from .rerank import QwenReRanker, ReRanker, RrfReRanker, WeightedReRanker +from .bm25_embedding_function import BM25EmbeddingFunction +from .embedding_function import DenseEmbeddingFunction, SparseEmbeddingFunction +from .multi_vector_reranker import RrfReRanker, WeightedReRanker +from .openai_embedding_function import OpenAIDenseEmbedding +from .openai_function import OpenAIFunctionBase +from .qwen_embedding_function import QwenDenseEmbedding, QwenSparseEmbedding +from .qwen_function import QwenFunctionBase +from .qwen_rerank_function import QwenReRanker +from .rerank_function import RerankFunction as ReRanker +from .sentence_transformer_embedding_function import ( + DefaultLocalDenseEmbedding, + DefaultLocalSparseEmbedding, +) +from .sentence_transformer_function import SentenceTransformerFunctionBase +from .sentence_transformer_rerank_function import DefaultLocalReRanker __all__ = [ + "BM25EmbeddingFunction", + "DefaultLocalDenseEmbedding", + "DefaultLocalReRanker", + "DefaultLocalSparseEmbedding", "DenseEmbeddingFunction", - "QwenEmbeddingFunction", + "OpenAIDenseEmbedding", + "OpenAIFunctionBase", + "QwenDenseEmbedding", + "QwenFunctionBase", "QwenReRanker", + "QwenSparseEmbedding", "ReRanker", "RrfReRanker", + "SentenceTransformerFunctionBase", + "SparseEmbeddingFunction", "WeightedReRanker", ] diff --git a/python/zvec/extension/bm25_embedding_function.py b/python/zvec/extension/bm25_embedding_function.py new file mode 100644 index 00000000..51ab5ac5 --- /dev/null +++ b/python/zvec/extension/bm25_embedding_function.py @@ -0,0 +1,375 @@ +# Copyright 2025-present the zvec project +# +# Licensed 
under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from functools import lru_cache +from typing import Literal, Optional + +from ..common.constants import TEXT, SparseVectorType +from ..tool import require_module +from .embedding_function import SparseEmbeddingFunction + + +class BM25EmbeddingFunction(SparseEmbeddingFunction[TEXT]): + """BM25-based sparse embedding function using DashText SDK. + + This class provides text-to-sparse-vector embedding capabilities using + the DashText library with BM25 algorithm. BM25 (Best Matching 25) is a + probabilistic retrieval function used for lexical search and document + ranking based on term frequency and inverse document frequency. + + BM25 generates sparse vectors where each dimension corresponds to a term in + the vocabulary, and the value represents the BM25 score for that term. It's + particularly effective for: + + - Lexical search and keyword matching + - Document ranking and information retrieval + - Combining with dense embeddings for hybrid search + - Traditional IR tasks where exact term matching is important + + This implementation uses DashText's SparseVectorEncoder, which provides + efficient BM25 computation for Chinese and English text using either a + built-in encoder or custom corpus training. + + Args: + corpus (Optional[list[str]], optional): List of documents to train the + BM25 encoder. If provided, creates a custom encoder trained on this + corpus for better domain-specific accuracy. 
If ``None``, uses the + built-in encoder. Defaults to ``None``. + encoding_type (Literal["query", "document"], optional): Encoding mode + for text processing. Use ``"query"`` for search queries (default) and + ``"document"`` for document indexing. This distinction optimizes the + BM25 scoring for asymmetric retrieval tasks. Defaults to ``"query"``. + language (Literal["zh", "en"], optional): Language for built-in encoder. + Only used when corpus is None. ``"zh"`` for Chinese (trained on Chinese + Wikipedia), ``"en"`` for English. Defaults to ``"zh"``. + b (float, optional): Document length normalization parameter for BM25. + Range [0, 1]. 0 means no normalization, 1 means full normalization. + Only used with custom corpus. Defaults to ``0.75``. + k1 (float, optional): Term frequency saturation parameter for BM25. + Higher values give more weight to term frequency. Only used with + custom corpus. Defaults to ``1.2``. + **kwargs: Additional parameters for DashText encoder customization. + + Attributes: + corpus_size (int): Number of documents in the training corpus (0 if using built-in encoder). + encoding_type (str): The encoding type being used ("query" or "document"). + language (str): The language of the built-in encoder ("zh" or "en"). + + Raises: + ValueError: If corpus is provided but empty or contains non-string elements. + TypeError: If input to ``embed()`` is not a string. + RuntimeError: If DashText encoder initialization or training fails. + + Note: + - Requires Python 3.10, 3.11, or 3.12 + - Requires the ``dashtext`` package: ``pip install dashtext`` + - Two encoder options available: + + 1. **Built-in encoder** (no corpus needed): Pre-trained models for + Chinese (zh) and English (en), good generalization, works out-of-the-box + 2. 
**Custom encoder** (corpus required): Better accuracy for domain-specific + terminology, requires training on your full corpus with BM25 parameters + + - Encoding types: + + * ``encoding_type="query"``: Optimized for search queries (shorter text) + * ``encoding_type="document"``: Optimized for document indexing (longer text) + + - BM25 parameters (b, k1) only apply to custom encoder training + - Output is sorted by indices (vocabulary term IDs) for consistency + - Results are cached (LRU cache, maxsize=10) to reduce computation + - No API key or network connectivity required (local computation) + + Examples: + >>> # Option 1: Using built-in encoder for Chinese (no corpus needed) + >>> from zvec.extension import BM25EmbeddingFunction + >>> + >>> # For query encoding (Chinese) + >>> bm25_query_zh = BM25EmbeddingFunction(language="zh", encoding_type="query") + >>> query_vec = bm25_query_zh.embed("什么是机器学习") + >>> isinstance(query_vec, dict) + True + >>> # query_vec: {1169440797: 0.29, 2045788977: 0.70, ...} + + >>> # For document encoding (Chinese) + >>> bm25_doc_zh = BM25EmbeddingFunction(language="zh", encoding_type="document") + >>> doc_vec = bm25_doc_zh.embed("机器学习是人工智能的一个重要分支...") + >>> isinstance(doc_vec, dict) + True + + >>> # Using built-in encoder for English + >>> bm25_query_en = BM25EmbeddingFunction(language="en", encoding_type="query") + >>> query_vec_en = bm25_query_en.embed("what is vector search service") + >>> isinstance(query_vec_en, dict) + True + + >>> # Option 2: Using custom corpus for domain-specific accuracy + >>> corpus = [ + ... "机器学习是人工智能的一个重要分支", + ... "深度学习使用多层神经网络进行特征提取", + ... "自然语言处理技术用于理解和生成人类语言" + ... ] + >>> bm25_custom = BM25EmbeddingFunction( + ... corpus=corpus, + ... encoding_type="query", + ... b=0.75, + ... k1=1.2 + ... 
) + >>> custom_vec = bm25_custom.embed("机器学习算法") + >>> isinstance(custom_vec, dict) + True + + >>> # Hybrid search: combining with dense embeddings + >>> from zvec.extension import DefaultLocalDenseEmbedding + >>> dense_emb = DefaultLocalDenseEmbedding() + >>> bm25_emb = BM25EmbeddingFunction(language="zh", encoding_type="query") + >>> + >>> query = "machine learning algorithms" + >>> dense_vec = dense_emb.embed(query) # Semantic similarity + >>> sparse_vec = bm25_emb.embed(query) # Lexical matching + >>> # Combine scores for hybrid retrieval + + >>> # Callable interface + >>> sparse_vec = bm25_query_zh("information retrieval") + >>> isinstance(sparse_vec, dict) + True + + >>> # Error handling + >>> try: + ... bm25_query_zh.embed("") # Empty query + ... except ValueError as e: + ... print(f"Error: {e}") + Error: Input text cannot be empty or whitespace only + + See Also: + - ``SparseEmbeddingFunction``: Base class for sparse embeddings + - ``DefaultLocalSparseEmbedding``: SPLADE-based sparse embedding + - ``QwenSparseEmbedding``: API-based sparse embedding using Qwen + - ``DefaultLocalDenseEmbedding``: Dense embedding for semantic search + + References: + - DashText Documentation: https://help.aliyun.com/zh/document_detail/2546039.html + - DashText PyPI: https://pypi.org/project/dashtext/ + - BM25 Algorithm: Robertson & Zaragoza (2009) + """ + + def __init__( + self, + corpus: Optional[list[str]] = None, + encoding_type: Literal["query", "document"] = "query", + language: Literal["zh", "en"] = "zh", + b: float = 0.75, + k1: float = 1.2, + **kwargs, + ): + """Initialize the BM25 embedding function. + + Args: + corpus (Optional[list[str]]): Optional corpus for training custom encoder. + If None, uses built-in encoder. Defaults to None. + encoding_type (Literal["query", "document"]): Text encoding mode. + Use "query" for search queries, "document" for indexing. + Defaults to "query". + language (Literal["zh", "en"]): Language for built-in encoder. 
+ "zh" for Chinese, "en" for English. Defaults to "zh". + b (float): Document length normalization for BM25 [0, 1]. + Only used with custom corpus. Defaults to 0.75. + k1 (float): Term frequency saturation for BM25. + Only used with custom corpus. Defaults to 1.2. + **kwargs: Additional DashText encoder parameters. + + Raises: + ValueError: If corpus is provided but empty or invalid. + ImportError: If dashtext package is not installed. + RuntimeError: If encoder initialization or training fails. + """ + # Validate corpus if provided + if corpus is not None: + if not corpus or not isinstance(corpus, list): + raise ValueError("Corpus must be a non-empty list of strings") + + if not all(isinstance(doc, str) for doc in corpus): + raise ValueError("All corpus documents must be strings") + + # Import dashtext + self._dashtext = require_module("dashtext") + + self._corpus = corpus + self._encoding_type = encoding_type + self._language = language + self._b = b + self._k1 = k1 + self._extra_params = kwargs + + # Initialize the BM25 encoder + self._build_encoder() + + def _build_encoder(self): + """Build the BM25 sparse vector encoder. + + Creates either a built-in encoder (pre-trained) or a custom encoder + trained on the provided corpus. + + Raises: + RuntimeError: If encoder initialization or training fails. + ImportError: If dashtext package is not installed. + """ + try: + if self._corpus is None: + # Use built-in encoder (pre-trained on Wikipedia) + # language: 'zh' for Chinese, 'en' for English + self._encoder = self._dashtext.SparseVectorEncoder.default( + name=self._language + ) + else: + # Create custom encoder with BM25 parameters + self._encoder = self._dashtext.SparseVectorEncoder( + b=self._b, k1=self._k1, **self._extra_params + ) + + # Train encoder with the corpus + self._encoder.train(self._corpus) + + except ImportError as e: + raise ImportError( + "dashtext package is required for BM25EmbeddingFunction. 
" + "Install it with: pip install dashtext" + ) from e + except Exception as e: + if isinstance(e, (ValueError, RuntimeError)): + raise + raise RuntimeError(f"Failed to build BM25 encoder: {e!s}") from e + + @property + def corpus_size(self) -> int: + """int: Number of documents in the training corpus (0 if using built-in encoder).""" + return len(self._corpus) if self._corpus is not None else 0 + + @property + def encoding_type(self) -> str: + """str: The encoding type being used ("query" or "document").""" + return self._encoding_type + + @property + def language(self) -> str: + """str: The language of the built-in encoder ("zh" or "en").""" + return self._language + + @property + def extra_params(self) -> dict: + """dict: Extra parameters for DashText encoder customization.""" + return self._extra_params + + def __call__(self, input: TEXT) -> SparseVectorType: + """Make the embedding function callable. + + Args: + input (TEXT): Input text to embed. + + Returns: + SparseVectorType: Sparse vector as dictionary. + """ + return self.embed(input) + + @lru_cache(maxsize=10) + def embed(self, input: TEXT) -> SparseVectorType: + """Generate BM25 sparse embedding for the input text. + + This method computes BM25 scores for the input text using DashText's + SparseVectorEncoder. The encoding behavior depends on the encoding_type: + + - ``encoding_type="query"``: Uses ``encode_queries()`` for search queries + - ``encoding_type="document"``: Uses ``encode_documents()`` for documents + + The result is a sparse vector where keys are term indices in the + vocabulary and values are BM25 scores. + + Args: + input (TEXT): Input text string to embed. Must be non-empty after + stripping whitespace. + + Returns: + SparseVectorType: A dictionary mapping vocabulary term index to BM25 score. + Only non-zero scores are included. The dictionary is sorted by indices + (keys) in ascending order for consistent output. 
+ Example: ``{1169440797: 0.29, 2045788977: 0.70, ...}`` + + Raises: + TypeError: If ``input`` is not a string. + ValueError: If input is empty or whitespace-only. + RuntimeError: If BM25 encoding fails. + + Examples: + >>> bm25 = BM25EmbeddingFunction(language="zh", encoding_type="query") + >>> sparse_vec = bm25.embed("query text") + >>> isinstance(sparse_vec, dict) + True + >>> all(isinstance(k, int) and isinstance(v, float) for k, v in sparse_vec.items()) + True + + >>> # Verify sorted output + >>> keys = list(sparse_vec.keys()) + >>> keys == sorted(keys) + True + + >>> # Error: empty input + >>> bm25.embed(" ") + ValueError: Input text cannot be empty or whitespace only + + >>> # Error: non-string input + >>> bm25.embed(123) + TypeError: Expected 'input' to be str, got int + + Note: + - BM25 scores are relative to the vocabulary statistics + - Output dictionary is always sorted by indices for consistency + - Terms not in the vocabulary will have zero scores (not included) + - This method is cached (maxsize=10) for performance + - DashText automatically handles Chinese/English text segmentation + """ + if not isinstance(input, str): + raise TypeError(f"Expected 'input' to be str, got {type(input).__name__}") + + input = input.strip() + if not input: + raise ValueError("Input text cannot be empty or whitespace only") + + try: + # Encode based on encoding_type + if self._encoding_type == "query": + sparse_vector = self._encoder.encode_queries(input) + else: # encoding_type == "document" + sparse_vector = self._encoder.encode_documents(input) + + # DashText returns dict with int/long keys and float values + # Convert to standard format: {int: float} + sparse_dict: dict[int, float] = {} + for key, value in sparse_vector.items(): + try: + idx = int(key) + val = float(value) + if val > 0: + sparse_dict[idx] = val + except (ValueError, TypeError): + # Skip invalid entries + continue + + # Sort by indices (keys) to ensure consistent ordering + return 
dict(sorted(sparse_dict.items())) + + except Exception as e: + if isinstance(e, (TypeError, ValueError)): + raise + raise RuntimeError(f"Failed to generate BM25 embedding: {e!s}") from e diff --git a/python/zvec/extension/embedding.py b/python/zvec/extension/embedding.py deleted file mode 100644 index 1bbb0969..00000000 --- a/python/zvec/extension/embedding.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2025-present the zvec project -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import annotations - -import os -from abc import ABC, abstractmethod -from functools import lru_cache -from http import HTTPStatus -from typing import Optional, Union - -from ..tool import require_module -from ..typing import DataType - - -class DenseEmbeddingFunction(ABC): - """Abstract base class for dense vector embedding functions. - - Dense embedding functions map text to fixed-length real-valued vectors. - Subclasses must implement the ``embed()`` method. - - Args: - dimension (int): Dimensionality of the output embedding vector. - data_type (DataType, optional): Numeric type of the embedding. - Defaults to ``DataType.VECTOR_FP32``. - - Note: - This class is callable: ``embedding_func("text")`` is equivalent to - ``embedding_func.embed("text")``. 
- """ - - def __init__(self, dimension: int, data_type: DataType = DataType.VECTOR_FP32): - self._dimension = dimension - self._data_type = data_type - - @property - def dimension(self) -> int: - """int: The expected dimensionality of the embedding vector.""" - return self._dimension - - @property - def data_type(self) -> DataType: - """DataType: The numeric data type of the embedding (e.g., VECTOR_FP32).""" - return self._data_type - - @abstractmethod - def embed(self, text: str) -> list[Union[int, float]]: - """Generate a dense embedding vector for the input text. - - Args: - text (str): Input text to embed. - - Returns: - list[Union[int, float]]: A list of numbers representing the embedding. - Length must equal ``self.dimension``. - """ - raise NotImplementedError - - def __call__(self, text: str) -> list[Union[int, float]]: - return self.embed(text) - - -class SparseEmbeddingFunction(ABC): - """Abstract base class for sparse vector embedding functions. - - Sparse embedding functions map text to a dictionary of {index: weight}, - where only non-zero dimensions are stored. - - Note: - Subclasses must implement the ``embed()`` method. - """ - - @abstractmethod - def embed(self, text: str) -> dict[int, float]: - """Generate a sparse embedding for the input text. - Args: - text (str): Input text to embed. - - Returns: - dict[int, float]: Mapping from dimension index to non-zero weight. - """ - raise NotImplementedError - - -class QwenEmbeddingFunction(DenseEmbeddingFunction): - """Dense embedding function using Qwen (DashScope) Text Embedding API. - - This implementation uses the DashScope service to generate embeddings - via Qwen's text embedding models (e.g., ``text-embedding-v4``). - - Args: - dimension (int): Desired embedding dimension (e.g., 1024). - model (str, optional): DashScope embedding model name. - Defaults to ``"text-embedding-v4"``. - api_key (Optional[str], optional): DashScope API key. 
If not provided, - reads from ``DASHSCOPE_API_KEY`` environment variable. - - Raises: - ValueError: If API key is missing or input text is invalid. - - Note: - Requires the ``dashscope`` Python package. - Embedding results are cached using ``functools.lru_cache`` (maxsize=10). - """ - - def __init__( - self, - dimension: int, - model: str = "text-embedding-v4", - api_key: Optional[str] = None, - ): - super().__init__(dimension, DataType.VECTOR_FP32) - self._model = model - self._api_key = api_key or os.environ.get("DASHSCOPE_API_KEY") - if not self._api_key: - raise ValueError("DashScope API key is required") - - @property - def model(self) -> str: - """str: The DashScope embedding model name in use.""" - return self._model - - def _connection(self): - dashscope = require_module("dashscope") - dashscope.api_key = self._api_key - return dashscope - - @lru_cache(maxsize=10) - def embed(self, text: str) -> list[Union[int, float]]: - """ - Generate embedding for a given text using Qwen (via DashScope). - - Args: - text (str): Input text to embed. Must be non-empty and valid string. - - Returns: - list[Union[int, float]]: The dense embedding vector. - - Raises: - ValueError: If input is invalid or API response is malformed. - RuntimeError: If network or internal error occurs during API call. 
- """ - if not isinstance(text, str): - raise TypeError(f"Expected 'text' to be str, got {type(text).__name__}") - - text = text.strip() - if not text: - raise ValueError("Input text cannot be empty or whitespace only") - - resp = self._connection().TextEmbedding.call( - model=self.model, input=text, dimension=self.dimension, output_type="dense" - ) - - if resp.status_code != HTTPStatus.OK: - error_msg = getattr(resp, "message", "Unknown error") - error_detail = f"Status={resp.status_code}, Message={error_msg}" - raise ValueError(f"QwenEmbedding failed: {error_detail}") - - output = getattr(resp, "output", None) - if not isinstance(output, dict): - raise ValueError("Invalid response: missing or malformed 'output' field") - - embeddings = output.get("embeddings") - if not isinstance(embeddings, list): - raise ValueError( - "Invalid response: 'embeddings' field is missing or not a list" - ) - - if len(embeddings) != 1: - raise ValueError( - f"Expected 1 embedding, got {len(embeddings)}. Response: {resp}" - ) - - first_emb = embeddings[0] - if not isinstance(first_emb, dict): - raise ValueError("Invalid response: embedding item is not a dictionary") - - return list(first_emb.get("embedding")) diff --git a/python/zvec/extension/embedding_function.py b/python/zvec/extension/embedding_function.py new file mode 100644 index 00000000..a58ba239 --- /dev/null +++ b/python/zvec/extension/embedding_function.py @@ -0,0 +1,148 @@ +# Copyright 2025-present the zvec project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from abc import abstractmethod + +from typing_extensions import Protocol, runtime_checkable + +from ..common.constants import MD, DenseVectorType, SparseVectorType + + +@runtime_checkable +class DenseEmbeddingFunction(Protocol[MD]): + """Protocol for dense vector embedding functions. + + Dense embedding functions map multimodal input (text, image, or audio) to + fixed-length real-valued vectors. This is a Protocol class that defines + the interface - implementations should provide their own initialization + and properties. + + Type Parameters: + MD: The type of input data (bound to Embeddable: TEXT, IMAGE, or AUDIO). + + Note: + - This is a Protocol class - it only defines the ``embed()`` interface. + - Implementations are free to define their own ``__init__``, properties, + and additional methods as needed. + - The ``embed()`` method is the only required interface. + + Examples: + >>> # Custom text embedding implementation + >>> class MyTextEmbedding: + ... def __init__(self, dimension: int, model_name: str): + ... self.dimension = dimension + ... self.model = load_model(model_name) + ... + ... def embed(self, input: str) -> list[float]: + ... return self.model.encode(input).tolist() + + >>> # Custom image embedding implementation + >>> class MyImageEmbedding: + ... def __init__(self, dimension: int = 512): + ... self.dimension = dimension + ... self.model = load_image_model() + ... + ... def embed(self, input: Union[str, bytes, np.ndarray]) -> list[float]: + ... if isinstance(input, str): + ... image = load_image_from_path(input) + ... else: + ... image = input + ... 
return self.model.extract_features(image).tolist() + + >>> # Using built-in implementations + >>> from zvec.extension import QwenDenseEmbedding + >>> text_emb = QwenDenseEmbedding(dimension=768, api_key="sk-xxx") + >>> vector = text_emb.embed("Hello world") + """ + + @abstractmethod + def embed(self, input: MD) -> DenseVectorType: + """Generate a dense embedding vector for the input data. + + Args: + input (MD): Multimodal input data to embed. Can be: + - TEXT (str): Text string + - IMAGE (str | bytes | np.ndarray): Image file path, raw bytes, or array + - AUDIO (str | bytes | np.ndarray): Audio file path, raw bytes, or array + + Returns: + DenseVectorType: A dense vector representing the embedding. + Can be list[float], list[int], or np.ndarray. + Length should match the implementation's dimension. + """ + ... + + +@runtime_checkable +class SparseEmbeddingFunction(Protocol[MD]): + """Abstract base class for sparse vector embedding functions. + + Sparse embedding functions map multimodal input (text, image, or audio) to + a dictionary of {index: weight}, where only non-zero dimensions are stored. + You can inherit this class to create custom sparse embedding functions. + + Type Parameters: + MD: The type of input data (bound to Embeddable: TEXT, IMAGE, or AUDIO). + + Note: + Subclasses must implement the ``embed()`` method. + + Examples: + >>> # Using built-in text sparse embedding (e.g., BM25, TF-IDF) + >>> sparse_emb = SomeSparseEmbedding() + >>> vector = sparse_emb.embed("Hello world") + >>> # Returns: {0: 0.5, 42: 1.2, 100: 0.8} + + >>> # Custom BM25 sparse embedding function + >>> class MyBM25Embedding(SparseEmbeddingFunction): + ... def __init__(self, vocab_size: int = 10000): + ... self.vocab_size = vocab_size + ... self.tokenizer = MyTokenizer() + ... + ... def embed(self, input: str) -> dict[int, float]: + ... tokens = self.tokenizer.tokenize(input) + ... sparse_vector = {} + ... for token_id, weight in self._calculate_bm25(tokens): + ... 
if weight > 0: + ... sparse_vector[token_id] = weight + ... return sparse_vector + ... + ... def _calculate_bm25(self, tokens): + ... # BM25 calculation logic + ... pass + + >>> # Custom sparse image feature extractor + >>> class MySparseImageEmbedding(SparseEmbeddingFunction): + ... def embed(self, input: Union[str, bytes, np.ndarray]) -> dict[int, float]: + ... image = self._load_image(input) + ... features = self._extract_sparse_features(image) + ... return {idx: val for idx, val in enumerate(features) if val != 0} + """ + + @abstractmethod + def embed(self, input: MD) -> SparseVectorType: + """Generate a sparse embedding for the input data. + + Args: + input (MD): Multimodal input data to embed. Can be: + - TEXT (str): Text string + - IMAGE (str | bytes | np.ndarray): Image file path, raw bytes, or array + - AUDIO (str | bytes | np.ndarray): Audio file path, raw bytes, or array + + Returns: + SparseVectorType: Mapping from dimension index to non-zero weight. + Only dimensions with non-zero values are included. + """ + ... diff --git a/python/zvec/extension/multi_vector_reranker.py b/python/zvec/extension/multi_vector_reranker.py new file mode 100644 index 00000000..ba3a2363 --- /dev/null +++ b/python/zvec/extension/multi_vector_reranker.py @@ -0,0 +1,174 @@ +# Copyright 2025-present the zvec project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import heapq +import math +from collections import defaultdict +from typing import Optional + +from ..model.doc import Doc +from ..typing import MetricType +from .rerank_function import RerankFunction + + +class RrfReRanker(RerankFunction): + """Re-ranker using Reciprocal Rank Fusion (RRF) for multi-vector search. + + RRF combines results from multiple vector queries without requiring relevance scores. + It assigns higher weight to documents that appear early in multiple result lists. + + The RRF score for a document at rank ``r`` is: ``1 / (k + r + 1)``, + where ``k`` is the rank constant. + + Note: + This re-ranker is specifically designed for multi-vector scenarios where + query results from multiple vector fields need to be combined. + + Args: + topn (int, optional): Number of top documents to return. Defaults to 10. + rerank_field (Optional[str], optional): Ignored by RRF. Defaults to None. + rank_constant (int, optional): Smoothing constant ``k`` in RRF formula. + Larger values reduce the impact of early ranks. Defaults to 60. + """ + + def __init__( + self, + topn: int = 10, + rerank_field: Optional[str] = None, + rank_constant: int = 60, + ): + super().__init__(topn=topn, rerank_field=rerank_field) + self._rank_constant = rank_constant + + @property + def rank_constant(self) -> int: + return self._rank_constant + + def _rrf_score(self, rank: int) -> float: + return 1.0 / (self._rank_constant + rank + 1) + + def rerank(self, query_results: dict[str, list[Doc]]) -> list[Doc]: + """Apply Reciprocal Rank Fusion to combine multiple query results. + + Args: + query_results (dict[str, list[Doc]]): Results from one or more vector queries. + + Returns: + list[Doc]: Re-ranked documents with RRF scores in the ``score`` field. 
+ """ + rrf_scores: dict[str, float] = defaultdict(float) + id_to_doc: dict[str, Doc] = {} + + for _, query_result in query_results.items(): + for rank, doc in enumerate(query_result): + doc_id = doc.id + rrf_score = self._rrf_score(rank) + rrf_scores[doc_id] += rrf_score + if doc_id not in id_to_doc: + id_to_doc[doc_id] = doc + + top_docs = heapq.nlargest(self.topn, rrf_scores.items(), key=lambda x: x[1]) + results: list[Doc] = [] + for doc_id, rrf_score in top_docs: + doc = id_to_doc[doc_id] + new_doc = doc._replace(score=rrf_score) + results.append(new_doc) + return results + + +class WeightedReRanker(RerankFunction): + """Re-ranker that combines scores from multiple vector fields using weights. + + Each vector field's relevance score is normalized based on its metric type, + then scaled by a user-provided weight. Final scores are summed across fields. + + Note: + This re-ranker is specifically designed for multi-vector scenarios where + query results from multiple vector fields need to be combined with + configurable weights. + + Args: + topn (int, optional): Number of top documents to return. Defaults to 10. + rerank_field (Optional[str], optional): Ignored. Defaults to None. + metric (MetricType, optional): Distance metric used for score normalization. + Defaults to ``MetricType.L2``. + weights (Optional[dict[str, float]], optional): Weight per vector field. + Fields not listed use weight 1.0. Defaults to None. + + Note: + Supported metrics: L2, IP, COSINE. Scores are normalized to [0, 1]. 
+ """ + + def __init__( + self, + topn: int = 10, + rerank_field: Optional[str] = None, + metric: MetricType = MetricType.L2, + weights: Optional[dict[str, float]] = None, + ): + super().__init__(topn=topn, rerank_field=rerank_field) + self._weights = weights or {} + self._metric = metric + + @property + def weights(self) -> dict[str, float]: + """dict[str, float]: Weight mapping for vector fields.""" + return self._weights + + @property + def metric(self) -> MetricType: + """MetricType: Distance metric used for score normalization.""" + return self._metric + + def rerank(self, query_results: dict[str, list[Doc]]) -> list[Doc]: + """Combine scores from multiple vector fields using weighted sum. + + Args: + query_results (dict[str, list[Doc]]): Results per vector field. + + Returns: + list[Doc]: Re-ranked documents with combined scores in ``score`` field. + """ + weighted_scores: dict[str, float] = defaultdict(float) + id_to_doc: dict[str, Doc] = {} + + for vector_name, query_result in query_results.items(): + for _, doc in enumerate(query_result): + doc_id = doc.id + weighted_score = self._normalize_score( + doc.score, self.metric + ) * self.weights.get(vector_name, 1.0) + weighted_scores[doc_id] += weighted_score + if doc_id not in id_to_doc: + id_to_doc[doc_id] = doc + + top_docs = heapq.nlargest( + self.topn, weighted_scores.items(), key=lambda x: x[1] + ) + results: list[Doc] = [] + for doc_id, weighted_score in top_docs: + doc = id_to_doc[doc_id] + new_doc = doc._replace(score=weighted_score) + results.append(new_doc) + return results + + def _normalize_score(self, score: float, metric: MetricType) -> float: + if metric == MetricType.L2: + return 1.0 - 2 * math.atan(score) / math.pi + if metric == MetricType.IP: + return 0.5 + math.atan(score) / math.pi + if metric == MetricType.COSINE: + return 1.0 - score / 2.0 + raise ValueError("Unsupported metric type") diff --git a/python/zvec/extension/openai_embedding_function.py 
b/python/zvec/extension/openai_embedding_function.py new file mode 100644 index 00000000..03a34ede --- /dev/null +++ b/python/zvec/extension/openai_embedding_function.py @@ -0,0 +1,238 @@ +# Copyright 2025-present the zvec project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from functools import lru_cache +from typing import Optional + +from ..common.constants import TEXT, DenseVectorType +from .embedding_function import DenseEmbeddingFunction +from .openai_function import OpenAIFunctionBase + + +class OpenAIDenseEmbedding(OpenAIFunctionBase, DenseEmbeddingFunction[TEXT]): + """Dense text embedding function using OpenAI API. + + This class provides text-to-vector embedding capabilities using OpenAI's + embedding models. It inherits from ``DenseEmbeddingFunction`` and implements + dense text embedding via the OpenAI API. + + The implementation supports various OpenAI embedding models with different + dimensions and includes automatic result caching for improved performance. + + Args: + model (str, optional): OpenAI embedding model identifier. + Defaults to ``"text-embedding-3-small"``. Common options: + - ``"text-embedding-3-small"``: 1536 dims, cost-efficient, good performance + - ``"text-embedding-3-large"``: 3072 dims, highest quality + - ``"text-embedding-ada-002"``: 1536 dims, legacy model + dimension (Optional[int], optional): Desired output embedding dimension. + If ``None``, uses model's default dimension. 
For text-embedding-3 models, + you can specify custom dimensions (e.g., 256, 512, 1024, 1536). + Defaults to ``None``. + api_key (Optional[str], optional): OpenAI API authentication key. + If ``None``, reads from ``OPENAI_API_KEY`` environment variable. + Obtain your key from: https://platform.openai.com/api-keys + base_url (Optional[str], optional): Custom API base URL for OpenAI-compatible + services. Defaults to ``None`` (uses official OpenAI endpoint). + + Attributes: + dimension (int): The embedding vector dimension. + data_type (DataType): Always ``DataType.VECTOR_FP32`` for this implementation. + model (str): The OpenAI model name being used. + + Raises: + ValueError: If API key is not provided and not found in environment, + or if API returns an error response. + TypeError: If input to ``embed()`` is not a string. + RuntimeError: If network error or OpenAI service error occurs. + + Note: + - Requires Python 3.10, 3.11, or 3.12 + - Requires the ``openai`` package: ``pip install openai`` + - Embedding results are cached (LRU cache, maxsize=10) to reduce API calls + - Network connectivity to OpenAI API endpoints is required + - API usage incurs costs based on your OpenAI subscription plan + - Rate limits apply based on your OpenAI account tier + + Examples: + >>> # Basic usage with default model + >>> from zvec.extension import OpenAIDenseEmbedding + >>> import os + >>> os.environ["OPENAI_API_KEY"] = "sk-..." + >>> + >>> emb_func = OpenAIDenseEmbedding() + >>> vector = emb_func.embed("Hello, world!") + >>> len(vector) + 1536 + + >>> # Using specific model with custom dimension + >>> emb_func = OpenAIDenseEmbedding( + ... model="text-embedding-3-large", + ... dimension=1024, + ... api_key="sk-..." + ... ) + >>> vector = emb_func.embed("Machine learning is fascinating") + >>> len(vector) + 1024 + + >>> # Using with custom base URL (e.g., Azure OpenAI) + >>> emb_func = OpenAIDenseEmbedding( + ... model="text-embedding-ada-002", + ... 
api_key="your-azure-key", + ... base_url="https://your-resource.openai.azure.com/" + ... ) + >>> vector = emb_func("Natural language processing") + >>> isinstance(vector, list) + True + + >>> # Batch processing with caching benefit + >>> texts = ["First text", "Second text", "First text"] + >>> vectors = [emb_func.embed(text) for text in texts] + >>> # Third call uses cached result for "First text" + + >>> # Error handling + >>> try: + ... emb_func.embed("") # Empty string + ... except ValueError as e: + ... print(f"Error: {e}") + Error: Input text cannot be empty or whitespace only + + See Also: + - ``DenseEmbeddingFunction``: Base class for dense embeddings + - ``QwenDenseEmbedding``: Alternative using Qwen/DashScope API + - ``DefaultDenseEmbedding``: Local model without API calls + - ``SparseEmbeddingFunction``: Base class for sparse embeddings + """ + + def __init__( + self, + model: str = "text-embedding-3-small", + dimension: Optional[int] = None, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + **kwargs, + ): + """Initialize the OpenAI dense embedding function. + + Args: + model (str): OpenAI model name. Defaults to "text-embedding-3-small". + dimension (Optional[int]): Target embedding dimension or None for default. + api_key (Optional[str]): API key or None to use environment variable. + base_url (Optional[str]): Custom API base URL or None for default. + **kwargs: Additional parameters for API calls. Examples: + - ``encoding_format`` (str): Format of embeddings, "float" or "base64". + - ``user`` (str): User identifier for tracking. + + Raises: + ValueError: If API key is not provided and not in environment. 
+ """ + # Initialize base class for API connection + OpenAIFunctionBase.__init__( + self, model=model, api_key=api_key, base_url=base_url + ) + + # Store dimension configuration + self._custom_dimension = dimension + + # Determine actual dimension + if dimension is None: + # Use model default dimension + self._dimension = self._MODEL_DIMENSIONS.get(model, 1536) + else: + self._dimension = dimension + + # Store dense-specific attributes + self._extra_params = kwargs + + @property + def dimension(self) -> int: + """int: The expected dimensionality of the embedding vector.""" + return self._dimension + + @property + def extra_params(self) -> dict: + """dict: Extra parameters for model-specific customization.""" + return self._extra_params + + def __call__(self, input: TEXT) -> DenseVectorType: + """Make the embedding function callable.""" + return self.embed(input) + + @lru_cache(maxsize=10) + def embed(self, input: TEXT) -> DenseVectorType: + """Generate dense embedding vector for the input text. + + This method calls the OpenAI Embeddings API to convert input text + into a dense vector representation. Results are cached to improve + performance for repeated inputs. + + Args: + input (TEXT): Input text string to embed. Must be non-empty after + stripping whitespace. Maximum length is 8191 tokens for most models. + + Returns: + DenseVectorType: A list of floats representing the embedding vector. + Length equals ``self.dimension``. Example: + ``[0.123, -0.456, 0.789, ...]`` + + Raises: + TypeError: If ``input`` is not a string. + ValueError: If input is empty/whitespace-only, or if the API returns + an error or malformed response. + RuntimeError: If network connectivity issues or OpenAI service + errors occur. 
+ + Examples: + >>> emb = OpenAIDenseEmbedding() + >>> vector = emb.embed("Natural language processing") + >>> len(vector) + 1536 + >>> isinstance(vector[0], float) + True + + >>> # Error: empty input + >>> emb.embed(" ") + ValueError: Input text cannot be empty or whitespace only + + >>> # Error: non-string input + >>> emb.embed(123) + TypeError: Expected 'input' to be str, got int + + Note: + - This method is cached (maxsize=10). Identical inputs return cached results. + - The cache is based on exact string match (case-sensitive). + - Consider pre-processing text (lowercasing, normalization) for better caching. + """ + if not isinstance(input, TEXT): + raise TypeError(f"Expected 'input' to be str, got {type(input).__name__}") + + input = input.strip() + if not input: + raise ValueError("Input text cannot be empty or whitespace only") + + # Call API + embedding_vector = self._call_text_embedding_api( + input=input, + dimension=self._custom_dimension, + ) + + # Verify dimension + if len(embedding_vector) != self.dimension: + raise ValueError( + f"Dimension mismatch: expected {self.dimension}, " + f"got {len(embedding_vector)}" + ) + + return embedding_vector diff --git a/python/zvec/extension/openai_function.py b/python/zvec/extension/openai_function.py new file mode 100644 index 00000000..d3f4de2d --- /dev/null +++ b/python/zvec/extension/openai_function.py @@ -0,0 +1,149 @@ +# Copyright 2025-present the zvec project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import os +from typing import ClassVar, Optional + +from ..common.constants import TEXT +from ..tool import require_module + + +class OpenAIFunctionBase: + """Base class for OpenAI functions. + + This base class provides common functionality for calling OpenAI APIs + and handling responses. It supports embeddings (dense) operations. + + This class is not meant to be used directly. Use concrete implementations: + - ``OpenAIDenseEmbedding`` for dense embeddings + + Args: + model (str): OpenAI model identifier. + api_key (Optional[str]): OpenAI API authentication key. + base_url (Optional[str]): Custom API base URL. + + Note: + - This is an internal base class for code reuse across OpenAI features + - Subclasses should inherit from appropriate Protocol + - Provides unified API connection and response handling + """ + + # Model default dimensions + _MODEL_DIMENSIONS: ClassVar[dict[str, int]] = { + "text-embedding-3-small": 1536, + "text-embedding-3-large": 3072, + "text-embedding-ada-002": 1536, + } + + def __init__( + self, + model: str, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + ): + """Initialize the base OpenAI functionality. + + Args: + model (str): OpenAI model name. + api_key (Optional[str]): API key or None to use environment variable. + base_url (Optional[str]): Custom API base URL or None for default. + + Raises: + ValueError: If API key is not provided and not in environment. + """ + self._model = model + self._api_key = api_key or os.environ.get("OPENAI_API_KEY") + self._base_url = base_url + + if not self._api_key: + raise ValueError( + "OpenAI API key is required. Please provide 'api_key' parameter " + "or set the 'OPENAI_API_KEY' environment variable." + ) + + @property + def model(self) -> str: + """str: The OpenAI model name currently in use.""" + return self._model + + def _get_client(self): + """Get OpenAI client instance. + + Returns: + OpenAI: Configured OpenAI client. 
+ + Raises: + ImportError: If openai package is not installed. + """ + openai = require_module("openai") + + if self._base_url: + return openai.OpenAI(api_key=self._api_key, base_url=self._base_url) + return openai.OpenAI(api_key=self._api_key) + + def _call_text_embedding_api( + self, + input: TEXT, + dimension: Optional[int] = None, + ) -> list: + """Call OpenAI Embeddings API. + + Args: + input (TEXT): Input text to embed. + dimension (Optional[int]): Target dimension (for models that support it). + + Returns: + list: Embedding vector as list of floats. + + Raises: + RuntimeError: If API call fails. + ValueError: If API returns error response. + """ + try: + client = self._get_client() + + # Prepare embedding parameters + params = {"model": self.model, "input": input} + + # Add dimension parameter for models that support it + if dimension is not None: + params["dimensions"] = dimension + + # Call OpenAI API + response = client.embeddings.create(**params) + + except Exception as e: + # Check if it's an OpenAI API error + openai = require_module("openai") + if isinstance(e, (openai.APIError, openai.APIConnectionError)): + raise RuntimeError(f"Failed to call OpenAI API: {e!s}") from e + raise RuntimeError(f"Unexpected error during API call: {e!s}") from e + + # Extract embedding from response + try: + if not response.data: + raise ValueError("Invalid API response: no embedding data returned") + + embedding_vector = response.data[0].embedding + + if not isinstance(embedding_vector, list): + raise ValueError( + "Invalid API response: embedding is not a list of numbers" + ) + + return embedding_vector + + except (AttributeError, IndexError, TypeError) as e: + raise ValueError(f"Failed to parse API response: {e!s}") from e diff --git a/python/zvec/extension/qwen_embedding_function.py b/python/zvec/extension/qwen_embedding_function.py new file mode 100644 index 00000000..7bdb69b5 --- /dev/null +++ b/python/zvec/extension/qwen_embedding_function.py @@ -0,0 +1,537 @@ +# 
Copyright 2025-present the zvec project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from functools import lru_cache +from typing import Optional + +from ..common.constants import TEXT, DenseVectorType, SparseVectorType +from .embedding_function import DenseEmbeddingFunction, SparseEmbeddingFunction +from .qwen_function import QwenFunctionBase + + +class QwenDenseEmbedding(QwenFunctionBase, DenseEmbeddingFunction[TEXT]): + """Dense text embedding function using Qwen (DashScope) API. + + This class provides text-to-vector embedding capabilities using Alibaba Cloud's + DashScope service and Qwen embedding models. It inherits from + ``DenseEmbeddingFunction`` and implements dense text embedding. + + The implementation supports various Qwen embedding models with configurable + dimensions and includes automatic result caching for improved performance. + + Args: + dimension (int): Desired output embedding dimension. Common values: + - 512: Balanced performance and accuracy + - 1024: Higher accuracy, larger storage + - 1536: Maximum accuracy for supported models + model (str, optional): DashScope embedding model identifier. + Defaults to ``"text-embedding-v4"``. Other options include: + - ``"text-embedding-v3"`` + - ``"text-embedding-v2"`` + - ``"text-embedding-v1"`` + api_key (Optional[str], optional): DashScope API authentication key. + If ``None``, reads from ``DASHSCOPE_API_KEY`` environment variable. 
+ Obtain your key from: https://dashscope.console.aliyun.com/ + **kwargs: Additional DashScope API parameters. Supported options: + - ``text_type`` (str): Specifies the text role in retrieval tasks. + Options: ``"query"`` (search query) or ``"document"`` (indexed content). + This parameter optimizes embeddings for asymmetric search scenarios. + + Reference: https://help.aliyun.com/zh/model-studio/text-embedding-synchronous-api + + Attributes: + dimension (int): The embedding vector dimension. + data_type (DataType): Always ``DataType.VECTOR_FP32`` for this implementation. + model (str): The DashScope model name being used. + + Raises: + ValueError: If API key is not provided and not found in environment, + or if API returns an error response. + TypeError: If input to ``embed()`` is not a string. + RuntimeError: If network error or DashScope service error occurs. + + Note: + - Requires Python 3.10, 3.11, or 3.12 + - Requires the ``dashscope`` package: ``pip install dashscope`` + - Embedding results are cached (LRU cache, maxsize=10) to reduce API calls + - Network connectivity to DashScope API endpoints is required + - API usage may incur costs based on your DashScope subscription plan + + **Parameter Guidelines:** + + - Use ``text_type="query"`` for search queries and ``text_type="document"`` + for indexed content to optimize asymmetric retrieval tasks. + - For detailed API specifications and parameter usage, refer to: + https://help.aliyun.com/zh/model-studio/text-embedding-synchronous-api + + Examples: + >>> # Basic usage with default model + >>> from zvec.extension import QwenDenseEmbedding + >>> import os + >>> os.environ["DASHSCOPE_API_KEY"] = "your-api-key" + >>> + >>> emb_func = QwenDenseEmbedding(dimension=1024) + >>> vector = emb_func.embed("Hello, world!") + >>> len(vector) + 1024 + + >>> # Using specific model with explicit API key + >>> emb_func = QwenDenseEmbedding( + ... dimension=512, + ... model="text-embedding-v3", + ... api_key="sk-xxxxx" + ... 
) + >>> vector = emb_func("Machine learning is fascinating") + >>> isinstance(vector, list) + True + + >>> # Using with custom parameters (text_type) + >>> # For search queries - optimize for query-document matching + >>> emb_func = QwenDenseEmbedding( + ... dimension=1024, + ... text_type="query" + ... ) + >>> query_vector = emb_func.embed("What is machine learning?") + >>> + >>> # For document embeddings - optimize for being matched by queries + >>> doc_emb_func = QwenDenseEmbedding( + ... dimension=1024, + ... text_type="document" + ... ) + >>> doc_vector = doc_emb_func.embed( + ... "Machine learning is a subset of artificial intelligence..." + ... ) + + >>> # Batch processing with caching benefit + >>> texts = ["First text", "Second text", "First text"] + >>> vectors = [emb_func.embed(text) for text in texts] + >>> # Third call uses cached result for "First text" + + >>> # Error handling + >>> try: + ... emb_func.embed("") # Empty string + ... except ValueError as e: + ... print(f"Error: {e}") + Error: Input text cannot be empty or whitespace only + + See Also: + - ``DenseEmbeddingFunction``: Base class for dense embeddings + - ``SparseEmbeddingFunction``: Base class for sparse embeddings + """ + + def __init__( + self, + dimension: int, + model: str = "text-embedding-v4", + api_key: Optional[str] = None, + **kwargs, + ): + """Initialize the Qwen dense embedding function. + + Args: + dimension (int): Target embedding dimension. + model (str): DashScope model name. Defaults to "text-embedding-v4". + api_key (Optional[str]): API key or None to use environment variable. + **kwargs: Additional DashScope API parameters. Supported options: + - ``text_type`` (str): Text role in asymmetric retrieval. + * ``"query"``: Optimize for search queries (short, question-like). + * ``"document"``: Optimize for indexed documents (longer content). + Using appropriate text_type improves retrieval accuracy by + optimizing the embedding space for query-document matching. 
+ + For detailed API documentation, see: + https://help.aliyun.com/zh/model-studio/text-embedding-synchronous-api + + Raises: + ValueError: If API key is not provided and not in environment. + """ + # Initialize base class for API connection + QwenFunctionBase.__init__(self, model=model, api_key=api_key) + + # Store dense-specific attributes + self._dimension = dimension + self._extra_params = kwargs + + @property + def dimension(self) -> int: + """int: The expected dimensionality of the embedding vector.""" + return self._dimension + + @property + def extra_params(self) -> dict: + """dict: Extra parameters for model-specific customization.""" + return self._extra_params + + def __call__(self, input: TEXT) -> DenseVectorType: + """Make the embedding function callable.""" + return self.embed(input) + + @lru_cache(maxsize=10) + def embed(self, input: TEXT) -> DenseVectorType: + """Generate dense embedding vector for the input text. + + This method calls the DashScope TextEmbedding API to convert input text + into a dense vector representation. Results are cached to improve + performance for repeated inputs. + + Args: + input (TEXT): Input text string to embed. Must be non-empty after + stripping whitespace. Maximum length depends on the model used + (typically 2048-8192 tokens). + + Returns: + DenseVectorType: A list of floats representing the embedding vector. + Length equals ``self.dimension``. Example: + ``[0.123, -0.456, 0.789, ...]`` + + Raises: + TypeError: If ``input`` is not a string. + ValueError: If input is empty/whitespace-only, or if the API returns + an error or malformed response. + RuntimeError: If network connectivity issues or DashScope service + errors occur. 
+ + Examples: + >>> emb = QwenDenseEmbedding(dimension=1024) + >>> vector = emb.embed("Natural language processing") + >>> len(vector) + 1024 + >>> isinstance(vector[0], float) + True + + >>> # Error: empty input + >>> emb.embed(" ") + ValueError: Input text cannot be empty or whitespace only + + >>> # Error: non-string input + >>> emb.embed(123) + TypeError: Expected 'input' to be str, got int + + Note: + - This method is cached (maxsize=10). Identical inputs return cached results. + - The cache is based on exact string match (case-sensitive). + - Consider pre-processing text (lowercasing, normalization) for better caching. + """ + if not isinstance(input, TEXT): + raise TypeError(f"Expected 'input' to be str, got {type(input).__name__}") + + input = input.strip() + if not input: + raise ValueError("Input text cannot be empty or whitespace only") + + # Call API with dense output type + output = self._call_text_embedding_api( + input=input, + dimension=self.dimension, + output_type="dense", + text_type=self.extra_params.get("text_type"), + ) + + embeddings = output.get("embeddings") + if not isinstance(embeddings, list): + raise ValueError( + "Invalid API response: 'embeddings' field is missing or not a list" + ) + + if len(embeddings) != 1: + raise ValueError( + f"Expected exactly 1 embedding in response, got {len(embeddings)}" + ) + + first_emb = embeddings[0] + if not isinstance(first_emb, dict): + raise ValueError("Invalid API response: embedding item is not a dictionary") + + embedding_vector = first_emb.get("embedding") + if not isinstance(embedding_vector, list): + raise ValueError( + "Invalid API response: 'embedding' field is missing or not a list" + ) + + if len(embedding_vector) != self.dimension: + raise ValueError( + f"Dimension mismatch: expected {self.dimension}, " + f"got {len(embedding_vector)}" + ) + + return list(embedding_vector) + + +class QwenSparseEmbedding(QwenFunctionBase, SparseEmbeddingFunction[TEXT]): + """Sparse text embedding function 
using Qwen (DashScope) API. + + This class provides text-to-sparse-vector embedding capabilities using + Alibaba Cloud's DashScope service and Qwen embedding models. It generates + sparse keyword-weighted vectors suitable for lexical matching and BM25-style + retrieval scenarios. + + Sparse embeddings are particularly useful for: + - Keyword-based search and exact matching + - Hybrid retrieval (combining with dense embeddings) + - Interpretable search results (weights show term importance) + + Args: + dimension (int): Desired output embedding dimension. Common values: + - 512: Balanced performance and accuracy + - 1024: Higher accuracy, larger storage + - 1536: Maximum accuracy for supported models + model (str, optional): DashScope embedding model identifier. + Defaults to ``"text-embedding-v4"``. Other options include: + - ``"text-embedding-v3"`` + - ``"text-embedding-v2"`` + api_key (Optional[str], optional): DashScope API authentication key. + If ``None``, reads from ``DASHSCOPE_API_KEY`` environment variable. + Obtain your key from: https://dashscope.console.aliyun.com/ + **kwargs: Additional DashScope API parameters. Supported options: + - ``encoding_type`` (Literal["query", "document"]): Encoding type. + * ``"query"``: Optimize for search queries (default). + * ``"document"``: Optimize for indexed documents. + This distinction is important for asymmetric retrieval tasks. + + Attributes: + model (str): The DashScope model name being used. + encoding_type (str): The encoding type ("query" or "document"). + + Raises: + ValueError: If API key is not provided and not found in environment, + or if API returns an error response. + TypeError: If input to ``embed()`` is not a string. + RuntimeError: If network error or DashScope service error occurs. 
+ + Note: + - Requires Python 3.10, 3.11, or 3.12 + - Requires the ``dashscope`` package: ``pip install dashscope`` + - Embedding results are cached (LRU cache, maxsize=10) to reduce API calls + - Network connectivity to DashScope API endpoints is required + - API usage may incur costs based on your DashScope subscription plan + - Sparse vectors have only non-zero dimensions stored as dict + - Output is sorted by indices (keys) in ascending order + + **Parameter Guidelines:** + + - Use ``encoding_type="query"`` for search queries and + ``encoding_type="document"`` for indexed content to optimize + asymmetric retrieval tasks. + - For detailed API specifications, refer to: + https://help.aliyun.com/zh/model-studio/text-embedding-synchronous-api + + Examples: + >>> # Basic usage for query embedding + >>> from zvec.extension import QwenSparseEmbedding + >>> import os + >>> os.environ["DASHSCOPE_API_KEY"] = "your-api-key" + >>> + >>> query_emb = QwenSparseEmbedding(dimension=1024, encoding_type="query") + >>> query_vec = query_emb.embed("machine learning") + >>> type(query_vec) + + >>> len(query_vec) # Only non-zero dimensions + 156 + + >>> # Document embedding + >>> doc_emb = QwenSparseEmbedding(dimension=1024, encoding_type="document") + >>> doc_vec = doc_emb.embed("Machine learning is a subset of AI") + >>> isinstance(doc_vec, dict) + True + + >>> # Asymmetric retrieval example + >>> query_vec = query_emb.embed("what causes aging fast") + >>> doc_vec = doc_emb.embed( + ... "UV-A light causes tanning, skin aging, and cataracts..." + ... ) + >>> + >>> # Calculate similarity (dot product for sparse vectors) + >>> similarity = sum( + ... query_vec.get(k, 0) * doc_vec.get(k, 0) + ... for k in set(query_vec) | set(doc_vec) + ... 
) + + >>> # Output is sorted by indices + >>> list(query_vec.items())[:5] # First 5 dimensions (by index) + [(10, 0.45), (23, 0.87), (56, 0.32), (89, 1.12), (120, 0.65)] + + >>> # Hybrid retrieval (combining dense + sparse) + >>> from zvec.extension import QwenDenseEmbedding + >>> dense_emb = QwenDenseEmbedding(dimension=1024) + >>> sparse_emb = QwenSparseEmbedding(dimension=1024) + >>> + >>> query = "deep learning neural networks" + >>> dense_vec = dense_emb.embed(query) # [0.1, -0.3, 0.5, ...] + >>> sparse_vec = sparse_emb.embed(query) # {12: 0.8, 45: 1.2, ...} + + >>> # Error handling + >>> try: + ... sparse_emb.embed("") # Empty string + ... except ValueError as e: + ... print(f"Error: {e}") + Error: Input text cannot be empty or whitespace only + + See Also: + - ``SparseEmbeddingFunction``: Base class for sparse embeddings + - ``QwenDenseEmbedding``: Dense embedding using Qwen API + - ``DefaultSparseEmbedding``: Sparse embedding with SPLADE model + """ + + def __init__( + self, + dimension: int, + model: str = "text-embedding-v4", + api_key: Optional[str] = None, + **kwargs, + ): + """Initialize the Qwen sparse embedding function. + + Args: + dimension (int): Target embedding dimension. + model (str): DashScope model name. Defaults to "text-embedding-v4". + api_key (Optional[str]): API key or None to use environment variable. + **kwargs: Additional DashScope API parameters. Supported options: + - ``encoding_type`` (Literal["query", "document"]): Encoding type. + * ``"query"``: Optimize for search queries (default). + * ``"document"``: Optimize for indexed documents. + This distinction is important for asymmetric retrieval tasks. + + Raises: + ValueError: If API key is not provided and not in environment. 
+ """ + # Initialize base class for API connection + QwenFunctionBase.__init__(self, model=model, api_key=api_key) + + self._dimension = dimension + self._extra_params = kwargs + + @property + def extra_params(self) -> dict: + """dict: Extra parameters for model-specific customization.""" + return self._extra_params + + def __call__(self, input: TEXT) -> SparseVectorType: + """Make the embedding function callable.""" + return self.embed(input) + + @lru_cache(maxsize=10) + def embed(self, input: TEXT) -> SparseVectorType: + """Generate sparse embedding vector for the input text. + + This method calls the DashScope TextEmbedding API with sparse output type + to convert input text into a sparse vector representation. The result is + a dictionary where keys are dimension indices and values are importance + weights (only non-zero values included). + + The embedding is optimized based on the ``encoding_type`` specified during + initialization: "query" for search queries or "document" for indexed content. + + Args: + input (TEXT): Input text string to embed. Must be non-empty after + stripping whitespace. Maximum length depends on the model used + (typically 2048-8192 tokens). + + Returns: + SparseVectorType: A dictionary mapping dimension index to weight. + Only non-zero dimensions are included. The dictionary is sorted + by indices (keys) in ascending order for consistent output. + Example: ``{10: 0.5, 245: 0.8, 1023: 1.2, 5678: 0.5}`` + + Raises: + TypeError: If ``input`` is not a string. + ValueError: If input is empty/whitespace-only, or if the API returns + an error or malformed response. + RuntimeError: If network connectivity issues or DashScope service + errors occur. 
+ + Examples: + >>> emb = QwenSparseEmbedding(dimension=1024, encoding_type="query") + >>> sparse_vec = emb.embed("machine learning") + >>> isinstance(sparse_vec, dict) + True + >>> + >>> # Verify sorted output + >>> keys = list(sparse_vec.keys()) + >>> keys == sorted(keys) + True + + >>> # Error: empty input + >>> emb.embed(" ") + ValueError: Input text cannot be empty or whitespace only + + >>> # Error: non-string input + >>> emb.embed(123) + TypeError: Expected 'input' to be str, got int + + Note: + - This method is cached (maxsize=10). Identical inputs return cached results. + - The cache is based on exact string match (case-sensitive). + - Output dictionary is always sorted by indices for consistency. + """ + if not isinstance(input, TEXT): + raise TypeError(f"Expected 'input' to be str, got {type(input).__name__}") + + input = input.strip() + if not input: + raise ValueError("Input text cannot be empty or whitespace only") + + # Call API with sparse output type + output = self._call_text_embedding_api( + input=input, + dimension=self._dimension, + output_type="sparse", + text_type=self.extra_params.get("encoding_type", "query"), + ) + + embeddings = output.get("embeddings") + if not isinstance(embeddings, list): + raise ValueError( + "Invalid API response: 'embeddings' field is missing or not a list" + ) + + if len(embeddings) != 1: + raise ValueError( + f"Expected exactly 1 embedding in response, got {len(embeddings)}" + ) + + first_emb = embeddings[0] + if not isinstance(first_emb, dict): + raise ValueError("Invalid API response: embedding item is not a dictionary") + + sparse_embedding = first_emb.get("sparse_embedding") + if not isinstance(sparse_embedding, list): + raise ValueError( + "Invalid API response: 'sparse_embedding' field is missing or not a list" + ) + + # Parse sparse embedding: convert array of {index, value, token} to dict + sparse_dict = {} + for item in sparse_embedding: + if not isinstance(item, dict): + raise ValueError( + "Invalid API 
response: sparse_embedding item is not a dictionary" + ) + + index = item.get("index") + value = item.get("value") + + if index is None or value is None: + raise ValueError( + "Invalid API response: sparse_embedding item missing 'index' or 'value'" + ) + + # Convert to int and float, filter positive values + idx = int(index) + val = float(value) + if val > 0: + sparse_dict[idx] = val + + # Sort by indices (keys) to ensure consistent ordering + return dict(sorted(sparse_dict.items())) diff --git a/python/zvec/extension/qwen_function.py b/python/zvec/extension/qwen_function.py new file mode 100644 index 00000000..b15ee4b1 --- /dev/null +++ b/python/zvec/extension/qwen_function.py @@ -0,0 +1,186 @@ +# Copyright 2025-present the zvec project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import os +from http import HTTPStatus +from typing import Optional + +from ..common.constants import TEXT +from ..tool import require_module + + +class QwenFunctionBase: + """Base class for Qwen (DashScope) functions. + + This base class provides common functionality for calling DashScope APIs + and handling responses. It supports embeddings (dense and sparse) and + re-ranking operations. + + This class is not meant to be used directly. 
Use concrete implementations: + - ``QwenDenseEmbedding`` for dense embeddings + - ``QwenSparseEmbedding`` for sparse embeddings + - ``QwenReRanker`` for semantic re-ranking + + Args: + model (str): DashScope model identifier. + api_key (Optional[str]): DashScope API authentication key. + + Note: + - This is an internal base class for code reuse across Qwen features + - Subclasses should inherit from appropriate Protocol/ABC + - Provides unified API connection and response handling + """ + + def __init__( + self, + model: str, + api_key: Optional[str] = None, + ): + """Initialize the base Qwen embedding functionality. + + Args: + model (str): DashScope model name. + api_key (Optional[str]): API key or None to use environment variable. + + Raises: + ValueError: If API key is not provided and not in environment. + """ + self._model = model + self._api_key = api_key or os.environ.get("DASHSCOPE_API_KEY") + if not self._api_key: + raise ValueError( + "DashScope API key is required. Please provide 'api_key' parameter " + "or set the 'DASHSCOPE_API_KEY' environment variable." + ) + + @property + def model(self) -> str: + """str: The DashScope embedding model name currently in use.""" + return self._model + + def _get_connection(self): + """Establish connection to DashScope API. + + Returns: + module: The dashscope module with API key configured. + + Raises: + ImportError: If dashscope package is not installed. + """ + dashscope = require_module("dashscope") + dashscope.api_key = self._api_key + return dashscope + + def _call_text_embedding_api( + self, + input: TEXT, + dimension: int, + output_type: str, + text_type: Optional[str] = None, + ) -> dict: + """Call DashScope TextEmbedding API. + + Args: + input (TEXT): Input text to embed. + dimension (int): Target embedding dimension. + output_type (str): Output type ("dense" or "sparse"). + text_type (Optional[str]): Text type ("query" or "document"). + + Returns: + dict: API response output field. 
+ + Raises: + RuntimeError: If API call fails. + ValueError: If API returns error response. + """ + try: + # Prepare API call parameters + call_params = { + "model": self.model, + "input": input, + "dimension": dimension, + "output_type": output_type, + } + + # Add optional text_type parameter if provided + if text_type is not None: + call_params["text_type"] = text_type + + resp = self._get_connection().TextEmbedding.call(**call_params) + except Exception as e: + raise RuntimeError(f"Failed to call DashScope API: {e!s}") from e + + if resp.status_code != HTTPStatus.OK: + error_msg = getattr(resp, "message", "Unknown error") + error_code = getattr(resp, "code", "N/A") + raise ValueError( + f"DashScope API error: [Code={error_code}, " + f"Status={resp.status_code}] {error_msg}" + ) + + output = getattr(resp, "output", None) + if not isinstance(output, dict): + raise ValueError( + "Invalid API response: missing or malformed 'output' field" + ) + + return output + + def _call_rerank_api( + self, + query: str, + documents: list[str], + top_n: int, + ) -> dict: + """Call DashScope TextReRank API. + + Args: + query (str): Query text for semantic matching. + documents (list[str]): List of document texts to re-rank. + top_n (int): Maximum number of documents to return. + + Returns: + dict: API response output field containing re-ranked results. + + Raises: + RuntimeError: If API call fails. + ValueError: If API returns error response. 
+ """ + try: + resp = self._get_connection().TextReRank.call( + model=self.model, + query=query, + documents=documents, + top_n=top_n, + return_documents=False, + ) + except Exception as e: + raise RuntimeError(f"Failed to call DashScope API: {e!s}") from e + + if resp.status_code != HTTPStatus.OK: + error_msg = getattr(resp, "message", "Unknown error") + error_code = getattr(resp, "code", "N/A") + raise ValueError( + f"DashScope API error: [Code={error_code}, " + f"Status={resp.status_code}] {error_msg}" + ) + + output = getattr(resp, "output", None) + if not isinstance(output, dict): + raise ValueError( + "Invalid API response: missing or malformed 'output' field" + ) + + return output diff --git a/python/zvec/extension/qwen_rerank_function.py b/python/zvec/extension/qwen_rerank_function.py new file mode 100644 index 00000000..9b4a66b3 --- /dev/null +++ b/python/zvec/extension/qwen_rerank_function.py @@ -0,0 +1,162 @@ +# Copyright 2025-present the zvec project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from typing import Optional + +from ..model.doc import Doc +from .qwen_function import QwenFunctionBase +from .rerank_function import RerankFunction + + +class QwenReRanker(QwenFunctionBase, RerankFunction): + """Re-ranker using Qwen (DashScope) cross-encoder API for semantic re-ranking. + + This re-ranker leverages DashScope's TextReRank service to perform + cross-encoder style re-ranking. 
It sends query and document pairs to the + API and receives relevance scores based on deep semantic understanding. + + The re-ranker is suitable for single-vector or multi-vector search scenarios + where semantic relevance to a specific query is required. + + Args: + query (str): Query text for semantic re-ranking. **Required**. + topn (int, optional): Maximum number of documents to return after re-ranking. + Defaults to 10. + rerank_field (str): Document field name to use as re-ranking input text. + **Required** (e.g., "content", "title", "body"). + model (str, optional): DashScope re-ranking model identifier. + Defaults to ``"gte-rerank-v2"``. + api_key (Optional[str], optional): DashScope API authentication key. + If not provided, reads from ``DASHSCOPE_API_KEY`` environment variable. + + Raises: + ValueError: If ``query`` is empty/None, ``rerank_field`` is None, + or API key is not available. + + Note: + - Requires ``dashscope`` Python package installed + - Documents without valid content in ``rerank_field`` are skipped + - API rate limits and quotas apply per DashScope subscription + + Example: + >>> reranker = QwenReRanker( + ... query="machine learning algorithms", + ... topn=5, + ... rerank_field="content", + ... model="gte-rerank-v2", + ... api_key="your-api-key" + ... ) + >>> # Use in collection.query(reranker=reranker) + """ + + def __init__( + self, + query: Optional[str] = None, + topn: int = 10, + rerank_field: Optional[str] = None, + model: str = "gte-rerank-v2", + api_key: Optional[str] = None, + ): + """Initialize QwenReRanker with query and configuration. + + Args: + query (Optional[str]): Query text for semantic matching. Required. + topn (int): Number of top results to return. + rerank_field (Optional[str]): Document field for re-ranking input. + model (str): DashScope model name. + api_key (Optional[str]): API key or None to use environment variable. + + Raises: + ValueError: If query is empty or API key is unavailable. 
+ """ + QwenFunctionBase.__init__(self, model=model, api_key=api_key) + RerankFunction.__init__(self, topn=topn, rerank_field=rerank_field) + + if not query: + raise ValueError("Query is required for QwenReRanker") + self._query = query + + @property + def query(self) -> str: + """str: Query text used for semantic re-ranking.""" + return self._query + + def rerank(self, query_results: dict[str, list[Doc]]) -> list[Doc]: + """Re-rank documents using Qwen's TextReRank API. + + Sends document texts to DashScope TextReRank service along with the query. + Returns documents sorted by relevance scores from the cross-encoder model. + + Args: + query_results (dict[str, list[Doc]]): Mapping from vector field names + to lists of retrieved documents. Documents from all fields are + deduplicated and re-ranked together. + + Returns: + list[Doc]: Re-ranked documents (up to ``topn``) with updated ``score`` + fields containing relevance scores from the API. + + Raises: + ValueError: If no valid documents are found or API call fails. 
+ + Note: + - Duplicate documents (same ID) across fields are processed once + - Documents with empty/missing ``rerank_field`` content are skipped + - Returned scores are relevance scores from the cross-encoder model + """ + if not query_results: + return [] + + # Collect and deduplicate documents + id_to_doc: dict[str, Doc] = {} + doc_ids: list[str] = [] + contents: list[str] = [] + + for _, query_result in query_results.items(): + for doc in query_result: + doc_id = doc.id + if doc_id in id_to_doc: + continue + + # Extract text content from specified field + field_value = doc.field(self.rerank_field) + rank_content = str(field_value).strip() if field_value else "" + if not rank_content: + continue + + id_to_doc[doc_id] = doc + doc_ids.append(doc_id) + contents.append(rank_content) + + if not contents: + raise ValueError("No documents to rerank") + + # Call DashScope TextReRank API + output = self._call_rerank_api( + query=self.query, + documents=contents, + top_n=self.topn, + ) + + # Build result list with updated scores + results: list[Doc] = [] + for item in output["results"]: + idx = item["index"] + doc_id = doc_ids[idx] + doc = id_to_doc[doc_id] + new_doc = doc._replace(score=item["relevance_score"]) + results.append(new_doc) + + return results diff --git a/python/zvec/extension/rerank.py b/python/zvec/extension/rerank.py deleted file mode 100644 index 021f6ed4..00000000 --- a/python/zvec/extension/rerank.py +++ /dev/null @@ -1,343 +0,0 @@ -# Copyright 2025-present the zvec project -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import annotations - -import heapq -import math -import os -from abc import ABC, abstractmethod -from collections import defaultdict -from http import HTTPStatus -from typing import Optional - -from ..model.doc import Doc -from ..tool import require_module -from ..typing import MetricType - - -class ReRanker(ABC): - """Abstract base class for re-ranking search results. - - Re-rankers refine the output of one or more vector queries by applying - a secondary scoring strategy. They are used in the ``query()`` method of - ``Collection`` via the ``reranker`` parameter. - - Args: - query (Optional[str], optional): Query text used for re-ranking. - Required for LLM-based re-rankers. Defaults to None. - topn (int, optional): Number of top documents to return after re-ranking. - Defaults to 10. - rerank_field (Optional[str], optional): Field name used as input for - re-ranking (e.g., document title or body). Defaults to None. - - Note: - Subclasses must implement the ``rerank()`` method. - """ - - def __init__( - self, - query: Optional[str] = None, - topn: int = 10, - rerank_field: Optional[str] = None, - ): - self._query = query - self._topn = topn - self._rerank_field = rerank_field - - @property - def topn(self) -> int: - """int: Number of top documents to return after re-ranking.""" - return self._topn - - @property - def query(self) -> str: - """str: Query text used for re-ranking.""" - return self._query - - @property - def rerank_field(self) -> Optional[str]: - """Optional[str]: Field name used as re-ranking input.""" - return self._rerank_field - - @abstractmethod - def rerank(self, query_results: dict[str, list[Doc]]) -> list[Doc]: - """Re-rank documents from one or more vector queries. - - Args: - query_results (dict[str, list[Doc]]): Mapping from vector field name - to list of retrieved documents (sorted by relevance). 
- - Returns: - list[Doc]: Re-ranked list of documents (length ≤ ``topn``), - with updated ``score`` fields. - """ - raise NotImplementedError - - -class RrfReRanker(ReRanker): - """Re-ranker using Reciprocal Rank Fusion (RRF). - - RRF combines results from multiple queries without requiring relevance scores. - It assigns higher weight to documents that appear early in multiple result lists. - - The RRF score for a document at rank ``r`` is: ``1 / (k + r + 1)``, - where ``k`` is the rank constant. - - Args: - query (Optional[str], optional): Ignored by RRF. Defaults to None. - topn (int, optional): Number of top documents to return. Defaults to 10. - rerank_field (Optional[str], optional): Ignored by RRF. Defaults to None. - rank_constant (int, optional): Smoothing constant ``k`` in RRF formula. - Larger values reduce the impact of early ranks. Defaults to 60. - """ - - def __init__( - self, - query: Optional[str] = None, - topn: int = 10, - rerank_field: Optional[str] = None, - rank_constant: int = 60, - ): - super().__init__(query, topn, rerank_field) - self._rank_constant = rank_constant - - @property - def rank_constant(self) -> int: - return self._rank_constant - - def _rrf_score(self, rank: int): - return 1.0 / (self._rank_constant + rank + 1) - - def rerank(self, query_results: dict[str, list[Doc]]) -> list[Doc]: - """Apply Reciprocal Rank Fusion to combine multiple query results. - - Args: - query_results (dict[str, list[Doc]]): Results from one or more vector queries. - - Returns: - list[Doc]: Re-ranked documents with RRF scores in the ``score`` field. 
- """ - rrf_scores: dict[str, float] = defaultdict(float) - id_to_doc: dict[str, Doc] = {} - - for _, query_result in query_results.items(): - for rank, doc in enumerate(query_result): - doc_id = doc.id - rrf_score = self._rrf_score(rank) - rrf_scores[doc_id] += rrf_score - if doc_id not in id_to_doc: - id_to_doc[doc_id] = doc - - top_docs = heapq.nlargest(self.topn, rrf_scores.items(), key=lambda x: x[1]) - results = [] - for doc_id, rrf_score in top_docs: - doc = id_to_doc[doc_id] - new_doc = doc._replace(score=rrf_score) - results.append(new_doc) - return results - - -class WeightedReRanker(ReRanker): - """Re-ranker that combines scores from multiple vector fields using weights. - - Each vector field's relevance score is normalized based on its metric type, - then scaled by a user-provided weight. Final scores are summed across fields. - - Args: - query (Optional[str], optional): Ignored. Defaults to None. - topn (int, optional): Number of top documents to return. Defaults to 10. - rerank_field (Optional[str], optional): Ignored. Defaults to None. - metric (MetricType, optional): Distance metric used for score normalization. - Defaults to ``MetricType.L2``. - weights (Optional[dict[str, float]], optional): Weight per vector field. - Fields not listed use weight 1.0. Defaults to None. - - Note: - Supported metrics: L2, IP, COSINE. Scores are normalized to [0, 1]. 
- """ - - def __init__( - self, - query: Optional[str] = None, - topn: int = 10, - rerank_field: Optional[str] = None, - metric: MetricType = MetricType.L2, - weights: Optional[dict[str, float]] = None, - ): - super().__init__(query, topn, rerank_field) - self._weights = weights - self._metric = metric - - @property - def weights(self) -> dict[str, float]: - """dict[str, float]: Weight mapping for vector fields.""" - return self._weights - - @property - def metric(self) -> MetricType: - """MetricType: Distance metric used for score normalization.""" - return self._metric - - def rerank(self, query_results: dict[str, list[Doc]]) -> list[Doc]: - """Combine scores from multiple vector fields using weighted sum. - - Args: - query_results (dict[str, list[Doc]]): Results per vector field. - - Returns: - list[Doc]: Re-ranked documents with combined scores in ``score`` field. - """ - weighted_scores: dict[str, float] = defaultdict(float) - id_to_doc: dict[str, Doc] = {} - - for vector_name, query_result in query_results.items(): - for _, doc in enumerate(query_result): - doc_id = doc.id - weighted_score = self._normalize_score( - doc.score, self.metric - ) * self.weights.get(vector_name, 1.0) - weighted_scores[doc_id] += weighted_score - if doc_id not in id_to_doc: - id_to_doc[doc_id] = doc - - top_docs = heapq.nlargest( - self.topn, weighted_scores.items(), key=lambda x: x[1] - ) - results = [] - for doc_id, weighted_score in top_docs: - doc = id_to_doc[doc_id] - new_doc = doc._replace(score=weighted_score) - results.append(new_doc) - return results - - def _normalize_score(self, score: float, metric: MetricType) -> float: - if metric == MetricType.L2: - return 1.0 - 2 * math.atan(score) / math.pi - if metric == MetricType.IP: - return 0.5 + math.atan(score) / math.pi - if metric == MetricType.COSINE: - return 1.0 - score / 2.0 - raise ValueError("Unsupported metric type") - - -class QwenReRanker(ReRanker): - """Re-ranker using Qwen (DashScope) LLM-based re-ranking API. 
- - This re-ranker sends documents to the DashScope TextReRank service for - cross-encoder style re-ranking based on semantic relevance to the query. - - Args: - query (str): Query text for semantic re-ranking. **Required**. - topn (int, optional): Number of top documents to return. Defaults to 10. - rerank_field (str): Field name containing document text for re-ranking. - **Required**. - model (str, optional): DashScope re-ranking model name. - Defaults to ``"gte-rerank-v2"``. - api_key (Optional[str], optional): DashScope API key. If not provided, - reads from ``DASHSCOPE_API_KEY`` environment variable. - - Raises: - ValueError: If ``query`` is missing, ``rerank_field`` is missing, - or API key is not provided. - - Note: - Requires the ``dashscope`` Python package. - Documents without content in ``rerank_field`` are skipped. - """ - - def __init__( - self, - query: Optional[str] = None, - topn: int = 10, - rerank_field: Optional[str] = None, - model: str = "gte-rerank-v2", - api_key: Optional[str] = None, - ): - super().__init__(query, topn, rerank_field) - if not query: - raise ValueError("Query is required for reranking") - self._model = model - self._api_key = api_key or os.environ.get("DASHSCOPE_API_KEY") - if not self._api_key: - raise ValueError("DashScope API key is required") - - @property - def model(self) -> str: - """str: DashScope re-ranking model name.""" - return self._model - - def _connection(self): - dashscope = require_module("dashscope") - dashscope.api_key = self._api_key - return dashscope - - def rerank(self, query_results: dict[str, list[Doc]]) -> list[Doc]: - """Re-rank documents using Qwen's TextReRank API. - - Args: - query_results (dict[str, list[Doc]]): Results from vector search. - - Returns: - list[Doc]: Re-ranked documents with relevance scores from Qwen. - - Raises: - ValueError: If API call fails or no valid documents are found. 
- """ - if not query_results: - return [] - - id_to_doc: dict[str, Doc] = {} - doc_ids = [] - contents = [] - - for _, query_result in query_results.items(): - for doc in query_result: - doc_id = doc.id - if doc_id in id_to_doc: - continue - - field_value = doc.field(self.rerank_field) - rank_content = str(field_value).strip() if field_value else "" - if not rank_content: - continue - - id_to_doc[doc_id] = doc - doc_ids.append(doc_id) - contents.append(rank_content) - - if not contents: - raise ValueError("No documents to rerank") - - resp = self._connection().TextReRank.call( - model=self.model, - query=self.query, - documents=list(contents), - top_n=self.topn, - return_documents=False, - ) - - if resp.status_code != HTTPStatus.OK: - raise ValueError( - f"QwenReranker failed with status {resp.status_code}: {resp.message}" - ) - - results = [] - for item in resp.output.results: - idx = item.index - doc_id = doc_ids[idx] - doc = id_to_doc[doc_id] - new_doc = doc._replace(score=item.relevance_score) - results.append(new_doc) - - return results diff --git a/python/zvec/extension/rerank_function.py b/python/zvec/extension/rerank_function.py new file mode 100644 index 00000000..c558a2bc --- /dev/null +++ b/python/zvec/extension/rerank_function.py @@ -0,0 +1,69 @@ +# Copyright 2025-present the zvec project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Optional + +from ..model.doc import Doc + + +class RerankFunction(ABC): + """Abstract base class for re-ranking search results. + + Re-rankers refine the output of one or more vector queries by applying + a secondary scoring strategy. They are used in the ``query()`` method of + ``Collection`` via the ``reranker`` parameter. + + Args: + topn (int, optional): Number of top documents to return after re-ranking. + Defaults to 10. + rerank_field (Optional[str], optional): Field name used as input for + re-ranking (e.g., document title or body). Defaults to None. + + Note: + Subclasses must implement the ``rerank()`` method. + """ + + def __init__( + self, + topn: int = 10, + rerank_field: Optional[str] = None, + ): + self._topn = topn + self._rerank_field = rerank_field + + @property + def topn(self) -> int: + """int: Number of top documents to return after re-ranking.""" + return self._topn + + @property + def rerank_field(self) -> Optional[str]: + """Optional[str]: Field name used as re-ranking input.""" + return self._rerank_field + + @abstractmethod + def rerank(self, query_results: dict[str, list[Doc]]) -> list[Doc]: + """Re-rank documents from one or more vector queries. + + Args: + query_results (dict[str, list[Doc]]): Mapping from vector field name + to list of retrieved documents (sorted by relevance). + + Returns: + list[Doc]: Re-ranked list of documents (length ≤ ``topn``), + with updated ``score`` fields. + """ + ... 
diff --git a/python/zvec/extension/sentence_transformer_embedding_function.py b/python/zvec/extension/sentence_transformer_embedding_function.py new file mode 100644 index 00000000..032f02e0 --- /dev/null +++ b/python/zvec/extension/sentence_transformer_embedding_function.py @@ -0,0 +1,839 @@ +# Copyright 2025-present the zvec project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from typing import ClassVar, Literal, Optional + +import numpy as np + +from ..common.constants import TEXT, DenseVectorType, SparseVectorType +from .embedding_function import DenseEmbeddingFunction, SparseEmbeddingFunction +from .sentence_transformer_function import SentenceTransformerFunctionBase + + +class DefaultLocalDenseEmbedding( + SentenceTransformerFunctionBase, DenseEmbeddingFunction[TEXT] +): + """Default local dense embedding using all-MiniLM-L6-v2 model. + + This is the default implementation for dense text embedding that uses the + ``all-MiniLM-L6-v2`` model from Hugging Face by default. This model provides + a good balance between speed and quality for general-purpose text embedding. + + The class provides text-to-vector dense embedding capabilities using the + sentence-transformers library. It supports models from Hugging Face Hub and + ModelScope, runs locally without API calls, and supports CPU/GPU acceleration. + + The model produces 384-dimensional embeddings and is optimized for semantic + similarity tasks. 
It runs locally without requiring API keys. + + Args: + model_source (Literal["huggingface", "modelscope"], optional): Model source. + - ``"huggingface"``: Use Hugging Face Hub (default, for international users) + - ``"modelscope"``: Use ModelScope (recommended for users in China) + Defaults to ``"huggingface"``. + device (Optional[str], optional): Device to run the model on. + Options: ``"cpu"``, ``"cuda"``, ``"mps"`` (for Apple Silicon), or ``None`` + for automatic detection. Defaults to ``None``. + normalize_embeddings (bool, optional): Whether to normalize embeddings to + unit length (L2 normalization). Useful for cosine similarity. + Defaults to ``True``. + batch_size (int, optional): Batch size for encoding. Defaults to ``32``. + **kwargs: Additional parameters for future extension. + + Attributes: + dimension (int): Always 384 for both models. + model_name (str): "all-MiniLM-L6-v2" (HF) or "iic/nlp_gte_sentence-embedding_chinese-small" (MS). + model_source (str): The model source being used. + device (str): The device the model is running on. + + Raises: + ValueError: If the model cannot be loaded or input is invalid. + TypeError: If input to ``embed()`` is not a string. + RuntimeError: If model inference fails. + + Note: + - Requires Python 3.10, 3.11, or 3.12 + - Requires the ``sentence-transformers`` package: + ``pip install sentence-transformers`` + - For ModelScope, also requires: ``pip install modelscope`` + - First run downloads the model (~50-80MB) from chosen source + - Hugging Face cache: ``~/.cache/torch/sentence_transformers/`` + - ModelScope cache: ``~/.cache/modelscope/hub/`` + - No API keys or network required after initial download + - Inference speed: ~1000 sentences/sec on CPU, ~10000 on GPU + + **For users in China:** + + If you encounter Hugging Face access issues, use ModelScope instead: + + .. 
code-block:: python + + # Recommended for users in China + emb = DefaultLocalDenseEmbedding(model_source="modelscope") + + Alternatively, use Hugging Face mirror: + + .. code-block:: bash + + export HF_ENDPOINT=https://hf-mirror.com + # Then use default Hugging Face mode + + Examples: + >>> # Basic usage with Hugging Face (default) + >>> from zvec.extension import DefaultLocalDenseEmbedding + >>> + >>> emb_func = DefaultLocalDenseEmbedding() + >>> vector = emb_func.embed("Hello, world!") + >>> len(vector) + 384 + >>> isinstance(vector, list) + True + + >>> # Recommended for users in China (uses ModelScope) + >>> emb_func = DefaultLocalDenseEmbedding(model_source="modelscope") + >>> vector = emb_func.embed("你好,世界!") # Works well with Chinese text + >>> len(vector) + 384 + + >>> # Alternative for China users: Use Hugging Face mirror + >>> import os + >>> os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" + >>> emb_func = DefaultLocalDenseEmbedding() # Uses HF mirror + >>> vector = emb_func.embed("Hello, world!") + + >>> # Using GPU for faster inference + >>> emb_func = DefaultLocalDenseEmbedding(device="cuda") + >>> vector = emb_func("Machine learning is fascinating") + >>> # Normalized vector has unit length + >>> import numpy as np + >>> np.linalg.norm(vector) + 1.0 + + >>> # Batch processing + >>> texts = ["First text", "Second text", "Third text"] + >>> vectors = [emb_func.embed(text) for text in texts] + >>> len(vectors) + 3 + >>> all(len(v) == 384 for v in vectors) + True + + >>> # Semantic similarity + >>> v1 = emb_func.embed("The cat sits on the mat") + >>> v2 = emb_func.embed("A feline rests on a rug") + >>> v3 = emb_func.embed("Python programming") + >>> similarity_high = np.dot(v1, v2) # Similar sentences + >>> similarity_low = np.dot(v1, v3) # Different topics + >>> similarity_high > similarity_low + True + + >>> # Error handling + >>> try: + ... emb_func.embed("") # Empty string + ... except ValueError as e: + ... 
print(f"Error: {e}") + Error: Input text cannot be empty or whitespace only + + See Also: + - ``DenseEmbeddingFunction``: Base class for dense embeddings + - ``DefaultLocalSparseEmbedding``: Sparse embedding with SPLADE + - ``QwenDenseEmbedding``: Alternative using Qwen API + """ + + def __init__( + self, + model_source: Literal["huggingface", "modelscope"] = "huggingface", + device: Optional[str] = None, + normalize_embeddings: bool = True, + batch_size: int = 32, + **kwargs, + ): + """Initialize with all-MiniLM-L6-v2 model. + + Args: + model_source (Literal["huggingface", "modelscope"]): Model source. + Defaults to "huggingface". + device (Optional[str]): Target device ("cpu", "cuda", "mps", or None). + Defaults to None (automatic detection). + normalize_embeddings (bool): Whether to L2-normalize output vectors. + Defaults to True. + batch_size (int): Batch size for encoding. Defaults to 32. + **kwargs: Additional parameters for future extension. + + Raises: + ImportError: If sentence-transformers or modelscope is not installed. + ValueError: If model cannot be loaded. 
+ """ + # Use different models based on source + if model_source == "modelscope": + # Use Chinese-optimized model for ModelScope (better for Chinese text) + model_name = "iic/nlp_gte_sentence-embedding_chinese-small" + else: + model_name = "all-MiniLM-L6-v2" + + # Initialize base class for model loading + SentenceTransformerFunctionBase.__init__( + self, model_name=model_name, model_source=model_source, device=device + ) + + self._normalize_embeddings = normalize_embeddings + self._batch_size = batch_size + + # Load model and get dimension + model = self._get_model() + self._dimension = model.get_sentence_embedding_dimension() + + # Store extra parameters + self._extra_params = kwargs + + @property + def dimension(self) -> int: + """int: The expected dimensionality of the embedding vector.""" + return self._dimension + + @property + def extra_params(self) -> dict: + """dict: Extra parameters for model-specific customization.""" + return self._extra_params + + def __call__(self, input: str) -> DenseVectorType: + """Make the embedding function callable.""" + return self.embed(input) + + def embed(self, input: str) -> DenseVectorType: + """Generate dense embedding vector for the input text. + + This method uses the Sentence Transformer model to convert input text + into a dense vector representation. The model runs locally without + requiring API calls. + + Args: + input (str): Input text string to embed. Must be non-empty after + stripping whitespace. Maximum length depends on the model used + (typically 128-512 tokens for most models). + + Returns: + DenseVectorType: A list of floats representing the embedding vector. + Length equals ``self.dimension``. If ``normalize_embeddings=True``, + the vector has unit length. Example: + ``[0.123, -0.456, 0.789, ...]`` + + Raises: + TypeError: If ``input`` is not a string. + ValueError: If input is empty or whitespace-only. + RuntimeError: If model inference fails. 
+ + Examples: + >>> emb = DefaultLocalDenseEmbedding() + >>> vector = emb.embed("Natural language processing") + >>> len(vector) + 384 + >>> isinstance(vector[0], float) + True + + >>> # Normalized vectors have unit length + >>> import numpy as np + >>> emb = DefaultLocalDenseEmbedding(normalize_embeddings=True) + >>> vector = emb.embed("Test sentence") + >>> np.linalg.norm(vector) + 1.0 + + >>> # Error: empty input + >>> emb.embed(" ") + ValueError: Input text cannot be empty or whitespace only + + >>> # Error: non-string input + >>> emb.embed(123) + TypeError: Expected 'input' to be str, got int + + >>> # Semantic similarity example + >>> v1 = emb.embed("The cat sits on the mat") + >>> v2 = emb.embed("A feline rests on a rug") + >>> similarity = np.dot(v1, v2) # High similarity due to semantic meaning + >>> similarity > 0.7 + True + + Note: + - First call may be slower due to model loading + - Subsequent calls are much faster as the model stays in memory + - For batch processing, consider encoding multiple texts together + (though this method handles single texts only) + - GPU acceleration provides 5-10x speedup over CPU + """ + if not isinstance(input, str): + raise TypeError(f"Expected 'input' to be str, got {type(input).__name__}") + + input = input.strip() + if not input: + raise ValueError("Input text cannot be empty or whitespace only") + + try: + model = self._get_model() + embedding = model.encode( + input, + convert_to_numpy=True, + normalize_embeddings=self._normalize_embeddings, + batch_size=self._batch_size, + ) + + # Convert numpy array to list + if isinstance(embedding, np.ndarray): + embedding_list = embedding.tolist() + else: + embedding_list = list(embedding) + + # Validate dimension + if len(embedding_list) != self.dimension: + raise ValueError( + f"Dimension mismatch: expected {self.dimension}, " + f"got {len(embedding_list)}" + ) + + return embedding_list + + except Exception as e: + if isinstance(e, (TypeError, ValueError)): + raise + raise 
RuntimeError(f"Failed to generate embedding: {e!s}") from e + + +class DefaultLocalSparseEmbedding( + SentenceTransformerFunctionBase, SparseEmbeddingFunction[TEXT] +): + """Default local sparse embedding using SPLADE model. + + This class provides sparse vector embedding using the SPLADE (SParse Lexical + AnD Expansion) model. SPLADE generates sparse, interpretable representations + where each dimension corresponds to a vocabulary term with learned importance + weights. It's ideal for lexical matching, BM25-style retrieval, and hybrid + search scenarios. + + The default model is ``naver/splade-cocondenser-ensembledistil``, which is + publicly available without authentication. It produces sparse vectors with + thousands of dimensions but only hundreds of non-zero values, making them + efficient for storage and retrieval while maintaining strong lexical matching. + + **Model Caching:** + + This class uses class-level caching to share the SPLADE model across all instances + with the same configuration (model_source, device). This significantly reduces + memory usage when creating multiple instances for different encoding types + (query vs document). + + **Cache Management:** + + The class provides methods to manage the model cache: + + - ``clear_cache()``: Clear all cached models to free memory + - ``get_cache_info()``: Get information about cached models + - ``remove_from_cache(model_source, device)``: Remove a specific model from cache + + .. note:: + **Why not use splade-v3?** + + The newer ``naver/splade-v3`` model is gated (requires access approval). + We use ``naver/splade-cocondenser-ensembledistil`` instead. + + **To use splade-v3 (if you have access):** + + 1. Request access at https://huggingface.co/naver/splade-v3 + 2. Get your Hugging Face token from https://huggingface.co/settings/tokens + 3. Set environment variable: + + .. code-block:: bash + + export HF_TOKEN="your_huggingface_token" + + 4. Or login programmatically: + + .. 
code-block:: python + + from huggingface_hub import login + login(token="your_huggingface_token") + + 5. To use a custom SPLADE model, you can subclass this class and override + the model_name in ``__init__``, or create your own implementation + inheriting from ``SentenceTransformerFunctionBase`` and + ``SparseEmbeddingFunction``. + + Args: + model_source (Literal["huggingface", "modelscope"], optional): Model source. + Defaults to ``"huggingface"``. ModelScope support may vary for SPLADE models. + device (Optional[str], optional): Device to run the model on. + Options: ``"cpu"``, ``"cuda"``, ``"mps"`` (for Apple Silicon), or ``None`` + for automatic detection. Defaults to ``None``. + encoding_type (Literal["query", "document"], optional): Encoding type. + - ``"query"``: Optimize for search queries (default) + - ``"document"``: Optimize for indexed documents + **kwargs: Additional parameters (currently unused, for future extension). + + Attributes: + model_name (str): Model identifier. + model_source (str): The model source being used. + device (str): The device the model is running on. + + Raises: + ValueError: If the model cannot be loaded or input is invalid. + TypeError: If input to ``embed()`` is not a string. + RuntimeError: If model inference fails. 
+ + Note: + - Requires Python 3.10, 3.11, or 3.12 + - Requires the ``sentence-transformers`` package: + ``pip install sentence-transformers`` + - First run downloads the model (~100MB) from Hugging Face + - Cache location: ``~/.cache/torch/sentence_transformers/`` + - No API keys or authentication required + - Sparse vectors have ~30k dimensions but only ~100-200 non-zero values + - Best combined with dense embeddings for hybrid retrieval + + **SPLADE vs Dense Embeddings:** + + - **Dense**: Continuous semantic vectors, good for semantic similarity + - **Sparse**: Lexical keyword-based, interpretable, good for exact matching + - **Hybrid**: Combine both for best retrieval performance + + Examples: + >>> # Memory-efficient: both instances share the same model (~200MB) + >>> from zvec.extension import DefaultLocalSparseEmbedding + >>> + >>> # Query embedding + >>> query_emb = DefaultLocalSparseEmbedding(encoding_type="query") + >>> query_vec = query_emb.embed("machine learning algorithms") + >>> type(query_vec) + + >>> len(query_vec) # Only non-zero dimensions + 156 + + >>> # Document embedding (shares model with query_emb) + >>> doc_emb = DefaultLocalSparseEmbedding(encoding_type="document") + >>> doc_vec = doc_emb.embed("Machine learning is a subset of AI") + >>> # Total memory: ~200MB (not 400MB) thanks to model caching + + >>> # Asymmetric retrieval example + >>> query_vec = query_emb.embed("what causes aging fast") + >>> doc_vec = doc_emb.embed( + ... "UV-A light causes tanning, skin aging, and cataracts..." + ... ) + >>> + >>> # Calculate similarity (dot product for sparse vectors) + >>> similarity = sum( + ... query_vec.get(k, 0) * doc_vec.get(k, 0) + ... for k in set(query_vec) | set(doc_vec) + ... 
) + + >>> # Batch processing + >>> queries = ["query 1", "query 2", "query 3"] + >>> query_vecs = [query_emb.embed(q) for q in queries] + >>> + >>> documents = ["doc 1", "doc 2", "doc 3"] + >>> doc_vecs = [doc_emb.embed(d) for d in documents] + + >>> # Inspecting sparse dimensions (output is sorted by indices) + >>> query_vec = query_emb.embed("machine learning") + >>> list(query_vec.items())[:5] # First 5 dimensions (by index) + [(10, 0.45), (23, 0.87), (56, 0.32), (89, 1.12), (120, 0.65)] + >>> + >>> # Sort by weight to find most important terms + >>> sorted_by_weight = sorted(query_vec.items(), key=lambda x: x[1], reverse=True) + >>> top_5 = sorted_by_weight[:5] # Top 5 most important terms + >>> top_5 + [(1023, 1.45), (245, 1.23), (8901, 0.98), (5678, 0.87), (12034, 0.76)] + + >>> # Using GPU for faster inference + >>> sparse_emb = DefaultLocalSparseEmbedding(device="cuda") + >>> vector = sparse_emb.embed("natural language processing") + + >>> # Hybrid retrieval example (combining dense + sparse) + >>> from zvec.extension import DefaultDenseEmbedding + >>> dense_emb = DefaultDenseEmbedding() + >>> sparse_emb = DefaultLocalSparseEmbedding() + >>> + >>> query = "deep learning neural networks" + >>> dense_vec = dense_emb.embed(query) # [0.1, -0.3, 0.5, ...] + >>> sparse_vec = sparse_emb.embed(query) # {12: 0.8, 45: 1.2, ...} + + >>> # Error handling + >>> try: + ... sparse_emb.embed("") # Empty string + ... except ValueError as e: + ... 
print(f"Error: {e}") + Error: Input text cannot be empty or whitespace only + + >>> # Cache management + >>> # Check cache status + >>> info = DefaultLocalSparseEmbedding.get_cache_info() + >>> print(f"Cached models: {info['cached_models']}") + Cached models: 1 + >>> + >>> # Clear cache to free memory + >>> DefaultLocalSparseEmbedding.clear_cache() + >>> info = DefaultLocalSparseEmbedding.get_cache_info() + >>> print(f"Cached models: {info['cached_models']}") + Cached models: 0 + >>> + >>> # Remove specific model from cache + >>> query_emb = DefaultLocalSparseEmbedding() # Creates CPU model + >>> cuda_emb = DefaultLocalSparseEmbedding(device="cuda") # Creates CUDA model + >>> info = DefaultLocalSparseEmbedding.get_cache_info() + >>> print(f"Cached models: {info['cached_models']}") + Cached models: 2 + >>> + >>> # Remove only CPU model + >>> removed = DefaultLocalSparseEmbedding.remove_from_cache(device=None) + >>> print(f"Removed: {removed}") + True + >>> info = DefaultLocalSparseEmbedding.get_cache_info() + >>> print(f"Cached models: {info['cached_models']}") + Cached models: 1 + + See Also: + - ``SparseEmbeddingFunction``: Base class for sparse embeddings + - ``DefaultDenseEmbedding``: Dense embedding with all-MiniLM-L6-v2 + - ``QwenDenseEmbedding``: Alternative using Qwen API + + References: + - SPLADE Paper: https://arxiv.org/abs/2109.10086 + - Model: https://huggingface.co/naver/splade-cocondenser-ensembledistil + """ + + # Class-level model cache: {(model_name, model_source, device): model} + # Shared across all DefaultLocalSparseEmbedding instances to save memory + _model_cache: ClassVar[dict] = {} + + @classmethod + def clear_cache(cls) -> None: + """Clear all cached SPLADE models from memory. 
+ + This is useful for: + - Freeing memory when models are no longer needed + - Forcing a fresh model reload + - Testing and debugging + Examples: + >>> # Clear cache to free memory + >>> DefaultLocalSparseEmbedding.clear_cache() + + >>> # Or in tests to ensure fresh model loading + >>> def test_something(): + ... DefaultLocalSparseEmbedding.clear_cache() + ... emb = DefaultLocalSparseEmbedding() + ... # Test with fresh model + """ + cls._model_cache.clear() + + @classmethod + def get_cache_info(cls) -> dict: + """Get information about currently cached models. + + Returns: + dict: Dictionary with cache statistics: + - cached_models (int): Number of cached model instances + - cache_keys (list): List of cache keys (model_name, model_source, device) + + Examples: + >>> info = DefaultLocalSparseEmbedding.get_cache_info() + >>> print(f"Cached models: {info['cached_models']}") + Cached models: 2 + >>> print(f"Cache keys: {info['cache_keys']}") + Cache keys: [('naver/splade-cocondenser-ensembledistil', 'huggingface', None), + ('naver/splade-cocondenser-ensembledistil', 'huggingface', 'cuda')] + """ + return { + "cached_models": len(cls._model_cache), + "cache_keys": list(cls._model_cache.keys()), + } + + @classmethod + def remove_from_cache( + cls, model_source: str = "huggingface", device: Optional[str] = None + ) -> bool: + """Remove a specific model from cache. + + Args: + model_source (str): Model source ("huggingface" or "modelscope"). + Defaults to "huggingface". + device (Optional[str]): Device identifier. Defaults to None. + + Returns: + bool: True if model was found and removed, False otherwise. 
+ + Examples: + >>> # Remove CPU model from cache + >>> removed = DefaultLocalSparseEmbedding.remove_from_cache() + >>> print(f"Removed: {removed}") + True + + >>> # Remove CUDA model from cache + >>> removed = DefaultLocalSparseEmbedding.remove_from_cache(device="cuda") + >>> print(f"Removed: {removed}") + True + """ + model_name = "naver/splade-cocondenser-ensembledistil" + cache_key = (model_name, model_source, device) + + if cache_key in cls._model_cache: + del cls._model_cache[cache_key] + return True + return False + + def __init__( + self, + model_source: Literal["huggingface", "modelscope"] = "huggingface", + device: Optional[str] = None, + encoding_type: Literal["query", "document"] = "query", + **kwargs, + ): + """Initialize with SPLADE model. + + Args: + model_source (Literal["huggingface", "modelscope"]): Model source. + Defaults to "huggingface". + device (Optional[str]): Target device ("cpu", "cuda", "mps", or None). + Defaults to None (automatic detection). + encoding_type (Literal["query", "document"]): Encoding type for embeddings. + - "query": Optimize for search queries (default) + - "document": Optimize for indexed documents + This distinction is important for asymmetric retrieval tasks. + **kwargs: Additional parameters (reserved for future use). + + Raises: + ImportError: If sentence-transformers is not installed. + ValueError: If model cannot be loaded. + + Note: + Multiple instances with the same (model_source, device) configuration + will share the same underlying model to save memory. Different + instances can use different encoding_type settings while sharing + the model. + + **Model Selection:** + + Uses ``naver/splade-cocondenser-ensembledistil`` instead of the newer + ``naver/splade-v3`` because splade-v3 is a gated model requiring + Hugging Face authentication. 
The cocondenser-ensembledistil variant: + + - Does not require authentication or API tokens + - Is immediately available for all users + - Provides comparable retrieval performance (~2% difference) + - Avoids "Access to model is restricted" errors + + If you need splade-v3 and have obtained access, you can subclass + this class and override the model_name parameter. + + Examples: + >>> # Both instances share the same model (saves memory) + >>> query_emb = DefaultLocalSparseEmbedding(encoding_type="query") + >>> doc_emb = DefaultLocalSparseEmbedding(encoding_type="document") + >>> # Only one model is loaded in memory + """ + # Use publicly available SPLADE model (no gated access required) + # Note: naver/splade-v3 requires authentication, so we use the + # cocondenser-ensembledistil variant which is publicly accessible + model_name = "naver/splade-cocondenser-ensembledistil" + + # Initialize base class for model loading + SentenceTransformerFunctionBase.__init__( + self, model_name=model_name, model_source=model_source, device=device + ) + + self._encoding_type = encoding_type + self._extra_params = kwargs + + # Create cache key for this model configuration + self._cache_key = (model_name, model_source, device) + + # Load model to ensure it's available (will use cache if exists) + self._get_model() + + @property + def extra_params(self) -> dict: + """dict: Extra parameters for model-specific customization.""" + return self._extra_params + + def __call__(self, input: str) -> SparseVectorType: + """Make the embedding function callable.""" + return self.embed(input) + + def embed(self, input: str) -> SparseVectorType: + """Generate sparse embedding vector for the input text. + + This method uses the SPLADE model to convert input text into a sparse + vector representation. The result is a dictionary where keys are dimension + indices and values are importance weights (only non-zero values included). 
+ + The embedding is optimized based on the ``encoding_type`` specified during + initialization: "query" for search queries or "document" for indexed content. + + Args: + input (str): Input text string to embed. Must be non-empty after + stripping whitespace. + + Returns: + SparseVectorType: A dictionary mapping dimension index to weight. + Only non-zero dimensions are included. The dictionary is sorted + by indices (keys) in ascending order for consistent output. + Example: ``{10: 0.5, 245: 0.8, 1023: 1.2, 5678: 0.5}`` + + Raises: + TypeError: If ``input`` is not a string. + ValueError: If input is empty or whitespace-only. + RuntimeError: If model inference fails. + + Examples: + >>> # Query embedding + >>> query_emb = DefaultLocalSparseEmbedding(encoding_type="query") + >>> query_vec = query_emb.embed("machine learning") + >>> isinstance(query_vec, dict) + True + + Note: + - First call may be slower due to model loading + - Subsequent calls are much faster as the model stays in memory + - GPU acceleration provides significant speedup + - Sparse vectors are memory-efficient (only store non-zero values) + """ + if not isinstance(input, str): + raise TypeError(f"Expected 'input' to be str, got {type(input).__name__}") + + input = input.strip() + if not input: + raise ValueError("Input text cannot be empty or whitespace only") + + try: + model = self._get_model() + + # Use appropriate encoding method based on type + if self._encoding_type == "document" and hasattr(model, "encode_document"): + # Use document encoding + sparse_matrix = model.encode_document([input]) + elif hasattr(model, "encode_query"): + # Use query encoding (default) + sparse_matrix = model.encode_query([input]) + else: + # Fallback: manual implementation for older sentence-transformers + return self._manual_sparse_encode(input) + + # Convert sparse matrix to dictionary + # SPLADE returns shape [1, vocab_size] for single input + + # Check if it's a sparse matrix (duck typing - has toarray method) + 
if hasattr(sparse_matrix, "toarray"): + # Sparse matrix (CSR/CSC/etc.) - convert to dense array + sparse_array = sparse_matrix[0].toarray().flatten() + sparse_dict = { + int(idx): float(val) + for idx, val in enumerate(sparse_array) + if val > 0 + } + else: + # Dense array format (numpy array or similar) + if isinstance(sparse_matrix, np.ndarray): + sparse_array = sparse_matrix[0] + else: + sparse_array = sparse_matrix + + sparse_dict = { + int(idx): float(val) + for idx, val in enumerate(sparse_array) + if val > 0 + } + + # Sort by indices (keys) to ensure consistent ordering + return dict(sorted(sparse_dict.items())) + + except Exception as e: + if isinstance(e, (TypeError, ValueError)): + raise + raise RuntimeError(f"Failed to generate sparse embedding: {e!s}") from e + + def _manual_sparse_encode(self, input: str) -> SparseVectorType: + """Fallback manual SPLADE encoding for older sentence-transformers. + + Args: + input (str): Input text to encode. + + Returns: + SparseVectorType: Sparse vector as dictionary. 
+ """ + import torch + + model = self._get_model() + + # Tokenize input + features = model.tokenize([input]) + + # Move to correct device + features = {k: v.to(model.device) for k, v in features.items()} + + # Forward pass with no gradient + with torch.no_grad(): + embeddings = model.forward(features) + + # Get logits from model output + # SPLADE models typically output 'token_embeddings' + if isinstance(embeddings, dict) and "token_embeddings" in embeddings: + logits = embeddings["token_embeddings"][0] # First batch item + elif hasattr(embeddings, "token_embeddings"): + logits = embeddings.token_embeddings[0] + # Fallback: try to get first value + elif isinstance(embeddings, dict): + logits = next(iter(embeddings.values()))[0] + else: + logits = embeddings[0] + + # Apply SPLADE activation: log(1 + relu(x)) + relu_log = torch.log(1 + torch.relu(logits)) + + # Max pooling over token dimension (reduce to vocab size) + if relu_log.dim() > 1: + sparse_vec, _ = torch.max(relu_log, dim=0) + else: + sparse_vec = relu_log + + # Convert to sparse dictionary (only non-zero values) + sparse_vec_np = sparse_vec.cpu().numpy() + sparse_dict = { + int(idx): float(val) for idx, val in enumerate(sparse_vec_np) if val > 0 + } + + # Sort by indices (keys) to ensure consistent ordering + return dict(sorted(sparse_dict.items())) + + def _get_model(self): + """Load or retrieve the SPLADE model from class-level cache. + + Returns: + SentenceTransformer: The loaded SPLADE model instance. + + Raises: + ImportError: If required packages are not installed. + ValueError: If model cannot be loaded. + + Note: + Models are cached at class level and shared across all instances + with the same (model_name, model_source, device) configuration. + This allows memory-efficient usage when creating multiple instances + with different encoding_type settings. 
+ """ + # Check class-level cache first + if self._cache_key in self._model_cache: + return self._model_cache[self._cache_key] + + # Use parent class method to load model + model = super()._get_model() + + # Cache the model at class level + self._model_cache[self._cache_key] = model + + return model diff --git a/python/zvec/extension/sentence_transformer_function.py b/python/zvec/extension/sentence_transformer_function.py new file mode 100644 index 00000000..1ba1662a --- /dev/null +++ b/python/zvec/extension/sentence_transformer_function.py @@ -0,0 +1,150 @@ +# Copyright 2025-present the zvec project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from typing import Literal, Optional + +from ..tool import require_module + + +class SentenceTransformerFunctionBase: + """Base class for Sentence Transformer functions (both dense and sparse). + + This base class provides common functionality for loading and managing + sentence-transformers models from Hugging Face or ModelScope. It supports + both dense models (e.g., all-MiniLM-L6-v2) and sparse models (e.g., SPLADE). + + This class is not meant to be used directly. 
Use concrete implementations: + - ``SentenceTransformerEmbeddingFunction`` for dense embeddings + - ``SentenceTransformerSparseEmbeddingFunction`` for sparse embeddings + - ``DefaultDenseEmbedding`` for default dense embeddings + - ``DefaultSparseEmbedding`` for default sparse embeddings + + Args: + model_name (str): Model identifier or local path. + model_source (Literal["huggingface", "modelscope"]): Model source. + device (Optional[str]): Device to run the model on. + + Note: + - This is an internal base class for code reuse + - Subclasses should inherit from appropriate Protocol (Dense/Sparse) + - Provides model loading and management functionality + """ + + def __init__( + self, + model_name: str, + model_source: Literal["huggingface", "modelscope"] = "huggingface", + device: Optional[str] = None, + ): + """Initialize the base Sentence Transformer functionality. + + Args: + model_name (str): Model identifier or local path. + model_source (Literal["huggingface", "modelscope"]): Model source. + device (Optional[str]): Device to run the model on. + + Raises: + ValueError: If model_source is invalid. + """ + # Validate model_source + if model_source not in ("huggingface", "modelscope"): + raise ValueError( + f"Invalid model_source: '{model_source}'. " + "Must be 'huggingface' or 'modelscope'." 
+ ) + + self._model_name = model_name + self._model_source = model_source + self._device = device + self._model = None + + @property + def model_name(self) -> str: + """str: The Sentence Transformer model name currently in use.""" + return self._model_name + + @property + def model_source(self) -> str: + """str: The model source being used ("huggingface" or "modelscope").""" + return self._model_source + + @property + def device(self) -> str: + """str: The device the model is running on.""" + model = self._get_model() + if model is not None: + return str(model.device) + return self._device or "cpu" + + def _get_model(self): + """Load or retrieve the Sentence Transformer model. + + Returns: + SentenceTransformer or SparseEncoder: The loaded model instance. + + Raises: + ImportError: If required packages are not installed. + ValueError: If model cannot be loaded. + """ + # Return cached model if exists + if self._model is not None: + return self._model + + # Load model + try: + sentence_transformers = require_module("sentence_transformers") + + if self._model_source == "modelscope": + # Load from ModelScope + require_module("modelscope") + from modelscope.hub.snapshot_download import snapshot_download + + # Download model to cache + model_dir = snapshot_download(self._model_name) + + # Load from local path + self._model = sentence_transformers.SentenceTransformer( + model_dir, device=self._device, trust_remote_code=True + ) + else: + # Load from Hugging Face (default) + self._model = sentence_transformers.SentenceTransformer( + self._model_name, device=self._device, trust_remote_code=True + ) + + return self._model + + except ImportError as e: + if "modelscope" in str(e) and self._model_source == "modelscope": + raise ImportError( + "ModelScope support requires the 'modelscope' package. 
" + "Please install it with: pip install modelscope" + ) from e + raise + except Exception as e: + raise ValueError( + f"Failed to load Sentence Transformer model '{self._model_name}' " + f"from {self._model_source}: {e!s}" + ) from e + + def _is_sparse_model(self) -> bool: + """Check if the loaded model is a sparse encoder (e.g., SPLADE). + + Returns: + bool: True if model supports sparse encoding. + """ + model = self._get_model() + # Check if model has sparse encoding methods + return hasattr(model, "encode_query") or hasattr(model, "encode_document") diff --git a/python/zvec/extension/sentence_transformer_rerank_function.py b/python/zvec/extension/sentence_transformer_rerank_function.py new file mode 100644 index 00000000..58c5838f --- /dev/null +++ b/python/zvec/extension/sentence_transformer_rerank_function.py @@ -0,0 +1,384 @@ +# Copyright 2025-present the zvec project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from typing import Literal, Optional + +from ..model.doc import Doc +from ..tool import require_module +from .rerank_function import RerankFunction +from .sentence_transformer_function import SentenceTransformerFunctionBase + + +class DefaultLocalReRanker(SentenceTransformerFunctionBase, RerankFunction): + """Re-ranker using Sentence Transformer cross-encoder models for semantic re-ranking. + + This re-ranker leverages pre-trained cross-encoder models to perform deep semantic + re-ranking of search results. 
It runs locally without API calls, supports GPU + acceleration, and works with models from Hugging Face or ModelScope. + + Cross-encoder models evaluate query-document pairs jointly, providing more + accurate relevance scores than bi-encoder (embedding-based) similarity. + + Args: + query (str): Query text for semantic re-ranking. **Required**. + topn (int, optional): Maximum number of documents to return after re-ranking. + Defaults to 10. + rerank_field (Optional[str], optional): Document field name to use as + re-ranking input text. **Required** (e.g., "content", "title", "body"). + model_name (str, optional): Cross-encoder model identifier or local path. + Defaults to ``"cross-encoder/ms-marco-MiniLM-L6-v2"`` (MS MARCO MiniLM). + Common options: + - ``"cross-encoder/ms-marco-MiniLM-L6-v2"``: Lightweight, fast (~80MB, recommended) + - ``"cross-encoder/ms-marco-MiniLM-L12-v2"``: Better accuracy (~120MB) + - ``"BAAI/bge-reranker-base"``: BGE Reranker Base (~280MB) + - ``"BAAI/bge-reranker-large"``: BGE Reranker Large (highest quality, ~560MB) + model_source (Literal["huggingface", "modelscope"], optional): Model source. + Defaults to ``"huggingface"``. + - ``"huggingface"``: Load from Hugging Face Hub + - ``"modelscope"``: Load from ModelScope (recommended for users in China) + device (Optional[str], optional): Device to run the model on. + Options: ``"cpu"``, ``"cuda"``, ``"mps"`` (for Apple Silicon), or ``None`` + for automatic detection. Defaults to ``None``. + batch_size (int, optional): Batch size for processing query-document pairs. + Larger values speed up processing but use more memory. Defaults to ``32``. + + Attributes: + query (str): The query text used for re-ranking. + topn (int): Maximum number of documents to return. + rerank_field (Optional[str]): Field name used for re-ranking input. + model_name (str): The cross-encoder model being used. + model_source (str): The model source ("huggingface" or "modelscope"). 
+ device (str): The device the model is running on. + + Raises: + ValueError: If ``query`` is empty/None, ``rerank_field`` is None, + or model cannot be loaded. + TypeError: If input types are invalid. + RuntimeError: If model inference fails. + + Note: + - Requires Python 3.10, 3.11, or 3.12 + - Requires ``sentence-transformers`` package: ``pip install sentence-transformers`` + - For ModelScope support, also requires: ``pip install modelscope`` + - First run downloads the model (~80-560MB depending on model) from chosen source + - No API keys or network required after initial download + - Cross-encoders are slower than bi-encoders but more accurate + - GPU acceleration provides significant speedup (5-10x) + + **MS MARCO MiniLM-L6-v2 Model (Default):** + + The default model ``cross-encoder/ms-marco-MiniLM-L6-v2`` is a lightweight and + efficient cross-encoder trained on MS MARCO dataset. It provides: + + - Fast inference speed (suitable for real-time applications) + - Small model size (~80MB, quick to download) + - Good balance between speed and accuracy + - Trained on 500K+ query-document pairs + - Public availability without authentication + + **For users in China:** + + If you encounter Hugging Face access issues, use ModelScope instead: + + .. code-block:: python + + # Recommended for users in China + reranker = DefaultLocalReRanker( + query="机器学习算法", + rerank_field="content", + model_source="modelscope" + ) + + Alternatively, use Hugging Face mirror: + + .. code-block:: bash + + export HF_ENDPOINT=https://hf-mirror.com + + Examples: + >>> # Basic usage with default MS MARCO MiniLM model + >>> from zvec.extension import DefaultLocalReRanker + >>> + >>> reranker = DefaultLocalReRanker( + ... query="machine learning algorithms", + ... topn=5, + ... rerank_field="content" + ... ) + >>> + >>> # Use in collection.query() + >>> results = collection.query( + ... data={"vector_field": query_vector}, + ... reranker=reranker, + ... topk=20 + ... 
) + + >>> # Using ModelScope for users in China + >>> reranker = DefaultLocalReRanker( + ... query="深度学习", + ... topn=10, + ... rerank_field="content", + ... model_source="modelscope" + ... ) + + >>> # Using larger model for better quality + >>> reranker = DefaultLocalReRanker( + ... query="neural networks", + ... topn=5, + ... rerank_field="content", + ... model_name="BAAI/bge-reranker-large", + ... device="cuda", + ... batch_size=64 + ... ) + + >>> # Direct rerank call (for testing) + >>> query_results = { + ... "vector1": [ + ... Doc(id="1", score=0.9, fields={"content": "Machine learning is..."}), + ... Doc(id="2", score=0.8, fields={"content": "Deep learning is..."}), + ... ] + ... } + >>> reranked = reranker.rerank(query_results) + >>> for doc in reranked: + ... print(f"ID: {doc.id}, Score: {doc.score:.4f}") + ID: 2, Score: 0.9234 + ID: 1, Score: 0.8567 + + See Also: + - ``RerankFunction``: Abstract base class for re-rankers + - ``QwenReRanker``: Re-ranker using Qwen API + - ``RrfReRanker``: Multi-vector re-ranker using RRF + - ``WeightedReRanker``: Multi-vector re-ranker using weighted scores + + References: + - MS MARCO Cross-Encoder: https://huggingface.co/cross-encoder/ms-marco-MiniLM-L6-v2 + - BGE Reranker: https://huggingface.co/BAAI/bge-reranker-base + - Cross-Encoder vs Bi-Encoder: https://www.sbert.net/examples/applications/cross-encoder/README.html + """ + + def __init__( + self, + query: Optional[str] = None, + topn: int = 10, + rerank_field: Optional[str] = None, + model_name: str = "cross-encoder/ms-marco-MiniLM-L6-v2", + model_source: Literal["huggingface", "modelscope"] = "huggingface", + device: Optional[str] = None, + batch_size: int = 32, + ): + """Initialize DefaultLocalReRanker with query and configuration. + + Args: + query (Optional[str]): Query text for semantic matching. Required. + topn (int): Number of top results to return. + rerank_field (Optional[str]): Document field for re-ranking input. 
+ model_name (str): Cross-encoder model identifier. + model_source (Literal["huggingface", "modelscope"]): Model source. + device (Optional[str]): Target device ("cpu", "cuda", "mps", or None). + batch_size (int): Batch size for processing query-document pairs. + + Raises: + ValueError: If query is empty or model cannot be loaded. + """ + # Initialize base class for model loading + SentenceTransformerFunctionBase.__init__( + self, model_name=model_name, model_source=model_source, device=device + ) + + # Initialize rerank function + RerankFunction.__init__(self, topn=topn, rerank_field=rerank_field) + + # Validate query + if not query: + raise ValueError("Query is required for DefaultLocalReRanker") + self._query = query + self._batch_size = batch_size + + # Load and validate cross-encoder model + model = self._get_model() + if not hasattr(model, "predict"): + raise ValueError( + f"Model '{model_name}' does not appear to be a cross-encoder model. " + "Cross-encoder models should have a 'predict' method." + ) + self._model = model + + def _get_model(self): + """Load or retrieve the CrossEncoder model. + + This overrides the base class method to load CrossEncoder instead of + SentenceTransformer, as reranking requires cross-encoder models. + + Returns: + CrossEncoder: The loaded cross-encoder model instance. + + Raises: + ImportError: If required packages are not installed. + ValueError: If model cannot be loaded. 
+ """ + # Return cached model if exists + if self._model is not None: + return self._model + + # Load cross-encoder model + try: + sentence_transformers = require_module("sentence_transformers") + + if self._model_source == "modelscope": + # Load from ModelScope + require_module("modelscope") + from modelscope.hub.snapshot_download import snapshot_download + + # Download model to cache + model_dir = snapshot_download(self._model_name) + + # Load CrossEncoder from local path + model = sentence_transformers.CrossEncoder( + model_dir, device=self._device + ) + else: + # Load CrossEncoder from Hugging Face (default) + model = sentence_transformers.CrossEncoder( + self._model_name, device=self._device + ) + + return model + + except ImportError as e: + if "modelscope" in str(e) and self._model_source == "modelscope": + raise ImportError( + "ModelScope support requires the 'modelscope' package. " + "Please install it with: pip install modelscope" + ) from e + raise + except Exception as e: + raise ValueError( + f"Failed to load CrossEncoder model '{self._model_name}' " + f"from {self._model_source}: {e!s}" + ) from e + + @property + def query(self) -> str: + """str: Query text used for semantic re-ranking.""" + return self._query + + @property + def batch_size(self) -> int: + """int: Batch size for processing query-document pairs.""" + return self._batch_size + + def rerank(self, query_results: dict[str, list[Doc]]) -> list[Doc]: + """Re-rank documents using Sentence Transformer cross-encoder model. + + Evaluates each query-document pair using the cross-encoder model to compute + relevance scores. Documents are then sorted by these scores and the top-k + results are returned. + + Args: + query_results (dict[str, list[Doc]]): Mapping from vector field names + to lists of retrieved documents. Documents from all fields are + deduplicated and re-ranked together. 
+ + Returns: + list[Doc]: Re-ranked documents (up to ``topn``) with updated ``score`` + fields containing relevance scores from the cross-encoder model. + + Raises: + ValueError: If no valid documents are found or model inference fails. + + Note: + - Duplicate documents (same ID) across fields are processed once + - Documents with empty/missing ``rerank_field`` content are skipped + - Returned scores are logits from the cross-encoder model + - Higher scores indicate higher relevance + - Processing time is O(n) where n is the number of documents + + Examples: + >>> reranker = SentenceTransformerReRanker( + ... query="machine learning", + ... topn=3, + ... rerank_field="content" + ... ) + >>> query_results = { + ... "vector1": [ + ... Doc(id="1", score=0.9, fields={"content": "ML basics"}), + ... Doc(id="2", score=0.8, fields={"content": "DL tutorial"}), + ... ] + ... } + >>> reranked = reranker.rerank(query_results) + >>> len(reranked) <= 3 + True + """ + if not query_results: + return [] + + # Collect and deduplicate documents + id_to_doc: dict[str, Doc] = {} + doc_ids: list[str] = [] + contents: list[str] = [] + + for _, query_result in query_results.items(): + for doc in query_result: + doc_id = doc.id + if doc_id in id_to_doc: + continue + + # Extract text content from specified field + field_value = doc.field(self.rerank_field) + rank_content = str(field_value).strip() if field_value else "" + if not rank_content: + continue + + id_to_doc[doc_id] = doc + doc_ids.append(doc_id) + contents.append(rank_content) + + if not contents: + raise ValueError("No documents to rerank") + + try: + # Use standard cross-encoder predict method + pairs = [[self.query, content] for content in contents] + scores = self._model.predict( + pairs, + batch_size=self.batch_size, + show_progress_bar=False, + convert_to_numpy=True, + ) + + # Convert to float list if needed + if hasattr(scores, "tolist"): + scores = scores.tolist() + else: + scores = [float(s) for s in scores] + + except 
Exception as e: + raise RuntimeError(f"Failed to compute rerank scores: {e!s}") from e + + # Create scored documents + scored_docs = [ + (doc_ids[i], id_to_doc[doc_ids[i]], scores[i]) for i in range(len(doc_ids)) + ] + + # Sort by score (descending) and take top-k + scored_docs.sort(key=lambda x: x[2], reverse=True) + top_scored_docs = scored_docs[: self.topn] + + # Build result list with updated scores + results: list[Doc] = [] + for _, doc, score in top_scored_docs: + new_doc = doc._replace(score=score) + results.append(new_doc) + + return results diff --git a/python/zvec/tool/util.py b/python/zvec/tool/util.py index a836876c..409a4d5b 100644 --- a/python/zvec/tool/util.py +++ b/python/zvec/tool/util.py @@ -59,5 +59,5 @@ def require_module(module: str, mitigation: Optional[str] = None) -> Any: else: msg += f"please pip install '{top_level}'." else: - msg += f"Please pip install '{package}." + msg += f"Please pip install '{package}'." raise ImportError(msg) from e From b83cf52be855fab84a467a914e6dbc40a93fdaa3 Mon Sep 17 00:00:00 2001 From: Cuiys Date: Thu, 12 Feb 2026 23:38:18 +0800 Subject: [PATCH 15/28] fix(py): py with build-in package typing not typing_extensions (#99) --- python/zvec/common/constants.py | 3 +-- python/zvec/extension/embedding_function.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/python/zvec/common/constants.py b/python/zvec/common/constants.py index c8da216c..6a1654df 100644 --- a/python/zvec/common/constants.py +++ b/python/zvec/common/constants.py @@ -13,10 +13,9 @@ # limitations under the License. 
from __future__ import annotations -from typing import Optional, Union +from typing import Optional, TypeVar, Union import numpy as np -from typing_extensions import TypeVar # VectorType: DenseVectorType | SparseVectorType DenseVectorType = Union[list[float], list[int], np.ndarray] diff --git a/python/zvec/extension/embedding_function.py b/python/zvec/extension/embedding_function.py index a58ba239..a421f1ec 100644 --- a/python/zvec/extension/embedding_function.py +++ b/python/zvec/extension/embedding_function.py @@ -14,8 +14,7 @@ from __future__ import annotations from abc import abstractmethod - -from typing_extensions import Protocol, runtime_checkable +from typing import Protocol, runtime_checkable from ..common.constants import MD, DenseVectorType, SparseVectorType From c79f0b0d7ea637323aac5bf75c52e4ce9e8392af Mon Sep 17 00:00:00 2001 From: Qinren Zhou Date: Fri, 13 Feb 2026 14:04:58 +0800 Subject: [PATCH 16/28] minor: add installation instruction for node.js package (#103) --- README.md | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 4a9f9611..96d91f51 100644 --- a/README.md +++ b/README.md @@ -30,25 +30,33 @@ ## 💫 Features - **Blazing Fast**: Searches billions of vectors in milliseconds. -- **Simple, Just Works**: Install with `pip install zvec` and start searching in seconds. No servers, no config, no fuss. +- **Simple, Just Works**: [Install](#-installation) and start searching in seconds. No servers, no config, no fuss. - **Dense + Sparse Vectors**: Work with both dense and sparse embeddings, with native support for multi-vector queries in a single call. - **Hybrid Search**: Combine semantic similarity with structured filters for precise results. - **Runs Anywhere**: As an in-process library, Zvec runs wherever your code runs — notebooks, servers, CLI tools, or even edge devices. 
## 📦 Installation -Install Zvec from PyPI with a single command: +### Python + +**Requirements**: Python 3.10 - 3.12 ```bash pip install zvec ``` -**Requirements**: +### Node.js + +```bash +npm install @zvec/zvec +``` + +### ✅ Supported Platforms + +- Linux (x86_64, ARM64) +- macOS (ARM64) -- Python 3.10 - 3.12 -- **Supported platforms**: - - Linux (x86_64/ARM64) - - macOS (ARM64) +### 🛠️ Building from Source If you prefer to build Zvec from source, please check the [Building from Source](https://zvec.org/en/docs/build/) guide. @@ -100,11 +108,11 @@ Stay updated and get support — scan or click:
💬 DingTalk
- + DingTalk QR Code
📱 WeChat
- + WeChat QR Code
🎮 Discord
From 42fa524f16ef5d4c729558a762d0f10bb97ae0d2 Mon Sep 17 00:00:00 2001 From: Cuiys Date: Fri, 13 Feb 2026 14:41:57 +0800 Subject: [PATCH 17/28] feat(ci): macos ci with github-runner (#94) --- .github/workflows/build_test_wheel.yml | 2 +- .github/workflows/linux_arm64_docker_ci.yml | 19 +++- .github/workflows/linux_x64_docker_ci.yml | 42 +++++--- .github/workflows/mac_arm64_ci.yml | 109 ++++++++++++-------- pyproject.toml | 11 +- tests/ailego/parallel/thread_queue_test.cc | 2 +- 6 files changed, 117 insertions(+), 68 deletions(-) diff --git a/.github/workflows/build_test_wheel.yml b/.github/workflows/build_test_wheel.yml index 65362db2..918a3da5 100644 --- a/.github/workflows/build_test_wheel.yml +++ b/.github/workflows/build_test_wheel.yml @@ -101,4 +101,4 @@ jobs: pip install --index-url https://test.pypi.org/simple/ zvec # Run a simple smoke test python -c "import zvec; print('Import OK:', zvec.__version__)" - shell: bash + shell: bash \ No newline at end of file diff --git a/.github/workflows/linux_arm64_docker_ci.yml b/.github/workflows/linux_arm64_docker_ci.yml index 5e02a95c..4e6b61cf 100644 --- a/.github/workflows/linux_arm64_docker_ci.yml +++ b/.github/workflows/linux_arm64_docker_ci.yml @@ -71,7 +71,16 @@ jobs: - name: Install dependencies run: | - ${{ env.PIP_BIN }} install --upgrade pip ruff==v0.14.4 clang-format==18.1.8 pybind11==3.0 pytest pytest-cov + ${{ env.PIP_BIN }} install --upgrade pip \ + ruff==v0.14.4 \ + clang-format==18.1.8 \ + pybind11==3.0 \ + cmake==3.30.0 \ + ninja==1.11.1 \ + pytest \ + pytest-cov \ + scikit-build-core \ + setuptools_scm shell: bash - name: Run Ruff Linter @@ -112,11 +121,11 @@ jobs: cd "$CLEAN_WORKSPACE" NPROC=$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 2) - ${{ env.PIP_BIN }} install cmake ninja - CMAKE_GENERATOR="Unix Makefiles" \ CMAKE_BUILD_PARALLEL_LEVEL="$NPROC" \ - ${{ env.PIP_BIN }} install -v . --config-settings='cmake.define.BUILD_TOOLS="ON"' + ${{ env.PIP_BIN }} install -v . 
\ + --no-build-isolation \ + --config-settings='cmake.define.BUILD_TOOLS="ON"' shell: bash - name: Run Python Tests with Coverage @@ -136,4 +145,4 @@ jobs: cd "$CLEAN_WORKSPACE/examples/c++" mkdir build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release make -j $(nproc) && ./db-example && ./core-example && ./ailego-example - shell: bash + shell: bash \ No newline at end of file diff --git a/.github/workflows/linux_x64_docker_ci.yml b/.github/workflows/linux_x64_docker_ci.yml index 2c5bb2fd..f1fc3c7d 100644 --- a/.github/workflows/linux_x64_docker_ci.yml +++ b/.github/workflows/linux_x64_docker_ci.yml @@ -46,6 +46,28 @@ jobs: echo "PIP_BIN=$PY_PATH/bin/pip" >> $GITHUB_ENV echo "CLANG_FORMATTER_BIN=$PY_PATH/bin/clang-format" >> $GITHUB_ENV $PY_PATH/bin/python --version + + # Set number of processors for parallel builds + NPROC=$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 2) + echo "NPROC=$NPROC" >> $GITHUB_ENV + echo "Using $NPROC parallel jobs for builds" + + # Add Python user base bin to PATH for pip-installed CLI tools + echo "$(python -c 'import site; print(site.USER_BASE)')/bin" >> $GITHUB_PATH + shell: bash + + - name: Install dependencies + run: | + ${{ env.PYTHON_BIN }} -m pip install --upgrade pip \ + ruff==v0.14.4 \ + clang-format==18.1.8 \ + pybind11==3.0 \ + cmake==3.30.0 \ + ninja==1.11.1 \ + pytest \ + pytest-cov \ + scikit-build-core \ + setuptools_scm shell: bash - name: Prepare clean build directory @@ -69,11 +91,6 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} shell: bash - - name: Install dependencies - run: | - ${{ env.PIP_BIN }} install --upgrade pip ruff==v0.14.4 clang-format==18.1.8 pybind11==3.0 pytest pytest-cov - shell: bash - - name: Run Ruff Linter run: | cd "$CLEAN_WORKSPACE" @@ -90,7 +107,6 @@ jobs: run: | cd "$CLEAN_WORKSPACE" - CPP_FILES=$(find . -type f \( -name "*.cpp" -o -name "*.h" -o -name "*.hpp" -o -name "*.cc" -o -name "*.cxx" \) \ ! -path "./build/*" \ ! 
-path "./tests/*" \ @@ -110,13 +126,11 @@ jobs: - name: Install Python dependencies and build package run: | cd "$CLEAN_WORKSPACE" - NPROC=$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 2) - - ${{ env.PIP_BIN }} install cmake ninja - CMAKE_GENERATOR="Unix Makefiles" \ CMAKE_BUILD_PARALLEL_LEVEL="$NPROC" \ - ${{ env.PIP_BIN }} install -v . --config-settings='cmake.define.BUILD_TOOLS="ON"' + ${{ env.PYTHON_BIN }} -m pip install -v . \ + --no-build-isolation \ + --config-settings='cmake.define.BUILD_TOOLS="ON"' shell: bash - name: Run Python Tests with Coverage @@ -128,12 +142,12 @@ jobs: - name: Run Cpp Tests run: | cd "$CLEAN_WORKSPACE/build" - make unittest -j$(nproc) + make unittest -j$NPROC shell: bash - name: Run Cpp Examples run: | cd "$CLEAN_WORKSPACE/examples/c++" mkdir build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release - make -j $(nproc) && ./db-example && ./core-example && ./ailego-example - shell: bash + make -j $NPROC && ./db-example && ./core-example && ./ailego-example + shell: bash \ No newline at end of file diff --git a/.github/workflows/mac_arm64_ci.yml b/.github/workflows/mac_arm64_ci.yml index 73aa9227..3d549c29 100644 --- a/.github/workflows/mac_arm64_ci.yml +++ b/.github/workflows/mac_arm64_ci.yml @@ -22,24 +22,66 @@ permissions: jobs: build: name: Zvec MacArm64 CI - runs-on: mac_m1_arm + runs-on: macos-15 + + strategy: + matrix: + python-version: ['3.10'] + fail-fast: false steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + cache-dependency-path: 'pyproject.toml' + + - name: Set up environment variables + run: | + # Set number of processors for parallel builds + NPROC=$(sysctl -n hw.ncpu 2>/dev/null || echo 2) + echo "NPROC=$NPROC" >> $GITHUB_ENV + echo "Using $NPROC parallel jobs for builds" + + # Add Python user base bin to PATH for pip-installed 
CLI tools + echo "$(python -c 'import site; print(site.USER_BASE)')/bin" >> $GITHUB_PATH + shell: bash + + - name: Install dependencies + run: | + python -m pip install --upgrade pip \ + ruff==v0.14.4 \ + clang-format==18.1.8 \ + pybind11==3.0 \ + cmake==3.30.0 \ + ninja==1.11.1 \ + pytest \ + pytest-cov \ + scikit-build-core \ + setuptools_scm + shell: bash + - name: Run Ruff Linter run: | - cd "$CLEAN_WORKSPACE" - ruff check . + cd "$GITHUB_WORKSPACE" + python -m ruff check . shell: bash - - name: Run Ruff Formatter Check (ensure code is formatted) + - name: Run Ruff Formatter Check run: | - cd "$CLEAN_WORKSPACE" - ruff format --check . + cd "$GITHUB_WORKSPACE" + python -m ruff format --check . shell: bash - name: Run clang-format Check run: | - cd "$CLEAN_WORKSPACE" + cd "$GITHUB_WORKSPACE" CPP_FILES=$(find . -type f \( -name "*.cpp" -o -name "*.h" -o -name "*.hpp" -o -name "*.cc" -o -name "*.cxx" \) \ ! -path "./build/*" \ @@ -57,57 +99,34 @@ jobs: clang-format --dry-run --Werror $CPP_FILES shell: bash - - name: Prepare clean build directory + - name: Build from source run: | - export CLEAN_WORKSPACE="/tmp/zvec" - rm -rf "$CLEAN_WORKSPACE" - mkdir -p "$CLEAN_WORKSPACE" - cd "$CLEAN_WORKSPACE" - - git config --global --add safe.directory "$CLEAN_WORKSPACE" - git clone --recursive "https://x-access-token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git" . 
- - if [ -n "${{ github.event.number }}" ]; then - git fetch origin "pull/${{ github.event.number }}/head" - git checkout FETCH_HEAD - else - git checkout "${{ github.sha }}" - fi - - echo "CLEAN_WORKSPACE=$CLEAN_WORKSPACE" >> $GITHUB_ENV - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - shell: bash - - - name: Install Python dependencies and build package - run: | - cd "$CLEAN_WORKSPACE" - pip install --upgrade pip pytest pytest-cov ruff - - NPROC=$(nproc 2>/dev/null || echo $(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 2)) - echo "ParallelGroup: Using $NPROC parallel jobs for build" + cd "$GITHUB_WORKSPACE" CMAKE_GENERATOR="Unix Makefiles" \ CMAKE_BUILD_PARALLEL_LEVEL="$NPROC" \ - pip install -v . \ + python -m pip install -v . \ + --no-build-isolation \ --config-settings='cmake.define.BUILD_TOOLS="ON"' shell: bash - - name: Run Python Tests with Coverage + - name: Run Cpp Tests run: | - cd "$CLEAN_WORKSPACE" - python -m pytest python/tests/ --cov=zvec --cov-report=xml --no-cov-on-fail + cd "$GITHUB_WORKSPACE/build" + make unittest -j$NPROC shell: bash - - name: Run Cpp Tests with Coverage + - name: Run Python Tests with Coverage run: | - cd "$CLEAN_WORKSPACE/build" - make unittest -j 16 + cd "$GITHUB_WORKSPACE" + python -m pytest python/tests/ --cov=zvec --cov-report=xml --no-cov-on-fail shell: bash + - name: Run Cpp Examples run: | - cd "$CLEAN_WORKSPACE/examples/c++" - mkdir build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release - make -j 16 && ./db-example && ./core-example && ./ailego-example + cd "$GITHUB_WORKSPACE/examples/c++" + mkdir build && cd build + cmake .. 
-DCMAKE_BUILD_TYPE=Release + make -j $NPROC && ./db-example && ./core-example && ./ailego-example shell: bash diff --git a/pyproject.toml b/pyproject.toml index de147145..d77eeab2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,7 +80,13 @@ dev = [ # BUILD SYSTEM CONFIGURATION (scikit-build-core) ###################################################################################################### [build-system] -requires = ["scikit-build-core >=0.11", "pybind11 >=3.0", "setuptools_scm>=8.0"] +requires = [ + "scikit-build-core >=0.11", + "pybind11 >=3.0", + "setuptools_scm>=8.0", + "cmake>=3.26,<4.0", + "ninja>=1.11", +] build-backend = "scikit_build_core.build" [tool.scikit-build] @@ -165,7 +171,8 @@ archs = ["auto"] test-command = "cd {project} && pytest python/tests -v --tb=short" manylinux-x86_64-image = "manylinux_2_28" manylinux-aarch64-image = "manylinux_2_28" -skip = "*musllinux*" +# Skip 32-bit builds and musllinux +skip = ["*-manylinux_i686", "*-musllinux*"] [tool.cibuildwheel.macos] archs = ["arm64"] diff --git a/tests/ailego/parallel/thread_queue_test.cc b/tests/ailego/parallel/thread_queue_test.cc index 6a18b4ee..a7000181 100644 --- a/tests/ailego/parallel/thread_queue_test.cc +++ b/tests/ailego/parallel/thread_queue_test.cc @@ -103,7 +103,7 @@ TEST(ThreadQueue, MultiThreadWithHighPriority) { } // Wait for all tasks to complete - std::this_thread::sleep_for(std::chrono::seconds(1)); + std::this_thread::sleep_for(std::chrono::seconds(3)); EXPECT_EQ(count, 1000); EXPECT_EQ(high_priority_count, 1000); From 34e7cedfa4cae6887d8f7a1869255544b35828c4 Mon Sep 17 00:00:00 2001 From: Qinren Zhou Date: Fri, 13 Feb 2026 14:54:21 +0800 Subject: [PATCH 18/28] minor: add links to package repository --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 96d91f51..6b87b25b 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ ## 📦 Installation -### Python +### [Python](https://pypi.org/project/zvec/) 
**Requirements**: Python 3.10 - 3.12 @@ -45,7 +45,7 @@ pip install zvec ``` -### Node.js +### [Node.js](https://www.npmjs.com/package/@zvec/zvec) ```bash npm install @zvec/zvec @@ -72,7 +72,7 @@ schema = zvec.CollectionSchema( ) # Create collection -collection = zvec.create_and_open(path="./zvec_example", schema=schema,) +collection = zvec.create_and_open(path="./zvec_example", schema=schema) # Insert documents collection.insert([ From a4f3de893a39da6b8631bf6749a334a82dd5f4c1 Mon Sep 17 00:00:00 2001 From: feihongxu0824 Date: Sun, 15 Feb 2026 22:46:27 +0800 Subject: [PATCH 19/28] chore: add trend badge (#132) --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 6b87b25b..cd9f6866 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,10 @@ License

+

+ alibaba%2Fzvec | Trendshift +

+

🚀 Quickstart | 🏠 Home | From d72a074530a26dba74466d3c1b2e0828ad1a2956 Mon Sep 17 00:00:00 2001 From: ALEXANDRE JUNIO CANUTO LOPES Date: Mon, 16 Feb 2026 22:42:16 -0300 Subject: [PATCH 20/28] docs: fix repository URL in CONTRIBUTING.md (#139) Changed 'your-org' placeholder to 'alibaba' in the git clone command to reflect the correct repository URL. --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index deff5af9..bf8b9545 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,7 +19,7 @@ By participating, you agree to abide by our [Code of Conduct](CODE_OF_CONDUCT.md ### Clone & Initialize ```bash -git clone --recursive https://github.com/your-org/zvec.git +git clone --recursive https://github.com/alibaba/zvec.git cd zvec ``` From 39f04379f2156f68a52514171352dd86b11958c3 Mon Sep 17 00:00:00 2001 From: Maxime Grenu <69890511+cluster2600@users.noreply.github.com> Date: Fri, 20 Feb 2026 03:06:48 +0100 Subject: [PATCH 21/28] fix(docs): fix typo in README align attr and Python version in CONTRIBUTING (#150) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - README.md: remove spurious space in align=" center" → align="center" (logo was not centered on GitHub due to invalid HTML attribute value) - CONTRIBUTING.md: correct Python prerequisite from '>= 3.9' to '3.10 - 3.12' to match pyproject.toml classifiers and CI matrix (cp310, cp312) --- CONTRIBUTING.md | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bf8b9545..625ab54a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,7 +12,7 @@ By participating, you agree to abide by our [Code of Conduct](CODE_OF_CONDUCT.md ## Development Setup ### Prerequisites -- Python ≥ 3.9 +- Python 3.10 - 3.12 - CMake ≥ 3.26, < 4.0 (`cmake --version`) - A C++17-compatible compiler (e.g., `g++-11+`, `clang++`, Apple Clang on macOS) diff --git 
a/README.md b/README.md index cd9f6866..e3fa5352 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -

+
zvec logo From e95619259325b00edbd9736a4d0f73697e6a089b Mon Sep 17 00:00:00 2001 From: Jalin Wang Date: Wed, 25 Feb 2026 11:22:18 +0800 Subject: [PATCH 22/28] ci: continuous benching (#110) --- .github/workflows/continuous_bench.yml | 26 ++++++++ .github/workflows/scripts/run_vdb.sh | 88 ++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 .github/workflows/continuous_bench.yml create mode 100644 .github/workflows/scripts/run_vdb.sh diff --git a/.github/workflows/continuous_bench.yml b/.github/workflows/continuous_bench.yml new file mode 100644 index 00000000..ecf35aa9 --- /dev/null +++ b/.github/workflows/continuous_bench.yml @@ -0,0 +1,26 @@ +name: Continuous Benchmark +on: + push: + branches: [ "main", "ci/continuous_bench_squash" ] + paths-ignore: + - '**.md' + workflow_dispatch: + +concurrency: + group: cb-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + benchmark: + runs-on: vdbbench + steps: + - uses: actions/checkout@v4 + + - name: Run VectorDBBench + env: + DATABASE_URL: ${{ secrets.DATABASE_URL }} + run: | + bash .github/workflows/scripts/run_vdb.sh \ No newline at end of file diff --git a/.github/workflows/scripts/run_vdb.sh b/.github/workflows/scripts/run_vdb.sh new file mode 100644 index 00000000..f153a598 --- /dev/null +++ b/.github/workflows/scripts/run_vdb.sh @@ -0,0 +1,88 @@ +set -e + +QUANTIZE_TYPE_LIST="int8 int4 fp16 fp32" +CASE_TYPE_LIST="Performance768D1M Performance768D10M Performance1536D500K" # respectively test cosine, ip # Performance960D1M l2 metrics +LOG_FILE="bench.log" +DATE=$(date +%Y-%m-%d_%H-%M-%S) +NPROC=$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 2) + +# COMMIT_ID = branch-date-sha +COMMIT_ID=${GITHUB_REF_NAME}-"$DATE"-$(echo ${GITHUB_WORKFLOW_SHA} | cut -c1-8) +COMMIT_ID=$(echo "$COMMIT_ID" | sed 's/\//_/g') +echo "COMMIT_ID: $COMMIT_ID" +echo "GITHUB_WORKFLOW_SHA: $GITHUB_WORKFLOW_SHA" +echo "workspace: 
$GITHUB_WORKSPACE" +DB_LABEL_PREFIX="Zvec16c64g-$COMMIT_ID" + +# install zvec +git submodule update --init + +# for debug +#cd .. +#export SKBUILD_BUILD_DIR="$GITHUB_WORKSPACE/../build" +pwd + +python3 -m venv .venv +source .venv/bin/activate +pip install cmake ninja psycopg2-binary loguru fire +pip install -e /opt/VectorDBBench + +CMAKE_GENERATOR="Unix Makefiles" \ +CMAKE_BUILD_PARALLEL_LEVEL="$NPROC" \ +pip install -v "$GITHUB_WORKSPACE" + +for CASE_TYPE in $CASE_TYPE_LIST; do + echo "Running VectorDBBench for $CASE_TYPE" + DATASET_DESC="" + if [ "$CASE_TYPE" == "Performance768D1M" ]; then + DATASET_DESC="Performance768D1M - Cohere Cosine" + elif [ "$CASE_TYPE" == "Performance768D10M" ]; then + DATASET_DESC="Performance768D10M - Cohere Cosine" + else + DATASET_DESC="Performance1536D500K - OpenAI IP" + fi + + for QUANTIZE_TYPE in $QUANTIZE_TYPE_LIST; do + DB_LABEL="$DB_LABEL_PREFIX-$CASE_TYPE-$QUANTIZE_TYPE" + echo "Running VectorDBBench for $DB_LABEL" + + VDB_PARAMS="--path ${DB_LABEL} --db-label ${DB_LABEL} --case-type ${CASE_TYPE} --num-concurrency 12,14,16,18,20" + if [ "$CASE_TYPE" == "Performance768D1M" ]; then + VDB_PARAMS="${VDB_PARAMS} --m 15 --ef-search 180" + elif [ "$CASE_TYPE" == "Performance768D10M" ]; then + VDB_PARAMS="${VDB_PARAMS} --m 50 --ef-search 118 --is-using-refiner" + else #Performance1536D500K using default params + refiner to monitor performance degradation + VDB_PARAMS="${VDB_PARAMS} --m 50 --ef-search 100 --is-using-refiner" + fi + + if [ "$QUANTIZE_TYPE" == "fp32" ]; then + vectordbbench zvec ${VDB_PARAMS} 2>&1 | tee $LOG_FILE + else + vectordbbench zvec ${VDB_PARAMS} --quantize-type "${QUANTIZE_TYPE}" 2>&1 | tee $LOG_FILE + fi + + RESULT_JSON_PATH=$(grep -o "/opt/VectorDBBench/.*\.json" $LOG_FILE) + QPS=$(jq -r '.results[0].metrics.qps' "$RESULT_JSON_PATH") + RECALL=$(jq -r '.results[0].metrics.recall' "$RESULT_JSON_PATH") + LATENCY_P99=$(jq -r '.results[0].metrics.serial_latency_p99' "$RESULT_JSON_PATH") + LOAD_DURATION=$(jq -r 
'.results[0].metrics.load_duration' "$RESULT_JSON_PATH") + + #quote the var to avoid space in the label + label_list="case_type=\"${CASE_TYPE}\",dataset_desc=\"${DATASET_DESC}\",db_label=\"${DB_LABEL}\",commit=\"${COMMIT_ID}\",date=\"${DATE}\",quantize_type=\"${QUANTIZE_TYPE}\"" + # replace `/` with `_` in label_list + label_list=$(echo "$label_list" | sed 's/\//_/g') + cat < prom_metrics.txt + # TYPE vdb_bench_qps gauge + vdb_bench_qps{$label_list} $QPS + # TYPE vdb_bench_recall gauge + vdb_bench_recall{$label_list} $RECALL + # TYPE vdb_bench_latency_p99 gauge + vdb_bench_latency_p99{$label_list} $LATENCY_P99 + # TYPE vdb_bench_load_duration gauge + vdb_bench_load_duration{$label_list} $LOAD_DURATION +EOF + echo "prom_metrics:" + cat prom_metrics.txt + curl --data-binary @prom_metrics.txt "http://47.93.34.27:9091/metrics/job/benchmarks-${CASE_TYPE}/case_type/${CASE_TYPE}/quantize_type/${QUANTIZE_TYPE}" -v + done +done \ No newline at end of file From 779c63d0d98cb23f2f59f61bc3f89c8f5436326b Mon Sep 17 00:00:00 2001 From: Cuiys Date: Wed, 25 Feb 2026 13:54:21 +0800 Subject: [PATCH 23/28] docs: adjust join us in the readme. (#168) * docs: remove x.com in join us * docs: join discord with widget --- README.md | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index e3fa5352..bfdd30e5 100644 --- a/README.md +++ b/README.md @@ -108,30 +108,14 @@ For detailed benchmark methodology, configurations, and complete results, please Stay updated and get support — scan or click: - - - - - - - -
-
💬 DingTalk
- DingTalk QR Code -
-
📱 WeChat
- WeChat QR Code -
-
🎮 Discord
- - Join Server - -
-
🐦 X (Twitter)
- - Follow @zvec_ai - -
+
+ +| 💬 DingTalk | 📱 WeChat | 🎮 Discord | +|:---:|:---:|:---:| +| | | [![Discord](https://img.shields.io/badge/Discord-Join%20Server-5865F2?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/rKddFBBu9z) | +| Scan to join | Scan to join | Click to join | + +
From a7c6aa19f152af09413d9ea3af77ee7496d64a08 Mon Sep 17 00:00:00 2001 From: Jason Yao <940334249@qq.com> Date: Wed, 25 Feb 2026 18:03:20 +0800 Subject: [PATCH 24/28] chore: enable the conventional-pre-commit run sucess and update to latest version (#111) --- .pre-commit-config.yaml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index abe63c6b..39808c89 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,7 @@ +default_install_hook_types: + - pre-commit + - commit-msg + repos: - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.14.4 @@ -31,14 +35,11 @@ repos: - repo: https://github.com/compilerla/conventional-pre-commit - rev: v3.0.0 + rev: v4.3.0 hooks: - id: conventional-pre-commit - stages: [ commit-msg ] - args: [ - --types, feat,fix,docs,style,refactor,test,chore,perf,ci,build,revert, - --scope-optional - ] + stages: [commit-msg] + args: [--verbose] - repo: local From fc988e395733cf42434b5c688b657aa6e713a8e8 Mon Sep 17 00:00:00 2001 From: Salman Chishti Date: Wed, 25 Feb 2026 10:09:55 +0000 Subject: [PATCH 25/28] Upgrade GitHub Actions for Node 24 compatibility (#129) Signed-off-by: Salman Muin Kayser Chishti <13schishti@gmail.com> --- .github/workflows/build_test_wheel.yml | 8 ++++---- .github/workflows/build_wheel.yml | 8 ++++---- .github/workflows/mac_arm64_ci.yml | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build_test_wheel.yml b/.github/workflows/build_test_wheel.yml index 918a3da5..8636d5e2 100644 --- a/.github/workflows/build_test_wheel.yml +++ b/.github/workflows/build_test_wheel.yml @@ -13,12 +13,12 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: submodules: recursive - name: Set up Python (for cibuildwheel controller) - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.11' @@ -61,12 +61,12 @@ jobs: 
steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: submodules: recursive - name: Set up Python (for cibuildwheel controller) - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.11' diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml index b56af990..21cf3c40 100644 --- a/.github/workflows/build_wheel.yml +++ b/.github/workflows/build_wheel.yml @@ -13,12 +13,12 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: submodules: recursive - name: Set up Python (for cibuildwheel controller) - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.11' @@ -63,12 +63,12 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: submodules: recursive - name: Set up Python (for cibuildwheel controller) - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.11' diff --git a/.github/workflows/mac_arm64_ci.yml b/.github/workflows/mac_arm64_ci.yml index 3d549c29..5437000b 100644 --- a/.github/workflows/mac_arm64_ci.yml +++ b/.github/workflows/mac_arm64_ci.yml @@ -31,12 +31,12 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: submodules: recursive - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} cache: 'pip' From 49e2d349758900605f7519aff1846e03cb342b0a Mon Sep 17 00:00:00 2001 From: Salman Chishti Date: Wed, 25 Feb 2026 11:01:39 +0000 Subject: [PATCH 26/28] Upgrade GitHub Actions to latest versions (#130) Signed-off-by: Salman Muin Kayser Chishti <13schishti@gmail.com> --- .github/workflows/nightly_coverage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly_coverage.yml b/.github/workflows/nightly_coverage.yml index b9642716..dcba2d5d 100644 
--- a/.github/workflows/nightly_coverage.yml +++ b/.github/workflows/nightly_coverage.yml @@ -91,7 +91,7 @@ jobs: shell: bash - name: Upload Coverage to Codecov - uses: codecov/codecov-action@v4 + uses: codecov/codecov-action@v5 with: files: ${{ env.CLEAN_WORKSPACE }}/proxima-zvec-filtered.lcov.info,${{ env.CLEAN_WORKSPACE }}/coverage.xml flags: python,cpp,nightly From 1dfeda64aef39b4d45120a8ebdaa55208f2e832f Mon Sep 17 00:00:00 2001 From: Cuiys Date: Thu, 26 Feb 2026 10:47:55 +0800 Subject: [PATCH 27/28] feat(ci): ci workflow with github-hosted runner (#171) * feat: linux ci with github runner * fix: add CFLAGS with -march=native * fix: fix linux x86 ci * fix: with zen3 in x86 ci * feat: nightly report with github-runner * feat: refactor ci workflow * feat: rename workflow * feat: rename job * feat: update ci badge * feat: remove python coverage --- .github/workflows/continuous_bench.yml | 2 +- .github/workflows/linux_arm64_docker_ci.yml | 148 ----------------- .github/workflows/linux_x64_docker_ci.yml | 153 ------------------ .../workflows/{mac_arm64_ci.yml => main.yml} | 133 +++++++++------ .github/workflows/nightly_coverage.yml | 86 +++++----- README.md | 8 +- 6 files changed, 126 insertions(+), 404 deletions(-) delete mode 100644 .github/workflows/linux_arm64_docker_ci.yml delete mode 100644 .github/workflows/linux_x64_docker_ci.yml rename .github/workflows/{mac_arm64_ci.yml => main.yml} (56%) diff --git a/.github/workflows/continuous_bench.yml b/.github/workflows/continuous_bench.yml index ecf35aa9..34fe527e 100644 --- a/.github/workflows/continuous_bench.yml +++ b/.github/workflows/continuous_bench.yml @@ -17,7 +17,7 @@ jobs: benchmark: runs-on: vdbbench steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Run VectorDBBench env: diff --git a/.github/workflows/linux_arm64_docker_ci.yml b/.github/workflows/linux_arm64_docker_ci.yml deleted file mode 100644 index 4e6b61cf..00000000 --- a/.github/workflows/linux_arm64_docker_ci.yml +++ 
/dev/null @@ -1,148 +0,0 @@ -name: Zvec LinuxARM64 CI - -on: - push: - branches: [ "main" ] - paths-ignore: - - '**.md' - merge_group: - pull_request: - branches: [ "main" ] - paths-ignore: - - '**.md' - workflow_dispatch: - -concurrency: - group: pr-${{ github.workflow }}-${{ github.event.pull_request.number }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - build: - name: Zvec LinuxARM64 CI - runs-on: linux_arm64 - - strategy: - matrix: - python-version: ['3.10'] - fail-fast: false - - container: - image: quay.io/pypa/manylinux_2_28_aarch64:2024-03-10-4935fcc - options: --user root - - steps: - - name: Set up Python path for manylinux - run: | - case "${{ matrix.python-version }}" in - "3.10") PY_PATH="/opt/python/cp310-cp310" ;; - "3.11") PY_PATH="/opt/python/cp311-cp311" ;; - "3.12") PY_PATH="/opt/python/cp312-cp312" ;; - *) echo "Unsupported Python version: ${{ matrix.python-version }}"; exit 1 ;; - esac - echo "PYTHON_BIN=$PY_PATH/bin/python" >> $GITHUB_ENV - echo "PIP_BIN=$PY_PATH/bin/pip" >> $GITHUB_ENV - echo "CLANG_FORMATTER_BIN=$PY_PATH/bin/clang-format" >> $GITHUB_ENV - $PY_PATH/bin/python --version - shell: bash - - - name: Prepare clean build directory - run: | - export CLEAN_WORKSPACE="/tmp/zvec" - mkdir -p "$CLEAN_WORKSPACE" - cd "$CLEAN_WORKSPACE" - - git config --global --add safe.directory "$CLEAN_WORKSPACE" - git clone --recursive "https://x-access-token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git" . 
- - if [ -n "${{ github.event.number }}" ]; then - git fetch origin "pull/${{ github.event.number }}/head" - git checkout FETCH_HEAD - else - git checkout "${{ github.sha }}" - fi - - echo "CLEAN_WORKSPACE=$CLEAN_WORKSPACE" >> $GITHUB_ENV - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - shell: bash - - - name: Install dependencies - run: | - ${{ env.PIP_BIN }} install --upgrade pip \ - ruff==v0.14.4 \ - clang-format==18.1.8 \ - pybind11==3.0 \ - cmake==3.30.0 \ - ninja==1.11.1 \ - pytest \ - pytest-cov \ - scikit-build-core \ - setuptools_scm - shell: bash - - - name: Run Ruff Linter - run: | - cd "$CLEAN_WORKSPACE" - ${{ env.PYTHON_BIN }} -m ruff check . - shell: bash - - - name: Run Ruff Formatter Check - run: | - cd "$CLEAN_WORKSPACE" - ${{ env.PYTHON_BIN }} -m ruff format --check . - shell: bash - - - name: Run clang-format Check - run: | - cd "$CLEAN_WORKSPACE" - - - CPP_FILES=$(find . -type f \( -name "*.cpp" -o -name "*.h" -o -name "*.hpp" -o -name "*.cc" -o -name "*.cxx" \) \ - ! -path "./build/*" \ - ! -path "./tests/*" \ - ! -path "./scripts/*" \ - ! -path "./python/*" \ - ! -path "./thirdparty/*" \ - ! -path "./.git/*") - - if [ -z "$CPP_FILES" ]; then - echo "No C++ files found to check." - exit 0 - fi - - ${{ env.CLANG_FORMATTER_BIN }} --dry-run --Werror $CPP_FILES - shell: bash - - - name: Install Python dependencies and build package - run: | - cd "$CLEAN_WORKSPACE" - NPROC=$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 2) - - CMAKE_GENERATOR="Unix Makefiles" \ - CMAKE_BUILD_PARALLEL_LEVEL="$NPROC" \ - ${{ env.PIP_BIN }} install -v . 
\ - --no-build-isolation \ - --config-settings='cmake.define.BUILD_TOOLS="ON"' - shell: bash - - - name: Run Python Tests with Coverage - run: | - cd "$CLEAN_WORKSPACE" - ${{ env.PYTHON_BIN }} -m pytest python/tests/ --cov=zvec --cov-report=xml --no-cov-on-fail - shell: bash - - - name: Run Cpp Tests - run: | - cd "$CLEAN_WORKSPACE/build" - make unittest -j$(nproc) - shell: bash - - - name: Run Cpp Examples - run: | - cd "$CLEAN_WORKSPACE/examples/c++" - mkdir build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release - make -j $(nproc) && ./db-example && ./core-example && ./ailego-example - shell: bash \ No newline at end of file diff --git a/.github/workflows/linux_x64_docker_ci.yml b/.github/workflows/linux_x64_docker_ci.yml deleted file mode 100644 index f1fc3c7d..00000000 --- a/.github/workflows/linux_x64_docker_ci.yml +++ /dev/null @@ -1,153 +0,0 @@ -name: Zvec LinuxX64 CI - -on: - push: - branches: [ "main" ] - paths-ignore: - - '**.md' - merge_group: - pull_request: - branches: [ "main" ] - paths-ignore: - - '**.md' - workflow_dispatch: - -concurrency: - group: pr-${{ github.workflow }}-${{ github.event.pull_request.number }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - build: - name: Zvec LinuxX64 CI - runs-on: linux_x64 - - strategy: - matrix: - python-version: ['3.10'] - fail-fast: false - - container: - image: quay.io/pypa/manylinux_2_28_x86_64:2024-03-10-4935fcc - options: --user root - - steps: - - name: Set up Python path for manylinux - run: | - case "${{ matrix.python-version }}" in - "3.10") PY_PATH="/opt/python/cp310-cp310" ;; - "3.11") PY_PATH="/opt/python/cp311-cp311" ;; - "3.12") PY_PATH="/opt/python/cp312-cp312" ;; - *) echo "Unsupported Python version: ${{ matrix.python-version }}"; exit 1 ;; - esac - echo "PYTHON_BIN=$PY_PATH/bin/python" >> $GITHUB_ENV - echo "PIP_BIN=$PY_PATH/bin/pip" >> $GITHUB_ENV - echo "CLANG_FORMATTER_BIN=$PY_PATH/bin/clang-format" >> $GITHUB_ENV - $PY_PATH/bin/python --version - - # Set number 
of processors for parallel builds - NPROC=$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 2) - echo "NPROC=$NPROC" >> $GITHUB_ENV - echo "Using $NPROC parallel jobs for builds" - - # Add Python user base bin to PATH for pip-installed CLI tools - echo "$(python -c 'import site; print(site.USER_BASE)')/bin" >> $GITHUB_PATH - shell: bash - - - name: Install dependencies - run: | - ${{ env.PYTHON_BIN }} -m pip install --upgrade pip \ - ruff==v0.14.4 \ - clang-format==18.1.8 \ - pybind11==3.0 \ - cmake==3.30.0 \ - ninja==1.11.1 \ - pytest \ - pytest-cov \ - scikit-build-core \ - setuptools_scm - shell: bash - - - name: Prepare clean build directory - run: | - export CLEAN_WORKSPACE="/tmp/zvec" - mkdir -p "$CLEAN_WORKSPACE" - cd "$CLEAN_WORKSPACE" - - git config --global --add safe.directory "$CLEAN_WORKSPACE" - git clone --recursive "https://x-access-token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git" . - - if [ -n "${{ github.event.number }}" ]; then - git fetch origin "pull/${{ github.event.number }}/head" - git checkout FETCH_HEAD - else - git checkout "${{ github.sha }}" - fi - - echo "CLEAN_WORKSPACE=$CLEAN_WORKSPACE" >> $GITHUB_ENV - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - shell: bash - - - name: Run Ruff Linter - run: | - cd "$CLEAN_WORKSPACE" - ${{ env.PYTHON_BIN }} -m ruff check . - shell: bash - - - name: Run Ruff Formatter Check - run: | - cd "$CLEAN_WORKSPACE" - ${{ env.PYTHON_BIN }} -m ruff format --check . - shell: bash - - - name: Run clang-format Check - run: | - cd "$CLEAN_WORKSPACE" - - CPP_FILES=$(find . -type f \( -name "*.cpp" -o -name "*.h" -o -name "*.hpp" -o -name "*.cc" -o -name "*.cxx" \) \ - ! -path "./build/*" \ - ! -path "./tests/*" \ - ! -path "./scripts/*" \ - ! -path "./python/*" \ - ! -path "./thirdparty/*" \ - ! -path "./.git/*") - - if [ -z "$CPP_FILES" ]; then - echo "No C++ files found to check." 
- exit 0 - fi - - ${{ env.CLANG_FORMATTER_BIN }} --dry-run --Werror $CPP_FILES - shell: bash - - - name: Install Python dependencies and build package - run: | - cd "$CLEAN_WORKSPACE" - CMAKE_GENERATOR="Unix Makefiles" \ - CMAKE_BUILD_PARALLEL_LEVEL="$NPROC" \ - ${{ env.PYTHON_BIN }} -m pip install -v . \ - --no-build-isolation \ - --config-settings='cmake.define.BUILD_TOOLS="ON"' - shell: bash - - - name: Run Python Tests with Coverage - run: | - cd "$CLEAN_WORKSPACE" - ${{ env.PYTHON_BIN }} -m pytest python/tests/ --cov=zvec --cov-report=xml --no-cov-on-fail - shell: bash - - - name: Run Cpp Tests - run: | - cd "$CLEAN_WORKSPACE/build" - make unittest -j$NPROC - shell: bash - - - name: Run Cpp Examples - run: | - cd "$CLEAN_WORKSPACE/examples/c++" - mkdir build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release - make -j $NPROC && ./db-example && ./core-example && ./ailego-example - shell: bash \ No newline at end of file diff --git a/.github/workflows/mac_arm64_ci.yml b/.github/workflows/main.yml similarity index 56% rename from .github/workflows/mac_arm64_ci.yml rename to .github/workflows/main.yml index 5437000b..a045c6d6 100644 --- a/.github/workflows/mac_arm64_ci.yml +++ b/.github/workflows/main.yml @@ -1,4 +1,4 @@ -name: Zvec MacArm64 CI +name: Main on: push: @@ -13,76 +13,45 @@ on: workflow_dispatch: concurrency: - group: pr-${{ github.workflow }}-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} cancel-in-progress: true permissions: contents: read jobs: - build: - name: Zvec MacArm64 CI - runs-on: macos-15 - - strategy: - matrix: - python-version: ['3.10'] - fail-fast: false - + # Code quality checks (fast, run first) + lint: + name: Code Quality Checks + runs-on: ubuntu-24.04 steps: - name: Checkout code uses: actions/checkout@v6 - with: - submodules: recursive - name: Set up Python uses: 
actions/setup-python@v6 with: - python-version: ${{ matrix.python-version }} + python-version: '3.10' cache: 'pip' cache-dependency-path: 'pyproject.toml' - - name: Set up environment variables - run: | - # Set number of processors for parallel builds - NPROC=$(sysctl -n hw.ncpu 2>/dev/null || echo 2) - echo "NPROC=$NPROC" >> $GITHUB_ENV - echo "Using $NPROC parallel jobs for builds" - - # Add Python user base bin to PATH for pip-installed CLI tools - echo "$(python -c 'import site; print(site.USER_BASE)')/bin" >> $GITHUB_PATH - shell: bash - - - name: Install dependencies + - name: Install linting tools run: | python -m pip install --upgrade pip \ ruff==v0.14.4 \ - clang-format==18.1.8 \ - pybind11==3.0 \ - cmake==3.30.0 \ - ninja==1.11.1 \ - pytest \ - pytest-cov \ - scikit-build-core \ - setuptools_scm + clang-format==18.1.8 shell: bash - name: Run Ruff Linter - run: | - cd "$GITHUB_WORKSPACE" - python -m ruff check . + run: python -m ruff check . shell: bash - name: Run Ruff Formatter Check - run: | - cd "$GITHUB_WORKSPACE" - python -m ruff format --check . + run: python -m ruff format --check . shell: bash - name: Run clang-format Check run: | - cd "$GITHUB_WORKSPACE" - CPP_FILES=$(find . -type f \( -name "*.cpp" -o -name "*.h" -o -name "*.hpp" -o -name "*.cc" -o -name "*.cxx" \) \ ! -path "./build/*" \ ! -path "./tests/*" \ @@ -99,6 +68,67 @@ jobs: clang-format --dry-run --Werror $CPP_FILES shell: bash + # Build and test matrix (parallel execution) + build-and-test: + name: Build & Test (${{ matrix.platform }}) + needs: lint + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + include: + - os: macos-15 + platform: macos-arm64 + arch_flag: "" # ARM64 uses auto-detection + - os: ubuntu-24.04-arm + platform: linux-arm64 + arch_flag: "" # ARM64 uses auto-detection + - os: ubuntu-24.04 + platform: linux-x64 + # FIXME: ENABLE_ZEN3 is hardcoded for the current GitHub-hosted runner (AMD EPYC 7T83). + # This should be removed once #101 is resolved. 
+ arch_flag: "--config-settings='cmake.define.ENABLE_ZEN3=\"ON\"'" + + steps: + - name: Checkout code + uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: '3.10' + cache: 'pip' + cache-dependency-path: 'pyproject.toml' + + - name: Set up environment variables + run: | + # Set number of processors for parallel builds + if [[ "${{ matrix.platform }}" == "macos-arm64" ]]; then + NPROC=$(sysctl -n hw.ncpu 2>/dev/null || echo 2) + else + NPROC=$(nproc 2>/dev/null || echo 2) + fi + echo "NPROC=$NPROC" >> $GITHUB_ENV + echo "Using $NPROC parallel jobs for builds" + + # Add Python user base bin to PATH for pip-installed CLI tools + echo "$(python -c 'import site; print(site.USER_BASE)')/bin" >> $GITHUB_PATH + shell: bash + + - name: Install dependencies + run: | + python -m pip install --upgrade pip \ + pybind11==3.0 \ + cmake==3.30.0 \ + ninja==1.11.1 \ + pytest \ + scikit-build-core \ + setuptools_scm + shell: bash + - name: Build from source run: | cd "$GITHUB_WORKSPACE" @@ -107,26 +137,29 @@ jobs: CMAKE_BUILD_PARALLEL_LEVEL="$NPROC" \ python -m pip install -v . \ --no-build-isolation \ - --config-settings='cmake.define.BUILD_TOOLS="ON"' + --config-settings='cmake.define.BUILD_TOOLS="ON"' \ + ${{ matrix.arch_flag }} shell: bash - - name: Run Cpp Tests + - name: Run C++ Tests run: | cd "$GITHUB_WORKSPACE/build" make unittest -j$NPROC shell: bash - - name: Run Python Tests with Coverage + - name: Run Python Tests run: | cd "$GITHUB_WORKSPACE" - python -m pytest python/tests/ --cov=zvec --cov-report=xml --no-cov-on-fail + python -m pytest python/tests/ shell: bash - - - name: Run Cpp Examples + - name: Run C++ Examples run: | cd "$GITHUB_WORKSPACE/examples/c++" mkdir build && cd build cmake .. 
-DCMAKE_BUILD_TYPE=Release - make -j $NPROC && ./db-example && ./core-example && ./ailego-example - shell: bash + make -j $NPROC + ./db-example + ./core-example + ./ailego-example + shell: bash \ No newline at end of file diff --git a/.github/workflows/nightly_coverage.yml b/.github/workflows/nightly_coverage.yml index dcba2d5d..e100bf4a 100644 --- a/.github/workflows/nightly_coverage.yml +++ b/.github/workflows/nightly_coverage.yml @@ -13,78 +13,72 @@ permissions: jobs: coverage: name: Nightly Coverage Report - runs-on: linux_x64 + runs-on: ubuntu-24.04 strategy: matrix: python-version: ['3.10'] fail-fast: false - container: - image: zvec-registry.cn-hongkong.cr.aliyuncs.com/zvec/zvec:0.0.2 - options: --user root - steps: - - name: Activate Conda environment + - name: Checkout code + uses: actions/checkout@v6 + with: + ref: main # Always use main for nightly + submodules: recursive + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + cache-dependency-path: 'pyproject.toml' + + - name: Set up environment variables run: | - if [[ "${{ matrix.python-version }}" == "3.10" ]]; then - ENV_NAME="py310" - elif [[ "${{ matrix.python-version }}" == "3.11" ]]; then - ENV_NAME="py311" - elif [[ "${{ matrix.python-version }}" == "3.12" ]]; then - ENV_NAME="py312" - else - echo "Unsupported Python version" - exit 1 - fi - echo "CONDA_ENV_NAME=$ENV_NAME" >> $GITHUB_ENV - source /opt/miniforge3/bin/activate "$ENV_NAME" - python --version + # Set number of processors for parallel builds + NPROC=$(nproc 2>/dev/null || echo 2) + echo "NPROC=$NPROC" >> $GITHUB_ENV + echo "Using $NPROC parallel jobs for builds" + + # Add Python user base bin to PATH for pip-installed CLI tools + echo "$(python -c 'import site; print(site.USER_BASE)')/bin" >> $GITHUB_PATH shell: bash - - name: Prepare clean build directory + - name: Install dependencies run: | - export CLEAN_WORKSPACE="/tmp/zvec" - mkdir -p "$CLEAN_WORKSPACE" 
- cd "$CLEAN_WORKSPACE" - - git config --global --add safe.directory "$CLEAN_WORKSPACE" - git clone --recursive "https://x-access-token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git" . - - git checkout main # Always use main for nightly - - echo "CLEAN_WORKSPACE=$CLEAN_WORKSPACE" >> $GITHUB_ENV - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + python -m pip install --upgrade pip \ + cmake==3.30.0 \ + ninja==1.11.1 \ + pytest \ + pytest-cov \ + scikit-build-core \ + setuptools_scm shell: bash - name: Build with COVERAGE config run: | - source /opt/miniforge3/bin/activate "${{ env.CONDA_ENV_NAME }}" - cd "$CLEAN_WORKSPACE" - pip install --upgrade pip pytest pytest-cov - - NPROC=$(nproc 2>/dev/null || echo 2) - echo "Using $NPROC parallel jobs" + cd "$GITHUB_WORKSPACE" CMAKE_GENERATOR="Unix Makefiles" \ CMAKE_BUILD_PARALLEL_LEVEL="$NPROC" \ - pip install -v . \ - --config-settings="cmake.build-type=COVERAGE" + python -m pip install -v . \ + --no-build-isolation \ + --config-settings="cmake.build-type=COVERAGE" \ + --config-settings='cmake.define.ENABLE_ZEN3="ON"' shell: bash - name: Run Python Tests with Coverage run: | - source /opt/miniforge3/bin/activate "${{ env.CONDA_ENV_NAME }}" - cd "$CLEAN_WORKSPACE" + cd "$GITHUB_WORKSPACE" python -m pytest python/tests/ --cov=zvec --cov-report=xml shell: bash - name: Run C++ Tests and Generate Coverage run: | - cd "$CLEAN_WORKSPACE/build" - make unittest -j$(nproc) # Run all (nightly can afford it) - cd "$CLEAN_WORKSPACE" + cd "$GITHUB_WORKSPACE/build" + make unittest -j$NPROC + cd "$GITHUB_WORKSPACE" # Ensure gcov.sh is executable chmod +x scripts/gcov.sh bash scripts/gcov.sh -k @@ -93,7 +87,7 @@ jobs: - name: Upload Coverage to Codecov uses: codecov/codecov-action@v5 with: - files: ${{ env.CLEAN_WORKSPACE }}/proxima-zvec-filtered.lcov.info,${{ env.CLEAN_WORKSPACE }}/coverage.xml + files: ./proxima-zvec-filtered.lcov.info,./coverage.xml flags: python,cpp,nightly name: nightly-linux-py${{ matrix.python-version }} 
token: ${{ secrets.CODECOV_TOKEN }} diff --git a/README.md b/README.md index bfdd30e5..226d4f15 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,8 @@

- Linux x64 CI - Linux ARM64 CI - macOS ARM64 CI -
Code Coverage + Main PyPI Release Python Versions License @@ -25,8 +22,7 @@ 🏠 Home | 📚 Docs | 📊 Benchmarks | - 🎮 Discord | - 🐦 X (Twitter) + 🎮 Discord

**Zvec** is an open-source, in-process vector database — lightweight, lightning-fast, and designed to embed directly into applications. Built on **Proxima** (Alibaba's battle-tested vector search engine), it delivers production-grade, low-latency, scalable similarity search with minimal setup. From 1fef6e601a93508fa938d0b0546a6cd10bd33d52 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 26 Feb 2026 14:40:15 +0800 Subject: [PATCH 28/28] feat: add jina embeddings v5 support (#156) * add Jina Embeddings v5 support * fix ruff format for jina_embedding_function.py --- python/zvec/extension/__init__.py | 4 + .../zvec/extension/jina_embedding_function.py | 240 ++++++++++++++++++ python/zvec/extension/jina_function.py | 182 +++++++++++++ 3 files changed, 426 insertions(+) create mode 100644 python/zvec/extension/jina_embedding_function.py create mode 100644 python/zvec/extension/jina_function.py diff --git a/python/zvec/extension/__init__.py b/python/zvec/extension/__init__.py index 597f91be..cc9401f8 100644 --- a/python/zvec/extension/__init__.py +++ b/python/zvec/extension/__init__.py @@ -15,6 +15,8 @@ from .bm25_embedding_function import BM25EmbeddingFunction from .embedding_function import DenseEmbeddingFunction, SparseEmbeddingFunction +from .jina_embedding_function import JinaDenseEmbedding +from .jina_function import JinaFunctionBase from .multi_vector_reranker import RrfReRanker, WeightedReRanker from .openai_embedding_function import OpenAIDenseEmbedding from .openai_function import OpenAIFunctionBase @@ -35,6 +37,8 @@ "DefaultLocalReRanker", "DefaultLocalSparseEmbedding", "DenseEmbeddingFunction", + "JinaDenseEmbedding", + "JinaFunctionBase", "OpenAIDenseEmbedding", "OpenAIFunctionBase", "QwenDenseEmbedding", diff --git a/python/zvec/extension/jina_embedding_function.py b/python/zvec/extension/jina_embedding_function.py new file mode 100644 index 00000000..2f8b02aa --- /dev/null +++ b/python/zvec/extension/jina_embedding_function.py @@ -0,0 +1,240 @@ +# 
# Copyright 2025-present the zvec project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

from functools import lru_cache
from typing import Optional

from ..common.constants import TEXT, DenseVectorType
from .embedding_function import DenseEmbeddingFunction
from .jina_function import JinaFunctionBase


class JinaDenseEmbedding(JinaFunctionBase, DenseEmbeddingFunction[TEXT]):
    """Dense text embedding function using the Jina AI API.

    Converts text into dense vectors via Jina's OpenAI-compatible
    Embeddings API. Jina Embeddings v5 models support task-specific
    optimization (``task`` parameter) and Matryoshka Representation
    Learning, which allows truncated output dimensions.

    Args:
        model (str, optional): Jina embedding model identifier.
            Defaults to ``"jina-embeddings-v5-text-nano"``. Available models:
            - ``"jina-embeddings-v5-text-nano"``: 768 dims, 8K context
            - ``"jina-embeddings-v5-text-small"``: 1024 dims, 32K context
        dimension (Optional[int], optional): Desired output embedding
            dimension. If ``None``, the model's default dimension is used.
            Matryoshka dimensions such as 32, 64, 128, 256, 512 and the
            model default are supported. Defaults to ``None``.
        api_key (Optional[str], optional): Jina API key. If ``None``, read
            from the ``JINA_API_KEY`` environment variable. Keys are issued
            at https://jina.ai/api-dashboard
        task (Optional[str], optional): Task type to optimize embeddings
            for. Valid values: ``"retrieval.query"``, ``"retrieval.passage"``,
            ``"text-matching"``, ``"classification"``, ``"separation"``.
            Defaults to ``None``.
        **kwargs: Extra parameters kept for model-specific customization.

    Attributes:
        dimension (int): The embedding vector dimension.
        model (str): The Jina model name in use.
        task (Optional[str]): The task type for embedding optimization.

    Raises:
        ValueError: If no API key is available, the task is invalid, or the
            API returns an error/malformed response.
        TypeError: If the input to ``embed()`` is not a string.
        RuntimeError: On network or Jina service errors.

    Note:
        - Requires the ``openai`` package (Jina's API is OpenAI-compatible).
        - Embedding results are cached per instance (LRU, maxsize=10).
        - For retrieval, use ``"retrieval.query"`` for queries and
          ``"retrieval.passage"`` for documents.

    Examples:
        >>> emb_func = JinaDenseEmbedding(api_key="jina_...")
        >>> vector = emb_func.embed("Hello, world!")
        >>> len(vector)
        768

        >>> emb_func = JinaDenseEmbedding(
        ...     model="jina-embeddings-v5-text-small",
        ...     dimension=256,
        ...     api_key="jina_...",
        ...     task="text-matching",
        ... )
        >>> len(emb_func.embed("Semantic similarity comparison"))
        256

    See Also:
        - ``DenseEmbeddingFunction``: Base class for dense embeddings.
        - ``OpenAIDenseEmbedding`` / ``QwenDenseEmbedding``: API alternatives.
        - ``DefaultLocalDenseEmbedding``: Local model without API calls.
    """

    def __init__(
        self,
        model: str = "jina-embeddings-v5-text-nano",
        dimension: Optional[int] = None,
        api_key: Optional[str] = None,
        task: Optional[str] = None,
        **kwargs,
    ):
        """Initialize the Jina dense embedding function.

        Args:
            model (str): Jina model name. Defaults to
                "jina-embeddings-v5-text-nano".
            dimension (Optional[int]): Target embedding dimension, or None
                for the model default.
            api_key (Optional[str]): API key, or None to use the
                JINA_API_KEY environment variable.
            task (Optional[str]): Task type for embedding optimization.
            **kwargs: Additional parameters for API calls.

        Raises:
            ValueError: If no API key is available or the task is invalid.
        """
        # Base class validates the key and task and stores model/task.
        JinaFunctionBase.__init__(self, model=model, api_key=api_key, task=task)

        # None means "use the model default" and is NOT forwarded to the
        # API; an explicit value selects a Matryoshka dimension.
        self._custom_dimension = dimension
        self._dimension = (
            self._MODEL_DIMENSIONS.get(model, 768) if dimension is None else dimension
        )

        self._extra_params = kwargs

        # Per-instance LRU cache over the API call. Decorating the method
        # itself with @lru_cache would key a process-global cache on
        # `self` and keep every instance alive for the lifetime of the
        # process (flake8-bugbear B019); wrapping a bound helper here
        # keeps the documented maxsize=10 caching without that leak.
        self._cached_embed = lru_cache(maxsize=10)(self._embed_uncached)

    @property
    def dimension(self) -> int:
        """int: The expected dimensionality of the embedding vector."""
        return self._dimension

    @property
    def extra_params(self) -> dict:
        """dict: Extra parameters for model-specific customization."""
        return self._extra_params

    def __call__(self, input: TEXT) -> DenseVectorType:
        """Make the embedding function callable."""
        return self.embed(input)

    def embed(self, input: TEXT) -> DenseVectorType:
        """Generate a dense embedding vector for the input text.

        Validates the input, then serves the result from a per-instance
        LRU cache (maxsize=10) keyed on the whitespace-stripped text,
        calling the Jina Embeddings API on a cache miss.

        Args:
            input (TEXT): Input text to embed. Must be non-empty after
                stripping whitespace. Maximum length depends on the model
                (8192 tokens for v5-nano, 32768 for v5-small).

        Returns:
            DenseVectorType: A list of floats of length ``self.dimension``.

        Raises:
            TypeError: If ``input`` is not a string.
            ValueError: If the input is empty/whitespace-only, or the API
                returns an error or a vector of unexpected dimension.
            RuntimeError: On network or Jina service errors.

        Note:
            - The cache is exact-string (case-sensitive) after stripping.
            - The task type affects optimization, not caching behavior.
        """
        if not isinstance(input, TEXT):
            raise TypeError(f"Expected 'input' to be str, got {type(input).__name__}")

        input = input.strip()
        if not input:
            raise ValueError("Input text cannot be empty or whitespace only")

        return self._cached_embed(input)

    def _embed_uncached(self, input: TEXT) -> DenseVectorType:
        """Uncached API call plus output-dimension verification."""
        embedding_vector = self._call_text_embedding_api(
            input=input,
            dimension=self._custom_dimension,
        )

        # Guard against the API silently returning a different dimension
        # than the one this instance promises via `self.dimension`.
        if len(embedding_vector) != self.dimension:
            raise ValueError(
                f"Dimension mismatch: expected {self.dimension}, "
                f"got {len(embedding_vector)}"
            )

        return embedding_vector
# Copyright 2025-present the zvec project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

import os
from typing import ClassVar, Optional

from ..common.constants import TEXT
from ..tool import require_module


class JinaFunctionBase:
    """Base class for Jina AI functions.

    Provides the shared plumbing — credential handling, task validation,
    client construction, and response parsing — for calling Jina AI's
    OpenAI-compatible Embeddings API.

    This class is not meant to be used directly. Use a concrete
    implementation such as ``JinaDenseEmbedding``.

    Args:
        model (str): Jina embedding model identifier.
        api_key (Optional[str]): Jina API authentication key.
        task (Optional[str]): Task type for the embedding model.

    Note:
        - Internal base class for code reuse across Jina features.
        - Jina's API is OpenAI-compatible, so the ``openai`` Python client
          is used for transport.
    """

    # Jina's OpenAI-compatible endpoint.
    _BASE_URL: ClassVar[str] = "https://api.jina.ai/v1"

    # Default output dimension per supported model.
    _MODEL_DIMENSIONS: ClassVar[dict[str, int]] = {
        "jina-embeddings-v5-text-nano": 768,
        "jina-embeddings-v5-text-small": 1024,
    }

    # Maximum input length (tokens) per supported model.
    _MODEL_MAX_TOKENS: ClassVar[dict[str, int]] = {
        "jina-embeddings-v5-text-nano": 8192,
        "jina-embeddings-v5-text-small": 32768,
    }

    # Task types accepted by the v5 embedding models.
    _VALID_TASKS: ClassVar[tuple[str, ...]] = (
        "retrieval.query",
        "retrieval.passage",
        "text-matching",
        "classification",
        "separation",
    )

    def __init__(
        self,
        model: str,
        api_key: Optional[str] = None,
        task: Optional[str] = None,
    ):
        """Initialize the base Jina functionality.

        Args:
            model (str): Jina model name.
            api_key (Optional[str]): API key, or None to read the
                JINA_API_KEY environment variable.
            task (Optional[str]): Task type for the embedding model.
                Valid values: "retrieval.query", "retrieval.passage",
                "text-matching", "classification", "separation".

        Raises:
            ValueError: If no API key is available, or if ``task`` is not
                one of the valid task types.
        """
        self._model = model
        # Explicit argument wins; fall back to the environment.
        self._api_key = api_key or os.environ.get("JINA_API_KEY")
        self._task = task

        if not self._api_key:
            raise ValueError(
                "Jina API key is required. Please provide 'api_key' parameter "
                "or set the 'JINA_API_KEY' environment variable. "
                "Get your key from: https://jina.ai/api-dashboard"
            )

        if task is not None and task not in self._VALID_TASKS:
            raise ValueError(
                f"Invalid task '{task}'. Valid tasks: {', '.join(self._VALID_TASKS)}"
            )

    @property
    def model(self) -> str:
        """str: The Jina model name currently in use."""
        return self._model

    @property
    def task(self) -> Optional[str]:
        """Optional[str]: The task type for the embedding model."""
        return self._task

    def _get_client(self):
        """Get an OpenAI-compatible client configured for the Jina API.

        Returns:
            OpenAI: Configured OpenAI client pointing at the Jina base URL.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        openai = require_module("openai")
        return openai.OpenAI(api_key=self._api_key, base_url=self._BASE_URL)

    def _call_text_embedding_api(
        self,
        input: TEXT,
        dimension: Optional[int] = None,
    ) -> list:
        """Call the Jina Embeddings API and return the embedding vector.

        Args:
            input (TEXT): Input text to embed.
            dimension (Optional[int]): Target dimension for Matryoshka
                embeddings, or None for the model default.

        Returns:
            list: Embedding vector as a list of floats.

        Raises:
            ImportError: If the ``openai`` package is not installed.
            RuntimeError: If the API call fails.
            ValueError: If the API returns an error or malformed response.
        """
        # Resolve the openai module and build the client BEFORE the try
        # block: a missing dependency then surfaces as a plain ImportError
        # instead of being re-raised from inside the exception handler
        # (the old code called require_module() again in the handler,
        # which re-raised ImportError mid-handling and lost the wrapping).
        openai = require_module("openai")
        client = self._get_client()

        # Assemble request parameters for the OpenAI-compatible endpoint.
        params = {"model": self.model, "input": input}

        # Matryoshka support: only send "dimensions" when explicitly set.
        if dimension is not None:
            params["dimensions"] = dimension

        # "task" is a Jina extension, so it travels via extra_body.
        if self._task is not None:
            params["extra_body"] = {"task": self._task}

        try:
            response = client.embeddings.create(**params)
        except (openai.APIError, openai.APIConnectionError) as e:
            raise RuntimeError(f"Failed to call Jina API: {e!s}") from e
        except Exception as e:
            raise RuntimeError(f"Unexpected error during API call: {e!s}") from e

        # Extract the embedding, translating any structural surprises in
        # the response object into a ValueError for the caller.
        try:
            if not response.data:
                raise ValueError("Invalid API response: no embedding data returned")

            embedding_vector = response.data[0].embedding

            if not isinstance(embedding_vector, list):
                raise ValueError(
                    "Invalid API response: embedding is not a list of numbers"
                )

            return embedding_vector

        except (AttributeError, IndexError, TypeError) as e:
            raise ValueError(f"Failed to parse API response: {e!s}") from e